In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import numpy as np
from typing import Dict, List, Tuple, Any, Optional
from pathlib import Path
import re
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.set_option('future.no_silent_downcasting', True)

from modelling import *

## Load data

In [None]:
# Read one processed scalar CSV per survey wave, keyed by the survey year.
wave_files = {
    2020: '../data/processed/scalar/wave_1.csv',
    2023: '../data/processed/scalar/wave_5.csv',
}
data = {year: pd.read_csv(csv_path) for year, csv_path in wave_files.items()}

In [None]:
# Toggle for the "Don't know" answer of perceived_flood_frequency.
# When True, 'perceived_flood_frequency_dont_know' is added to the nominal
# columns next to 'experienced_flood'; when False only 'experienced_flood'
# is treated as nominal. The flag is also stored in `config`.
# NOTE(review): how the modelling helpers consume this flag is not visible
# in this notebook — confirm in the `modelling` module.
include_dont_know = False

## Define independent and dependent variables

In [None]:
# Dependent variables (y): one column per structural adaptation measure.
y_vars = [
    'raise_ground_floor_level',
    'strengthen_foundations',
    'reinforce_walls_floor',
    'raise_electricity_meter',
    'install_anti_backflow_valves',
    'install_pump_drainage',
    'fix_water_barriers'
]

# 1-based number of each measure. The same number is embedded in the
# per-measure column names below (e.g. 'se1_raise_ground_floor_level').
y_vars_ordered = {measure: number for number, measure in enumerate(y_vars, start=1)}

# Independent variables (X), grouped by construct. The per-measure column
# names are built from the number stored in `y_vars_ordered`, so the naming
# cannot drift from the mapping that is passed on in `config`.
X_vars = {
    'threat_appraisal': [
        'perceived_flood_frequency',
        'flood_worry_level',
        'experienced_flood',
    ],
    'coping_appraisal': {
        'self_efficacy': [f'se{n}_{m}' for m, n in y_vars_ordered.items()],
        'response_efficacy': [f're{n}_{m}' for m, n in y_vars_ordered.items()],
        'perceived_costs': [f'pc{n}_{m}' for m, n in y_vars_ordered.items()],
    },
    'adaptive_behavior': [f'adapt{n}_{m}_agg' for m, n in y_vars_ordered.items()],
    'extra_vars': ['responsibility_perception'],
}


## Define mappings

In [None]:
def _ranked(labels):
    """Return ``{label: rank}`` with ranks counted from 1 in list order."""
    return {label: rank for rank, label in enumerate(labels, start=1)}


# Five-point agreement scale shared by both "rely on ..." items.
_agreement_labels = [
    'Strongly agree',
    'Somewhat agree',
    'Neither agree nor disagree',
    'Somewhat disagree',
    'Strongly disagree',
]

# Answer text -> ordinal code for every ordinal survey item.
ordinal_mappings = {
    'perceived_flood_frequency': {
        **_ranked([
            'My house is completely safe',
            'Less often than 1 in 500 years',
            'Once in 500 years',
            'Once in 200 years',
            'Once in 100 years',
            'Once in 50 years',
            'Once in 10 years',
            'Annually',
            'More frequent than once per year',
        ]),
        # "Don't know" has no position on the scale, so it maps to missing.
        "Don't know": np.nan,
    },
    'flood_worry_level': _ranked([
        'Not at all worried',
        'A little worried',
        'Somewhat worried',
        'Quite worried',
        'Very worried',
    ]),
    # Separate dict objects per item, so editing one cannot affect the other.
    'rely_on_fam_friends': _ranked(_agreement_labels),
    'rely_on_gov': _ranked(_agreement_labels),
    'responsibility_perception': _ranked([
        'Completely government',
        'Mostly government',
        'Equal responsibility',
        'Mostly individual',
        'Completely individual',
    ]),
}

# Nominal columns; the "don't know" indicator is only listed when requested.
nominal_cols = ['experienced_flood']
if include_dont_know:
    nominal_cols.append('perceived_flood_frequency_dont_know')

# Answer text -> integer code for the structural-measure responses.
y_mappings = {
    answer: code
    for code, answer in enumerate(
        ['Will not implement', 'Will implement', 'Already implemented']
    )
}

## Compile variables and mappings into config

In [None]:
# Bundle the variable lists, answer mappings and the VIF cut-off into the one
# settings dict that is handed to the modelling helpers further down.
config = dict(
    variables=X_vars,
    structural_measures=y_vars_ordered,
    ordinal_mappings=ordinal_mappings,
    include_dont_know=include_dont_know,
    nominal_cols=nominal_cols,
    y_mappings=y_mappings,
    vif_threshold=5.0,
)

## Correlation analysis

### Wave-2020

In [None]:
# Spearman correlation heatmap of the predictors, one figure per structural
# measure, for the 2020 wave.
wave_year = 2020
X_base = prepare_features(data[wave_year], config, verbose=False)

# Figure titles, keyed by structural-measure column name.
sm_names = dict(
    raise_ground_floor_level='Raise ground floor level',
    strengthen_foundations='Strengthen foundations',
    reinforce_walls_floor='Reinforce walls and floor',
    raise_electricity_meter='Raise electricity meter',
    install_anti_backflow_valves='Install anti-backflow valves',
    install_pump_drainage='Install pump drainage',
    fix_water_barriers='Fix water barriers',
)

for sm_measure, sm_number in y_vars_ordered.items():
    X_sm = add_structural_measure_features(
        X_base, data[wave_year], sm_measure, sm_number, include_all=True
    )

    # Display labels: four wave-level predictors plus the three coping
    # appraisal columns that belong to the current measure.
    predictor_names = {
        'perceived_flood_frequency': 'Perceived flood frequency',
        'flood_worry_level': 'Flood worry level',
        'experienced_flood_Yes': 'Experienced flood',
        'responsibility_perception': 'Responsibility perception',
        **{
            f'{prefix}{sm_number}_{sm_measure}': label
            for prefix, label in (
                ('se', 'Self-efficacy'),
                ('re', 'Response-efficacy'),
                ('pc', 'Perceived costs'),
            )
        },
    }

    corr_matrix = X_sm.corr(method='spearman').rename(
        columns=predictor_names, index=predictor_names
    )
    # Mask the upper triangle (diagonal included) so each pair appears once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        corr_matrix,
        mask=mask,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=0.5,
        annot=True,
        ax=ax,
    )
    ax.set_title(sm_names[sm_measure])
    fig.tight_layout()
    plt.show()

### Wave-2023

In [None]:
# Spearman correlation heatmap of the predictors, one figure per structural
# measure, for the 2023 wave.
wave_year = 2023
X_base = prepare_features(data[wave_year], config, verbose=False)

# Figure titles, keyed by structural-measure column name.
sm_names = dict(
    raise_ground_floor_level='Raise ground floor level',
    strengthen_foundations='Strengthen foundations',
    reinforce_walls_floor='Reinforce walls and floor',
    raise_electricity_meter='Raise electricity meter',
    install_anti_backflow_valves='Install anti-backflow valves',
    install_pump_drainage='Install pump drainage',
    fix_water_barriers='Fix water barriers',
)

for sm_measure, sm_number in y_vars_ordered.items():
    X_sm = add_structural_measure_features(
        X_base, data[wave_year], sm_measure, sm_number, include_all=True
    )

    # Display labels: four wave-level predictors plus the three coping
    # appraisal columns that belong to the current measure.
    predictor_names = {
        'perceived_flood_frequency': 'Perceived flood frequency',
        'flood_worry_level': 'Flood worry level',
        'experienced_flood_Yes': 'Experienced flood',
        'responsibility_perception': 'Responsibility perception',
        **{
            f'{prefix}{sm_number}_{sm_measure}': label
            for prefix, label in (
                ('se', 'Self-efficacy'),
                ('re', 'Response-efficacy'),
                ('pc', 'Perceived costs'),
            )
        },
    }

    corr_matrix = X_sm.corr(method='spearman').rename(
        columns=predictor_names, index=predictor_names
    )
    # Mask the upper triangle (diagonal included) so each pair appears once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        corr_matrix,
        mask=mask,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=0.5,
        annot=True,
        ax=ax,
    )
    ax.set_title(sm_names[sm_measure])
    fig.tight_layout()
    plt.show()

## Variance Inflation Factor

### Wave-2020

In [None]:
# Variance inflation factors of the predictors, one bar chart per structural
# measure, for the 2020 wave. The dashed red line marks config['vif_threshold'].
wave_year = 2020
X_base = prepare_features(data[wave_year], config, verbose=False)

# Figure titles, keyed by structural-measure column name.
sm_names = {'raise_ground_floor_level': 'Raise ground floor level',
            'strengthen_foundations': 'Strengthen foundations',
            'reinforce_walls_floor': 'Reinforce walls and floor',
            'raise_electricity_meter': 'Raise electricity meter',
            'install_anti_backflow_valves': 'Install anti-backflow valves',
            'install_pump_drainage': 'Install pump drainage',
            'fix_water_barriers': 'Fix water barriers'}

for sm_measure, sm_number in y_vars_ordered.items():
    X_sm = add_structural_measure_features(
        X_base, data[wave_year], sm_measure, sm_number, include_all=True)

    # Display labels: wave-level predictors plus the coping-appraisal columns
    # that belong to the current measure.
    predictor_names = {'perceived_flood_frequency': 'Perceived flood frequency',
                       'flood_worry_level': 'Flood worry level',
                       'experienced_flood_Yes': 'Experienced flood',
                       'responsibility_perception': 'Responsibility perception',
                       f'se{sm_number}_{sm_measure}': 'Self-efficacy',
                       f're{sm_number}_{sm_measure}': 'Response-efficacy',
                       f'pc{sm_number}_{sm_measure}': 'Perceived costs'}

    # calculate_vif is assumed to return a frame with 'feature' and 'VIF'
    # columns (both are relied on below) — TODO confirm in `modelling`.
    # The chain builds a new frame instead of mutating the helper's result.
    vif_data = (
        calculate_vif(X_sm, config['vif_threshold'])
        .rename(columns={'feature': 'Variable'})
        .assign(Variable=lambda frame: frame['Variable'].replace(predictor_names))
        .sort_values(by='VIF', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=vif_data, x='VIF', y='Variable', ax=ax)
    ax.set(title=sm_names[sm_measure], xlabel='VIF', ylabel='Variables')

    # Add dashed red line at VIF threshold
    ax.axvline(config['vif_threshold'], color='red', linestyle='--')

    sns.despine(ax=ax)
    # Lay out and render each figure explicitly rather than relying on the
    # notebook backend to flush open figures when the cell finishes.
    fig.tight_layout()
    plt.show()

### Wave-2023

In [None]:
# Variance inflation factors of the predictors, one bar chart per structural
# measure, for the 2023 wave. The dashed red line marks config['vif_threshold'].
wave_year = 2023
X_base = prepare_features(data[wave_year], config, verbose=False)

# Figure titles, keyed by structural-measure column name.
sm_names = {'raise_ground_floor_level': 'Raise ground floor level',
            'strengthen_foundations': 'Strengthen foundations',
            'reinforce_walls_floor': 'Reinforce walls and floor',
            'raise_electricity_meter': 'Raise electricity meter',
            'install_anti_backflow_valves': 'Install anti-backflow valves',
            'install_pump_drainage': 'Install pump drainage',
            'fix_water_barriers': 'Fix water barriers'}

for sm_measure, sm_number in y_vars_ordered.items():
    X_sm = add_structural_measure_features(
        X_base, data[wave_year], sm_measure, sm_number, include_all=True)

    # Display labels: wave-level predictors plus the coping-appraisal columns
    # that belong to the current measure.
    predictor_names = {'perceived_flood_frequency': 'Perceived flood frequency',
                       'flood_worry_level': 'Flood worry level',
                       'experienced_flood_Yes': 'Experienced flood',
                       'responsibility_perception': 'Responsibility perception',
                       f'se{sm_number}_{sm_measure}': 'Self-efficacy',
                       f're{sm_number}_{sm_measure}': 'Response-efficacy',
                       f'pc{sm_number}_{sm_measure}': 'Perceived costs'}

    # calculate_vif is assumed to return a frame with 'feature' and 'VIF'
    # columns (both are relied on below) — TODO confirm in `modelling`.
    # The chain builds a new frame instead of mutating the helper's result.
    vif_data = (
        calculate_vif(X_sm, config['vif_threshold'])
        .rename(columns={'feature': 'Variable'})
        .assign(Variable=lambda frame: frame['Variable'].replace(predictor_names))
        .sort_values(by='VIF', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=vif_data, x='VIF', y='Variable', ax=ax)
    ax.set(title=sm_names[sm_measure], xlabel='VIF', ylabel='Variables')

    # Add dashed red line at VIF threshold
    ax.axvline(config['vif_threshold'], color='red', linestyle='--')

    sns.despine(ax=ax)
    # Lay out and render each figure explicitly rather than relying on the
    # notebook backend to flush open figures when the cell finishes.
    fig.tight_layout()
    plt.show()