In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split
import utils

1. create dfs for each state with all relevant metrics
2. create index for each df
3. compare indexs

# Finding relationships between hpi and unemployment insurance

In [2]:
#percap unemployment insurance
ic = pd.read_csv('./data/initial_claims_by_state.csv', parse_dates = ['index'], index_col = 'index')
# cc = pd.read_csv('./data/percap_continued_claims.csv', parse_dates = ['reflecting_week_ended'], index_col = 'reflecting_week_ended')
# iur = pd.read_csv('./data/insured_unemployment_rate.csv', parse_dates = ['reflecting_week_ended'], index_col = 'reflecting_week_ended')

#hpi index
hpi = pd.read_csv('./data/hpi_state.csv', parse_dates = ['time'], index_col = 'time')

# #percap income
# income = pd.read_csv('./data/personal_income_by_state.csv')

#nominal gdp
n_gdp = pd.read_csv('./data/nominal_gdp_by_state.csv', parse_dates = ['index'], index_col = 'index')


In [3]:
states = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'district_of_columbia', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 
              'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new_hampshire', 'new_jersey', 'new_mexico', 'new_york', 'north_carolina', 'north_dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode_island', 'south_carolina', 'south_dakota', 'tennessee', 'texas', 'utah', 
              'vermont', 'virginia', 'washington', 'west_virginia', 'wisconsin', 'wyoming']

# Steps to make indices dataframe:
1. ensure df only has states (only_states())
2. apply suffixes of relevant metric (add_suffix())
3. align intervals (intervals()) of all metric dfs
4. concatenate metric dfs (align_dfs())
5. apply indices (state_indices)

### 1.

In [4]:
def only_states(df:pd.DataFrame):
    
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    states = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'district_of_columbia', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 
              'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new_hampshire', 'new_jersey', 'new_mexico', 'new_york', 'north_carolina', 'north_dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode_island', 'south_carolina', 'south_dakota', 'tennessee', 'texas', 'utah', 
              'vermont', 'virginia', 'washington', 'west_virginia', 'wisconsin', 'wyoming']
    for x in df.columns:
        if x not in states:
            df = df.drop(columns = [x])
            
    return df

In [5]:
hpi = only_states(hpi)
ic = only_states(ic)
n_gdp = only_states(n_gdp)

### 2.

In [6]:
#applying metric suffix to state columns 
#this will avoid overlap with metric columns
def add_suffix(df, metric_name = str):
    df = df.add_suffix(f'_{metric_name}')
    return df

In [7]:
hpi = add_suffix(hpi, 'hpi')
ic = add_suffix(ic, 'ic')
n_gdp = add_suffix(n_gdp, 'gdp')

### 3.

In [8]:
#choosing the index interval
def set_intervals(df, interval = str):
    df = df.resample(interval).ffill().interpolate('linear').dropna()
    return df

In [9]:
#iterate through metric dfs to apply the same intervals


hpi = set_intervals(hpi, 'M')
ic = set_intervals(ic, 'M')
n_gdp = set_intervals(n_gdp, 'M')


### 4.

In [10]:
#concatenating the dataframes by axis = 1
def align_dfs(dfs):
    new_df = []
    
    for df in dfs:
        new_df.append(df)
        pd.concat(new_df,axis = 1).dropna(axis = 0)
    
    return pd.concat(new_df, axis = 1).dropna(axis = 0)

In [11]:
#nulls are expected, don't worry
metrics_df = align_dfs([hpi,ic, n_gdp])

In [12]:
metrics_df.head()

Unnamed: 0,alabama_hpi,alaska_hpi,arizona_hpi,arkansas_hpi,california_hpi,colorado_hpi,connecticut_hpi,delaware_hpi,district_of_columbia_hpi,florida_hpi,...,south_dakota_gdp,tennessee_gdp,texas_gdp,utah_gdp,vermont_gdp,virginia_gdp,washington_gdp,west_virginia_gdp,wisconsin_gdp,wyoming_gdp
2010-01-31,101.43,89.961,84.858,95.728,78.345,80.626,105.04,103.837,75.459,86.038,...,12231.033333,85084.633333,407754.533333,38794.5,8918.266667,139568.9,118757.566667,21523.433333,83644.333333,12479.833333
2010-02-28,101.43,89.961,84.858,95.728,78.345,80.626,105.04,103.837,75.459,86.038,...,12231.033333,85084.633333,407754.533333,38794.5,8918.266667,139568.9,118757.566667,21523.433333,83644.333333,12479.833333
2010-03-31,99.937,90.753,81.897,96.025,78.116,79.687,103.88,102.746,75.779,83.65,...,12231.033333,85084.633333,407754.533333,38794.5,8918.266667,139568.9,118757.566667,21523.433333,83644.333333,12479.833333
2010-04-30,99.937,90.753,81.897,96.025,78.116,79.687,103.88,102.746,75.779,83.65,...,12375.966667,85594.3,410672.8,38939.9,9108.6,142154.2,121620.733333,21761.166667,84872.3,12416.0
2010-05-31,99.937,90.753,81.897,96.025,78.116,79.687,103.88,102.746,75.779,83.65,...,12375.966667,85594.3,410672.8,38939.9,9108.6,142154.2,121620.733333,21761.166667,84872.3,12416.0


### 5.

In [13]:
def state_indices(df):
    df_dict = {}
    df_list = []

    states = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'district_of_columbia', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 
              'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new_hampshire', 'new_jersey', 'new_mexico', 'new_york', 'north_carolina', 'north_dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode_island', 'south_carolina', 'south_dakota', 'tennessee', 'texas', 'utah', 
              'vermont', 'virginia', 'washington', 'west_virginia', 'wisconsin', 'wyoming']
    
    for state in states:
        df_list.append(df.filter(regex=f'{state}', axis = 1))
    
    for i in range(len(states)):
        df_dict[states[i]] = df_list[i]

    return df_dict

In [14]:
df_dict = state_indices(metrics_df)

In [15]:
for state, df in df_dict.items():
    df_dict[state] = utils.create_indices(df)

In [16]:
# Removing west virginia and arkansas from virginia and kansas dataframes
df_dict['virginia'] = df_dict['virginia'][df_dict['virginia'].columns.drop(list(df_dict['virginia'].filter(regex='west_virginia')))]
df_dict['kansas'] = df_dict['kansas'][df_dict['kansas'].columns.drop(list(df_dict['kansas'].filter(regex='arkansas')))]

# VAR Model Workflow

In [17]:
# Code written by Joseph Nelson
def interpret_dftest(dftest):
    dfoutput = pd.Series(dftest[0:2], index=['Test Statistic', 'p-value'])
    return dfoutput

# achieve stationarity for all columns
def stationarity(df):
    for col in df:
        while interpret_dftest(adfuller(df[col].dropna()))[1] > 0.01:
            df[col] = df[col].diff()
    
    return df

In [18]:
for state, df in df_dict.items():
    df_dict[state] = stationarity(df).fillna(0)

In [19]:
forecast_df = pd.DataFrame(columns=['state', 'forecast', 'baseline_mse', 'mse'])

for state, df in df_dict.items():

    train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=False)

    model = VAR(train)
    ts_model = model.fit(maxlags=1, ic='aic')  

    # If forecast is positive, then VAR model predicts index increase in the next year. 
    # If forecase is negative, then VAR model predicts index decrease in the next year.

    # MSE calculation - VAR Model
    forecast = ts_model.forecast(train.values, len(test))
    sse_index = 0
    for time in range(len(test)):
        sse_index += (forecast[time][0] - test.values[time][0]) ** 2
    mse_index = (sse_index / len(test))

    # MSE calculation - Baseline (12 month rolling average)
    baseline_index = df_dict[state]['made_index'].rolling(12).sum().fillna(0)
    baseline_sse = np.sum((baseline_index - df['made_index']) ** 2)
    baseline_mse = (baseline_sse / len(df['made_index']))

    # Add values to df
    forecast_df.loc[state] = [state, ts_model.forecast(test.values, 12)[11][3], round(baseline_mse, 2), round(mse_index, 2)] # rounding to prevent scientific notation, readability
    # Forecasting 12 months ahead as the dataframe's time intervals are monthly

forecast_df = forecast_df.reset_index().drop(columns=['index'])

In [20]:
forecast_df.loc[(forecast_df['forecast'] >= 0.05), 'forecast_categorical'] = 1
forecast_df.loc[(forecast_df['forecast'] <= -0.05), 'forecast_categorical'] = -1
forecast_df.loc[(forecast_df['forecast'] <= 0.05) & (forecast_df['forecast'] >= -0.05), 'forecast_categorical'] = 0

In [21]:
forecast_df

Unnamed: 0,state,forecast,baseline_mse,mse,forecast_categorical
0,alabama,0.894712,150.61,9.49,1.0
1,alaska,0.548023,732.89,3.95,1.0
2,arizona,0.003244,19.9,101.1,0.0
3,arkansas,0.88344,229.31,24.85,1.0
4,california,0.783495,103.23,34.62,1.0
5,colorado,-0.176219,113.67,54.56,-1.0
6,connecticut,0.682249,310.96,7.82,1.0
7,delaware,-0.059896,154.64,7.49,-1.0
8,district_of_columbia,-0.508281,92.08,4.01,-1.0
9,florida,-0.272692,183.38,68.39,-1.0


In [22]:
print('Baseline MSE', forecast_df['baseline_mse'].mean())
print('Model MSE', forecast_df['mse'].mean())

Baseline MSE 44529.85196078433
Model MSE 26.319215686274514
