## State Modeling

### The goal of this workbook is to fit an xgboost model for each individual state, determine the most influential features at the state level, and prepare the data for visualization.

### Import modules

In [104]:
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Load the state-level dataframe that was pickled by
# Project_3_Data_Cleaning_Weights.ipynb.
with open('df_M_state.pkl', mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

### Scale the data

In [110]:
def scaler(X_list):
    """Standardize the train, validation and test feature frames.

    The scaler is fitted on the training frame only and the same fitted
    transform is then applied to all three frames, so the validation and
    test data are expressed in the training set's units and no statistics
    leak from them into the scaling.

    Parameters
    ----------
    X_list : list of pandas.DataFrame
        ``[X_train, X_validation, X_test]``. Every frame must contain a
        ``'PERWT'`` column (left unscaled); the validation and test frames
        must also contain an ``'index'`` column.

    Returns
    -------
    list of pandas.DataFrame
        ``[X_train_scaled, X_validation_scaled, X_test_scaled]``. The
        training frame keeps its ``'index'`` and ``'PERWT'`` columns; both
        are dropped from the validation and test frames.
    """
    X_tr, X_val, X_te = X_list[0], X_list[1], X_list[2]

    # Every column except the weight column is scaled.
    feature_cols = list(X_tr.drop(['PERWT'], axis=1).columns)

    # Fit on the training data only; re-fitting on each split would scale
    # validation/test rows with their own mean and variance.
    std = StandardScaler()
    std.fit(X_tr[feature_cols].values)

    X_scale = []
    for frame in (X_tr, X_val, X_te):
        frame_sc = frame.copy()
        # Column assignment replaces the columns outright, so integer
        # columns become float instead of being written into in place.
        frame_sc[feature_cols] = std.transform(frame[feature_cols].values)
        X_scale.append(frame_sc)

    X_tr_sc, X_val_sc, X_te_sc = X_scale

    X_val_sc = X_val_sc.drop(['index', 'PERWT'], axis=1)
    X_te_sc = X_te_sc.drop(['index', 'PERWT'], axis=1)

    return [X_tr_sc, X_val_sc, X_te_sc]

### Fit an xgboost model for each state within the dataframe

In [111]:
def xgb_model(df):
    """Fit one XGBClassifier per state and pickle each fitted model.

    For every distinct value in ``df['STATEFIP']`` the state's rows are
    split 60/20/20 into train/validation/test sets (stratified on
    ``POVERTY_BIN``), the features are passed through ``scaler``, and a
    classifier is fitted on the scaled training features using ``PERWT``
    as the sample weights. The fitted model is written to
    ``'<state>xgb.pkl'`` in the current working directory.

    The validation and test splits are produced but not used for fitting
    or evaluation in this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``STATEFIP``, ``POVERTY_BIN`` (the target), ``PERWT``,
        ``index`` and the identifier/feature columns dropped below.

    Returns
    -------
    None
    """
    # Columns that are identifiers, the target, or otherwise not used as
    # model features.
    drop_cols = ['UNIQUE_ID', 'MIGRATE1', 'POVERTY_BIN', 'BPL', 'EDUC',
                 'MARST', 'STATEFIP']

    for state in df['STATEFIP'].unique():
        print(state)

        # Create dataframe for specific state
        df_state = df[df['STATEFIP'] == state].copy()

        # Remove unnecessary columns, generate target
        X = df_state.drop(drop_cols, axis=1)
        y = df_state[['POVERTY_BIN']]

        # 80/20 split, then 75/25 of the remainder -> 60/20/20 overall
        X_init, X_te, y_init, y_te = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_init, y_init, test_size=0.25, random_state=42,
            stratify=y_init)

        # Training sample weights
        WT_tr = X_tr['PERWT'].copy()

        # Scale X data
        X_data = scaler([X_tr, X_val, X_te])

        # The scaled training frame still carries 'index' and 'PERWT';
        # neither is a model feature.
        X_fit = X_data[0].drop(['index', 'PERWT'], axis=1)

        # xgbClassifier, xgboost modelling
        xgb = XGBClassifier(learning_rate=0.03, random_state=42)
        xgb.fit(X_fit, y_tr.values.ravel(),
                sample_weight=WT_tr.values.ravel())

        # Output pickle file for this state; str() keeps the filename
        # valid even if the state identifier is not already a string.
        with open(str(state) + 'xgb.pkl', 'wb') as f:
            pickle.dump(xgb, f)

In [112]:
# Used to prevent issues with xgboost installation.
# NOTE(review): presumably this lets the process continue when more than one
# OpenMP runtime ends up loaded -- confirm. xgboost is already imported in an
# earlier cell, so verify the setting still takes effect at this point.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [113]:
# Run the modeling for all states. Output is 51 pickle files, one per state,
# each named '<STATE>xgb.pkl' and written to the current working directory.
xgb_model(df)

AL




AK




AZ




AR




CA




CO




CT




DE




DC




FL




GA




HI




ID




IL




IN




IA




KS




KY




LA




ME




MD




MA




MI




MN




MS




MO




MT




NE




NV




NH




NJ




NM




NY




NC




ND




OH




OK




OR




PA




RI




SC




SD




TN




TX




UT




VT




VA




WA




WV




WI




WY


