In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#We may want these at some point for transforming our output:
#from scipy.special import logit, expit

# Show DataFrames in full: remove the cap on displayed columns and displayed rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [8]:
# Read the reduced training set; the path is relative, so it is resolved
# against the kernel's current working directory.
filepath = r'../data/data-reduced-train.csv'
data = pd.read_csv(filepath_or_buffer=filepath)

%run ../data/features-grouped.ipynb
%run ../data/model-information.ipynb

# Columns are picked by position: index 3 is the target, and every column
# from index 4 onward is used as a feature.
column_names = data.columns
target = column_names[3]
features = column_names[4:].to_list()

In [None]:
# Set parameters and other important presets
n_neighbors = 10                        # neighbors for kNN imputation
xgb_parameters = xgb_best_params_full   # from model-information.ipynb

# Map each model label to the list of features that model uses
feature_sets = dict([
    ("Model 1 (All Features)", all_features),
    ("Model 2 (Health Behaviors)", health_behaviors_features),
    ("Model 3 (Access to Care)", access_to_care_features),
    ("Model 4 (Socio-economic Factors)", socio_economic_features),
    ("Model 5 (Physical Environment)", physical_environment_features),
    ("Model 6 (Demographics)", demographics_features),
])

# Thresholds compared against the '% <group>' columns when the counties are
# split into subsets; all four currently share the same value.
percent_black_threshold = percent_white_threshold = 50
percent_asian_threshold = percent_hispanic_threshold = 50

In [4]:
# Build one subset per group: the rows of `data` whose percentage column is
# greater than or equal to that group's threshold. A row can appear in more
# than one subset, or in none.
high_white_counties = data.loc[data['% Non-Hispanic White'] >= percent_white_threshold]
high_hispanic_counties = data.loc[data['% Hispanic'] >= percent_hispanic_threshold]
high_black_counties = data.loc[data['% Black'] >= percent_black_threshold]
high_asian_counties = data.loc[data['% Asian'] >= percent_asian_threshold]

In [7]:
# Label -> subset DataFrame; the labels are reused in the RMSE result keys
race_split_dict = dict(zip(
    ('High Asian Counties',
     'High Black Counties',
     'High Hispanic Counties',
     'High White Counties'),
    (high_asian_counties,
     high_black_counties,
     high_hispanic_counties,
     high_white_counties),
))

In [None]:
# Run either cross-validation or a simple train/holdout split (depending on
# data size) on the XGBoost and linear models.

# Result stores: descriptive label -> RMSE
training_rmses, val_rmses = {}, {}

def model_error(model, df, features, target):
    """Return the RMSE of a fitted model's predictions on a DataFrame.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(df[features])``.
    df : DataFrame-like
        Indexed with ``features`` for the inputs and ``target`` for the truth.
    features : list
        Column labels passed to the model.
    target : label
        Column holding the true values.

    Returns
    -------
    The value returned by ``root_mean_squared_error(df[target], predictions)``.
    """
    predictions = model.predict(df[features])
    actual = df[target]
    return root_mean_squared_error(actual, predictions)

# Fit and score one XGBoost pipeline per county subset.
# .items() is required: iterating the dict directly yields only the keys, so
# unpacking into (key, df) would fail.
for key, df in race_split_dict.items():
    # train_test_split cannot produce a non-empty train set from fewer than
    # two rows, and a threshold subset may well be that small; skip it
    # loudly instead of aborting the whole loop.
    if len(df) < 2:
        print(f'Skipping {key}: only {len(df)} row(s), too few to split.')
        continue

    # split data into train/val sets (fixed seed so reruns give the same split)
    df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

    # instantiate model(s); a fresh pipeline is built for every subset.
    # KNNImputer's parameters are keyword-only, so n_neighbors is passed by name.
    xgb_pipe = Pipeline([('impute', KNNImputer(n_neighbors=n_neighbors)),
                         ('xgb', XGBRegressor(**xgb_parameters))])

    # fit model(s) to train set
    xgb_pipe.fit(df_train[features], df_train[target])

    # Compute and store training RMSEs
    training_rmses['Training RMSE on ' + key] = model_error(xgb_pipe, df_train, features, target)
    # Compute and store validation RMSEs on the held-out rows
    val_rmses['Validation RMSE on ' + key] = model_error(xgb_pipe, df_val, features, target)



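# TODO: add a linear-regression pipeline (linear_pipe)
# TODO: add baseline models
# TODO: consider a random forest model as well ("RFC?" in the original note; presumably RandomForestRegressor, confirm)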

In [None]:
#plot feature importances

In [None]:
#modeling on reduced subset of features? Perhaps in a new notebook, or the initial modeling notebook