In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#We may want these at some point for transforming our output:
#from scipy.special import logit, expit

# Disable pandas' display truncation so DataFrames render in full.
pd.options.display.max_columns = None  # show every column
pd.options.display.max_rows = None     # show every row

In [None]:
# Location of the training CSV (relative path).
filepath = '../data/data-reduced-train.csv'

# Read the CSV into the DataFrame used by the later cells.
data = pd.read_csv(filepath_or_buffer=filepath)

In [61]:
# Minimum share (in the units of the '% ...' columns) a group must reach
# for a row to be placed in that group's subset.
percent_asian_threshold = 5
percent_black_threshold = 10
percent_hispanic_threshold = 10
percent_white_threshold = 50

# Row subsets of `data` at or above each threshold. A row can land in
# several subsets because the filters are applied independently.
high_asian_counties = data.loc[data['% Asian'].ge(percent_asian_threshold)]
high_black_counties = data.loc[data['% Black'].ge(percent_black_threshold)]
high_hispanic_counties = data.loc[data['% Hispanic'].ge(percent_hispanic_threshold)]
high_white_counties = data.loc[data['% Non-Hispanic White'].ge(percent_white_threshold)]

# Label -> subset mapping iterated by the modelling cell.
# 'High Asian Counties' (high_asian_counties) is deliberately left out for now.
race_splits = dict([
    ('High Black Counties', high_black_counties),
    ('High Hispanic Counties', high_hispanic_counties),
    ('High White Counties', high_white_counties),
])

In [22]:
# Execute the helper notebooks so the names they define become available in this notebook.
# NOTE(review): later cells use extract_feature_names_by_category and the
# xgb_best_params_* variables, which are not defined in this notebook --
# presumably they come from these two notebooks; confirm.
%run ../data/features-grouped.ipynb
%run ../data/model-information.ipynb

In [24]:
# Feature-name lists for each category, in the same order as the category labels below.
(health_behaviors_features,
 access_to_care_features,
 socio_economic_features,
 physical_environment_features,
 demographics_features) = (
    extract_feature_names_by_category(data, category)
    for category in ('Health Behaviors',
                     'Access to Care',
                     'Socio-economic Factors',
                     'Physical Environment',
                     'Demographics')
)

# Column layout is positional: index 3 is the target, everything from index 4 on is a feature.
target = data.columns[3] #'% Adults with Diabetes'
all_features = data.columns.to_list()[4:]

In [28]:
# Registry of feature-subset models: display name -> feature list and XGBoost parameters.
models = {
    label: {'Features': feature_list, 'Parameters': best_params}
    for label, feature_list, best_params in (
        ("Model 1 (All Features)", all_features, xgb_best_params_full),
        ("Model 2 (Health Behaviors)", health_behaviors_features, xgb_best_params_health),
        ("Model 3 (Access to Care)", access_to_care_features, xgb_best_params_care),
        ("Model 4 (Socio-economic Factors)", socio_economic_features, xgb_best_params_socioecon),
        ("Model 5 (Physical Environment)", physical_environment_features, xgb_best_params_environment),
        ("Model 6 (Demographics)", demographics_features, xgb_best_params_demographic),
    )
}

In [None]:
#run cross-validation OR a simple train/holdout split, depending on data size, on xgb and linear models
#NOTE(review): only the train/holdout split and the xgb pipeline exist in this cell so far

n_neighbors = 10 #number of neighbors passed to KNNImputer in the pipeline below

def model_error(model, df, features, target):
    """Return the RMSE of ``model``'s predictions on ``df``.

    Parameters
    ----------
    model : object exposing ``predict``; called with ``df[features]``.
    df : table indexable by ``features`` and ``target``.
    features : column selector for the model inputs.
    target : column selector for the true values.

    Returns
    -------
    The value of ``root_mean_squared_error(df[target], predictions)``.
    """
    predictions = model.predict(df[features])
    actual = df[target]
    return root_mean_squared_error(actual, predictions)

# Train and evaluate every feature-subset model on every race-based subset.
#
# Inputs (defined in earlier cells): race_splits, models, target, n_neighbors, model_error.
# Output: results[race_split][model_name] -> {'Training RMSE': ..., 'Validation RMSE': ...}
#
# NOTE: `models` must not be re-initialised here. Resetting it to {} emptied the
# registry built in the earlier cell, so the inner loop never ran and no model
# was ever fitted or scored.
results = {}

for race_split, df in race_splits.items():
    #split data into train/val sets (10% held out; fixed seed keeps the split reproducible)
    df_train, df_val = train_test_split(df, test_size=0.1, random_state=42)

    results[race_split] = {}

    for model_name, model_info in models.items():
        features = model_info['Features']         #list of features for feature subset model
        parameters = model_info['Parameters']     #dict of best parameters for the model

        #instantiate a fresh pipeline per model so no fitted state is shared;
        #the imputer is fitted on the training rows only, inside the pipeline
        xgb_pipe = Pipeline([('impute', KNNImputer(n_neighbors=n_neighbors)),
                             ('xgb', XGBRegressor(**parameters))])

        #fit model to train set
        xgb_pipe.fit(df_train[features], df_train[target])

        #compute and store training / validation RMSEs
        results[race_split][model_name] = {
            'Training RMSE': model_error(xgb_pipe, df_train, features, target),
            'Validation RMSE': model_error(xgb_pipe, df_val, features, target),
        }


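#TODO: add linear_pipe (linear-regression pipeline)
#TODO: add baseline models
#TODO: add a random forest model? (noted as "RFC"; RandomForestRegressor is the class imported above)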

In [None]:
#plot feature importances