In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#We may want these at some point for transforming our output:
#from scipy.special import logit, expit

#Lift pandas' display truncation so wide/long frames render in full.
#NOTE: with max_rows unset, displaying a whole DataFrame prints every row -- prefer .head()/.sample().
pd.options.display.max_columns = None  #show every column
pd.options.display.max_rows = None     #show every row

In [2]:
#Relative path to the reduced training CSV (resolved against the notebook's working directory).
filepath = r'../data/data_reduced_train.csv'
#NOTE(review): the head() output further down shows an 'Unnamed: 0' column, which looks like
#a row index that was written into the CSV -- consider index_col=0 if it is not needed.
data = pd.read_csv(filepath)

#Execute the features_grouped notebook so the names it defines become available in this kernel.
#Presumably it defines the grouped feature lists used for modeling -- TODO confirm which names it creates
#(and that it does not overwrite `data` or `filepath`).
%run ../data/features_grouped.ipynb

In [6]:
#Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,FIPS,State,County,% Adults with Diabetes,% Adults Reporting Currently Smoking,% Adults with Obesity,Food Environment Index,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,% Driving Deaths with Alcohol Involvement,% Uninsured,Dentist Rate,% with Annual Mammogram,% Vaccinated,% Completed High School,% Unemployed,% Children in Poverty,Income Ratio,% Children in Single-Parent Households,Social Association Rate,Average Daily PM2.5,% Severe Housing Problems,% Drive Alone to Work,% Long Commute - Drives Alone,% Food Insecure,% Limited Access to Healthy Foods,% Insufficient Sleep,% Uninsured Children,Other Primary Care Provider Rate,School Funding Adequacy,Gender Pay Gap,Median Household Income,% Household Income Required for Child Care Expenses,% Voter Turnout,% Census Participation,Traffic Volume,% Homeowners,% Households with Severe Cost Burden,% Households with Broadband Access,Population,% Less than 18 Years of Age,% 65 and Over,% Black,% American Indian or Alaska Native,% Asian,% Native Hawaiian or Other Pacific Islander,% Hispanic,% Non-Hispanic White,% Not Proficient in English,% Female,% Rural
0,17027,Illinois,Clinton,8.7,16.7,34.8,9.0,25.8,63.809317,18.658612,14.285714,6.070879,32.5124,48.0,43.0,92.001064,2.951476,9.2,3.919867,16.250475,13.04596,8.8,9.181226,79.892449,39.5,7.0,3.741,32.7,3.365561,86.69972,-3699.384446,0.735249,79612.0,21.156358,65.471311,73.9,14.192289,79.555648,7.830308,87.890706,36909.0,21.236013,18.662115,3.243111,0.403696,0.655667,0.08399,3.541142,91.007613,0.535157,47.787802,80.216266
1,42071,Pennsylvania,Lancaster,8.2,16.7,35.2,8.8,23.4,80.948635,17.168046,22.709163,11.306447,59.1058,53.0,60.0,86.957844,3.524487,11.9,3.779021,15.501977,13.799282,11.1,14.051126,74.112554,29.9,7.4,4.853843,36.8,11.306698,114.79819,5941.193063,0.755715,82434.0,30.232671,69.184565,75.9,104.296025,69.948992,11.788695,85.387678,556629.0,22.864601,19.735587,3.722587,0.474823,2.778152,0.114798,11.589407,80.19363,2.150077,50.747266,27.87549
2,46003,South Dakota,Aurora,8.2,17.0,38.8,7.8,23.1,3.349108,21.089477,60.0,13.614801,0.0,52.0,31.0,91.299304,1.877934,14.7,3.314294,15.384615,10.917031,4.7,5.333333,75.939306,25.5,6.0,16.918054,29.6,11.163522,145.19056,-1155.78,0.783785,61013.0,20.880796,73.560209,55.1,5.316834,76.638177,7.763023,78.252612,2755.0,24.6098,21.125227,0.907441,2.903811,0.943739,0.0,8.45735,85.880218,1.615576,48.566243,100.0
3,46027,South Dakota,Clay,9.2,16.3,35.6,7.6,22.2,85.56825,22.062377,28.571429,13.246101,58.90052,50.0,60.0,93.4006,2.141792,17.1,5.0,24.854142,13.861386,5.8,19.581749,71.909959,24.3,9.8,11.06981,28.4,7.773338,137.43455,-629.3447,0.743883,55263.0,27.781698,50.536585,65.7,86.059956,51.089365,16.62444,89.844904,15280.0,16.950262,12.624346,1.760471,4.01178,2.729058,0.065445,3.331152,86.302356,0.406533,50.425393,22.101958
4,13205,Georgia,Mitchell,15.9,22.8,42.2,6.8,34.5,59.91726,13.354151,16.666667,17.431074,14.20724,50.0,36.0,83.451327,3.968447,32.8,6.162685,53.611394,9.757911,9.7,17.456359,84.045812,39.4,15.0,7.944094,41.0,6.642538,61.56469,-17591.665,0.691094,45296.0,25.927234,53.574417,51.7,20.260255,63.60424,15.178571,77.801615,21116.0,22.589506,18.407842,45.87043,0.672476,0.975564,0.0663,5.479257,46.12616,0.725704,49.720591,75.775684


In [7]:
#set parameters and other important presets

#model / imputation settings
xgb_parameters = dict()   #keyword arguments unpacked into XGBRegressor; empty -> library defaults
n_neighbors = 10          #neighbors for kNN imputation

#demographic cut-offs (in percent): a county joins a subset when its share is >= the cut-off
percent_asian_threshold = 50
percent_black_threshold = 50
percent_hispanic_threshold = 50
percent_white_threshold = 50

In [None]:
#Build one subset per demographic group: rows whose share meets or exceeds that group's threshold.
#Note these are filters, not a partition -- a county below every threshold appears in no subset.

high_asian_counties = data.loc[data['% Asian'].ge(percent_asian_threshold)]
high_black_counties = data.loc[data['% Black'].ge(percent_black_threshold)]
high_hispanic_counties = data.loc[data['% Hispanic'].ge(percent_hispanic_threshold)]
high_white_counties = data.loc[data['% Non-Hispanic White'].ge(percent_white_threshold)]

In [10]:
#Number of counties in the high Non-Hispanic White subset.
high_white_counties.shape[0]

2196

In [None]:
#run cross-validation OR a simple train/holdout split, depending on data size, on xgb and linear models, with parameters from Shravan's notebook

#Two-step pipeline: fill missing values with kNN imputation, then fit an XGBoost regressor.
#KNNImputer accepts keyword arguments only, so n_neighbors must be passed by name
#(passing it positionally raises a TypeError).
xgb_pipe = Pipeline([
    ('impute', KNNImputer(n_neighbors=n_neighbors)),
    ('xgb', XGBRegressor(**xgb_parameters)),  #xgb_parameters is unpacked as keyword arguments
])

#TODO: linear_pipe
#TODO: baseline models
#TODO: random forest regressor (RFR)?

In [None]:
#plot feature importances

In [None]:
#modeling on reduced subset of features? Perhaps in a new notebook, or the initial modeling notebook