In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#We may want these at some point for transforming our output:
#from scipy.special import logit, expit

In [3]:
# Execute two sibling notebooks from ../data with IPython's %run magic before
# any data is loaded.
# NOTE(review): presumably these provide feature groupings and model metadata
# used by later cells -- confirm which names they define; none are referenced
# in the cells visible here.
%run ../data/features-grouped.ipynb
%run ../data/model-information.ipynb

In [7]:
# Read the CSV at the relative path below into the DataFrame every later cell
# works from (the file name suggests it is the reduced training split).
filepath = '../data/data-reduced-train.csv'
data = pd.read_csv(filepath_or_buffer=filepath)

In [8]:
# Number of neighbors for kNN imputation in pipelines that are not tuned by
# the grid search. The linear-regression pipeline later in this notebook reads
# this name, so it must be defined for a clean top-to-bottom run.
n_neighbors = 10

# Name of the column the models predict.
target = '% Adults with Diabetes'

# Predictors: every column from position 4 onward.
# NOTE(review): the first four columns are presumably identifiers -- confirm.
# The target is dropped explicitly so it can never be used as its own
# predictor; errors='ignore' makes this a no-op when the target is not in the
# slice.
features = data.columns[4:].drop(target, errors='ignore')

In [15]:
# Pipeline being tuned: kNN imputation of missing values, then an XGBoost
# regressor.
xgb_pipe = Pipeline([('impute', KNNImputer()),
                     ('xgb', XGBRegressor())])

# Search space. Keys follow the '<step name>__<parameter>' convention so each
# entry is routed to the matching pipeline step.
# Size: 3 * 3 * 3 * 5 * 3 * 3 = 1215 candidate settings.
param_grid = {
    'impute__n_neighbors': [5, 10, 15],
    'xgb__n_estimators': [50, 100, 150],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': np.linspace(0.01, 0.2, 5),
    'xgb__reg_alpha': [0, 0.1, 1],
    'xgb__reg_lambda': [1, 5, 10]
}

# Exhaustive 3-fold search scored by negative MSE (higher is better).
# 1215 candidates x 3 folds = 3645 pipeline fits, each of which repeats the
# kNN imputation, so the fits are spread across all available cores with
# n_jobs=-1 instead of running one at a time.
xgb_grid = GridSearchCV(xgb_pipe,
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error',
                        cv=3,
                        n_jobs=-1)

In [16]:
# Run the grid search: predictors are the `features` columns of `data`,
# the response is the `target` column.
xgb_grid.fit(X=data[features], y=data[target])

KeyboardInterrupt: 

In [None]:
# Linear-regression pipeline: kNN imputation -> standardization -> ordinary
# least squares. `n_neighbors` is read from the notebook namespace and must be
# defined by an earlier cell.
linear_pipe = Pipeline(
    steps=[
        ('impute', KNNImputer(n_neighbors=n_neighbors)),
        # Standardize so fitted coefficient sizes can be compared.
        ('scale', StandardScaler()),
        ('linreg', LinearRegression()),
    ]
)

In [None]:
# Random-forest pipeline: kNN imputation (default settings) followed by a
# random-forest regressor. random_state is pinned so repeated runs of the
# notebook build the same forest and report the same scores.
forest_pipe = Pipeline([('impute', KNNImputer()),
                        ('rfr', RandomForestRegressor(random_state=42))])