In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#We may want these at some point for transforming our output:
#from scipy.special import logit, expit

In [None]:
# Execute the two helper scripts; after each one finishes, %run copies the
# names it defined into this notebook's namespace. (The hyphenated file names
# could not be loaded with a plain `import` statement.)
# NOTE(review): presumably these provide grouped feature lists and model
# metadata, but nothing they define is referenced in the cells visible here —
# confirm they are still needed.
%run ../data/features-grouped.py
%run ../data/model-information.py

In [7]:
# Read the training CSV; the path is relative to the directory this
# notebook runs from.
filepath = "../data/data-reduced-train.csv"
data = pd.read_csv(filepath_or_buffer=filepath)

In [27]:
# Hold out 20% of the rows for validation. The fixed seed makes the split
# identical on every run.
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
# Response variable the models predict.
target = '% Adults with Diabetes'

# Every column from position 4 onward is used as a model input.
# NOTE(review): this assumes the first four columns (which must include the
# target) are not predictors — confirm against the CSV header.
# The kNN neighbour count is set on each imputer where the pipelines are built.
features = data.columns[4:]

In [28]:
# Template pipeline for the search: kNN-impute missing values, then fit
# gradient-boosted trees. The grid below fills in the hyperparameters.
xgb_pipe = Pipeline(steps=[
    ('impute', KNNImputer()),
    ('xgb', XGBRegressor()),
])

# Search space, keyed as <step name>__<parameter name>.
# 2 * 3 * 3 * 2 * 3 * 3 = 324 candidate settings.
param_grid = dict(
    impute__n_neighbors=[10, 15],
    xgb__n_estimators=[50, 100, 150],
    xgb__max_depth=[3, 5, 7],
    xgb__learning_rate=[0.1, 0.15],
    xgb__reg_alpha=[0, 0.1, 1],
    xgb__reg_lambda=[0, 1, 5],
)

# Exhaustive 3-fold cross-validated search. The scorer is the negated RMSE,
# so scores closer to zero are better.
xgb_grid = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
)

In [29]:
# Run the search on the training split only, so data_val stays unseen.
# Every candidate in param_grid is fit once per CV fold, so this cell is slow.
xgb_grid.fit(X=data_train[features], y=data_train[target])

  _data = np.array(data, dtype=dtype, copy=copy,


In [34]:
# Hyperparameter combination that achieved the best mean cross-validated score.
xgb_grid.best_params_

{'impute__n_neighbors': 10,
 'xgb__learning_rate': 0.15,
 'xgb__max_depth': 3,
 'xgb__n_estimators': 150,
 'xgb__reg_alpha': 0,
 'xgb__reg_lambda': 5}

In [35]:
# Mean cross-validated score of the best candidate. The scorer is
# 'neg_root_mean_squared_error', so this is the RMSE with its sign flipped.
xgb_grid.best_score_

np.float64(-0.522992124902723)

In [36]:
# Show the pipeline that the search refit using the best parameters.
xgb_grid.best_estimator_

In [37]:
# GridSearchCV was built with the default refit=True, so after the search it
# has already refit the winning pipeline on everything passed to
# xgb_grid.fit (all of data_train). best_estimator_ is therefore ready to
# predict; calling .fit on it again would only repeat that work.
model = xgb_grid.best_estimator_

# In-sample (training) RMSE of the tuned pipeline.
training_preds = model.predict(data_train[features])
root_mean_squared_error(data_train[target], training_preds)

np.float64(0.28639533372919745)

In [38]:
# RMSE on the held-out validation split, which was not passed to the
# grid search.
val_preds = model.predict(data_val[features])
root_mean_squared_error(y_true=data_val[target], y_pred=val_preds)

np.float64(0.5201252506177477)

In [None]:
# Hand-set XGBRegressor hyperparameters for the pipeline fit on the full
# training file.
# NOTE(review): these differ from the xgb_grid.best_params_ output shown
# earlier in this notebook (max_depth 3, learning_rate 0.15, reg_alpha 0,
# reg_lambda 5) — confirm whether that is intentional or left over from an
# earlier search.
params = dict(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    reg_alpha=1,
    reg_lambda=1,
)

In [24]:
# Pipeline with fixed hyperparameters. It is bound to its own name only: the
# earlier chained assignment (`xgb_pipe_new = xgb_pipe = ...`) also rebound
# xgb_pipe, silently replacing the untuned template pipeline that the grid
# search cell is built from.
xgb_pipe_new = Pipeline([('impute', KNNImputer(n_neighbors=15)),
                         ('xgb', XGBRegressor(**params))])

# Fit on the whole training file (train and validation rows together).
xgb_pipe_new.fit(data[features], data[target])

In [None]:
# In-sample check only: xgb_pipe_new was fit on these same rows, so this
# RMSE measures fit to the training data, not generalisation.
preds = xgb_pipe_new.predict(data[features])
root_mean_squared_error(y_true=data[target], y_pred=preds)

np.float64(0.1670568370947467)

In [39]:
# Linear baseline: kNN imputation, standardisation, ordinary least squares.
linear_pipe = Pipeline(steps=[
    ('impute', KNNImputer(n_neighbors=10)),
    # Standardising puts all coefficients on one scale so their sizes can
    # be compared.
    ('scale', StandardScaler()),
    ('linreg', LinearRegression()),
])

linear_pipe.fit(X=data_train[features], y=data_train[target])


In [43]:
# Predict on both splits with the fitted linear baseline.
linear_training_preds = linear_pipe.predict(data_train[features])
linear_val_preds = linear_pipe.predict(data_val[features])

# Print the training RMSE, then the validation RMSE, one per line.
print(root_mean_squared_error(data_train[target], linear_training_preds),
      root_mean_squared_error(data_val[target], linear_val_preds),
      sep='\n')

0.472038732165765
0.4969364285883717


In [None]:
# kNN imputation followed by a random forest. The forest draws bootstrap
# samples at random, so the seed is pinned (same value as the
# train/validation split) to make results repeatable across kernel restarts.
forest_pipe = Pipeline([('impute', KNNImputer()),
                        ('rfr', RandomForestRegressor(random_state=42))])