In [5]:
# Import our libraries.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [2]:
df = pd.read_csv('./datasets/df_corr_droptop_highcorr.csv')

In [3]:
# Create X and y variables; train/test split

X = df.drop(columns = ['CPI'])
y = df['CPI']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Modeling
Evaluate four regression models (Ridge, Lasso, KNeighbors, Random Forest) to determine how to boost and/or stack. 

In [11]:
# Instantiate pipelines 

# RidgeCV
ridge_cv_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('ridge_cv', RidgeCV())
])

# LassoCV
lasso_cv_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('lasso_cv', LassoCV())
])

# KNeighborsRegressor
knn_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

# RandomForestRegressor
rf_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('rf', RandomForestRegressor())
])

In [47]:
# Set parameters for each pipeline

# RidgeCV pipeline parameters
ridge_cv_pipeline_params = {
    'ridge_cv__alphas': range(1,11)
}

# LassoCV pipeline parameters
lasso_cv_pipeline_params = {
    'lasso_cv__alphas': [None]
}

# KNeighborsRegressor pipeline parameters
knn_pipeline_params = {
    'knn__n_neighbors': range(1, 50, 2)
}

# RandomForestRegressor pipeline parameters
rf_pipeline_params = {
    'rf__n_estimators': range(250, 500, 50),
    'rf__max_depth': [None, 5, 10]
}

## Instantiate GridSearchCV objects

In [48]:
ridge_cv_gs = GridSearchCV(ridge_cv_pipe,
                       ridge_cv_pipeline_params,
                       cv = 5)

lasso_cv_gs = GridSearchCV(lasso_cv_pipe,
                       lasso_cv_pipeline_params,
                       cv = 5)

knn_gs = GridSearchCV(knn_pipe,
                       knn_pipeline_params,
                       cv = 5)

rf_gs = GridSearchCV(rf_pipe,
                       rf_pipeline_params,
                       cv = 5)

## Fit train data to each gridsearch

In [51]:
ridge_cv_gs.fit(X_train, y_train)

In [52]:
lasso_cv_gs.fit(X_train, y_train)

In [53]:
knn_gs.fit(X_train, y_train)

In [49]:
rf_gs.fit(X_train, y_train)

In [54]:
# Output best parameters, best score, test score

gs_dict = {'RidgeCV' : ridge_cv_gs, 
           'LassoCV' : lasso_cv_gs, 
           'KNeighborsRegressor' : knn_gs, 
           'RandomForestRegressor' : rf_gs, 
          }

for key, value in gs_dict.items():
    print('=' * 40)
    print(key)
    print(f'Train Score: {(value.best_score_).round(3)}')
    print(f'Test Score: {(value.score(X_test, y_test)).round(3)}')
    print(f'Best Parameters: {value.best_params_}')

RidgeCV
Train Score: 0.838
Test Score: 0.836
Best Parameters: {'ridge_cv__alphas': 5}
LassoCV
Train Score: 0.837
Test Score: 0.837
Best Parameters: {'lasso_cv__alphas': None}
KNeighborsRegressor
Train Score: 0.969
Test Score: 0.972
Best Parameters: {'knn__n_neighbors': 1}
RandomForestRegressor
Train Score: 0.97
Test Score: 0.975
Best Parameters: {'rf__max_depth': None, 'rf__n_estimators': 350}


## Boost KNN

In [64]:
ada = AdaBoostRegressor(estimator = KNeighborsRegressor())

ada_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.9, 1.1]
}

gs = GridSearchCV(ada, param_grid=ada_params, cv = 3)

gs.fit(X_train, y_train)

In [65]:
print('Boosted KNN')
print(f'Train Score: {(gs.best_score_).round(3)}')
print(f'Test Score: {(gs.score(X_test, y_test)).round(3)}')
print(f'Best Parameters: {gs.best_params_}')

Boosted KNN
Train Score: -0.137
Test Score: -0.303
Best Parameters: {'learning_rate': 0.9, 'n_estimators': 50}
