# Regression (...continued)

## further methods & comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import statsmodels.api as sm
import seaborn as sns
import pickle

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Replace the module-level function so later `warnings.warn(...)` calls do nothing.
warnings.warn = warn


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score,  mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.utils import resample


# Set up plotting options for seaborn and matplotlib
# seaborn: use the 'notebook' context and the 'ticks' style
sns.set_context('notebook') 
sns.set_style('ticks') 
# IPython magic (not plain Python): only valid when run inside a notebook/IPython session
%matplotlib inline
# Default figure size as (width, height)
# NOTE(review): presumably set last so the calls above cannot override it - confirm
plt.rcParams['figure.figsize'] = (9, 6)

In [None]:
# load from previous lessons
# Each pickle is bound to a notebook-level variable named after its file,
# e.g. 'models/ames_ridge.pickle' -> ames_ridge.
# NOTE(security): unpickling can execute arbitrary code; only load files that
# you produced yourself.
cached_files = ['models/ames_train_y.pickle','models/ames_test_y.pickle',
                'models/ames_train_X.pickle','models/ames_test_X.pickle',
                'models/predictors.pickle','models/ames_ols_all.pickle',
                'models/ames_ridge.pickle','models/ames_lasso.pickle', 
                'models/ames_enet.pickle']

for file in cached_files:
    # Strip the directory prefix and the extension to get the variable name.
    objectname = file.replace('models/', '').replace('.pickle', '')
    # The context manager closes the file; no explicit close() is needed.
    with open(file, 'rb') as f:
        # Bind through globals() rather than exec(): same top-level variable,
        # without building and executing source code from a string.
        globals()[objectname] = pickle.load(f)

## Random Forest
In a random forest, each tree in the ensemble is built from a bootstrap sample drawn from the training set. In addition, when splitting a node during the construction of a tree, the split that is chosen is the best split among a random subset of the features. For classification, the scikit-learn implementation combines the trees by averaging their probabilistic predictions, instead of letting each tree vote for a single class; for regression, as used here, the forest's prediction is the average of the individual trees' predictions.

In [None]:
# Random-forest tuning grid. Two hyper-parameters are searched:
#   n_estimators      currently the single value 50 (wider sweep: np.arange(50,120,30))
#   min_samples_leaf  currently the single value 3  (wider sweep: np.arange(2,7,2))
nEstimators_arr = [50]
minSampLeaf_arr = [3]

param_grid_RF = dict(
    n_estimators=nEstimators_arr,
    min_samples_leaf=minSampLeaf_arr,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Two-step pipeline: standardise the predictors, then grid-search the random
# forest over param_grid_RF, scoring each candidate by R^2 with cv=10.
ames_RF = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', GridSearchCV(RandomForestRegressor(), param_grid_RF, scoring='r2', cv=10))
])

## Toggle comment below to build model
ames_RF.fit(ames_train_X, ames_train_y)
# Write through a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
with open('models/ames_rforest.pickle', 'wb') as f:
    pickle.dump(ames_RF, f)
#with open('models/ames_rforest.pickle', 'rb') as f:
#    ames_RF = pickle.load(f)

In [None]:
# The grid search is the pipeline step named 'estimator'; take the best
# estimator it found and show its settings.
rf_search = ames_RF.named_steps['estimator']
best_RF = rf_search.best_estimator_
print(best_RF)

> ## Challenge 1
>
> 1. Look at the parameters of the best model printed above. How many trees are worth combining?
> 2. Which minimum leaf size is best?
> 
> {: .source}
>
> > ## Solution
> > 
> > 1.
> > 2. 
> > {: .output}
> {: .solution}
{: .challenge}

In [None]:
def plot_coefficients(model, labels):
    """Draw a horizontal bar chart of a model's twenty largest feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    labels : sequence
        Feature names, one per importance value; used as the bar labels.

    Returns
    -------
    tuple
        ``(fig, ax)``: the matplotlib figure and axes that hold the chart.
    """
    importance = pd.Series(np.ravel(model.feature_importances_), index=labels)

    # Rank by absolute value and keep the twenty largest, then order the kept
    # values ascending for plotting.
    top_labels = importance.abs().sort_values(ascending=False).iloc[:20].index
    table = importance[top_labels].sort_values(ascending=True)

    fig, ax = plt.subplots()
    table.plot(kind='barh', edgecolor='black', width=0.7, linewidth=.8, alpha=0.9, ax=ax)
    ax.tick_params(axis=u'y', length=0)  # hide the tick marks on the label axis
    ax.set_title('Feature Importances (twenty largest in absolute value)', fontsize=14)
    sns.despine()
    return fig, ax

In [None]:
# Chart the tuned random forest's feature importances, labelled with the predictor names.
importance_fig, importance_ax = plot_coefficients(model=best_RF, labels=predictors)
plt.show()

## k-Nearest Neighbours Regression

In [None]:
# k-nearest-neighbours tuning grid. Two hyper-parameters are searched:
#   n_neighbors  currently the single value 10        (wider sweep: np.arange(4,20,2))
#   weights      currently the single value 'uniform' (wider sweep: ['uniform','distance'])
nNeighbors_arr = [10]
weights_arr = ['uniform']

param_grid_kNN = dict(
    n_neighbors=nNeighbors_arr,
    weights=weights_arr,
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Two-step pipeline: standardise the predictors, then grid-search the kNN
# regressor over param_grid_kNN, scoring each candidate by R^2 with cv=10.
ames_kNN = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', GridSearchCV(KNeighborsRegressor(), param_grid_kNN, scoring='r2', cv=10))
])


## Toggle comment below to build model
ames_kNN.fit(ames_train_X, ames_train_y)
# Write through a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
with open('models/ames_knn.pickle', 'wb') as f:
    pickle.dump(ames_kNN, f)
#with open('models/ames_knn.pickle', 'rb') as f:
#    ames_kNN = pickle.load(f)


In [None]:
# The grid search is the pipeline step named 'estimator'; take the best
# estimator it found and show its settings.
knn_search = ames_kNN.named_steps['estimator']
best_kNN = knn_search.best_estimator_
print(best_kNN)

## Compare Models

In [None]:
# RMSE of every fitted model on the training data.
columns = ['Train RMSE']
rows = ['OLS', 'Ridge', 'Lasso', 'ENet', 'Random Forest', 'k Nearest Neighbours']
# Same order as `rows`: entry k of `methods` is reported under label k.
methods = [ames_ols_all, ames_ridge, ames_lasso, ames_enet, ames_RF, ames_kNN]

# Targets and predictions are both raised to the power of ten before scoring
# (presumably undoing a log10 transform of the target - confirm).
train_rmse = [
    np.sqrt(mean_squared_error(10**ames_train_y, 10**fitted.predict(ames_train_X)))
    for fitted in methods
]
results = pd.DataFrame(train_rmse, index=rows, columns=columns)

results.round(2)

In [None]:
# RMSE of every fitted model on the held-out test data, for comparison.
columns = ['Test RMSE']
rows = ['OLS', 'Ridge', 'Lasso', 'ENet', 'Random Forest', 'k Nearest Neighbours']
# Same order as `rows`: entry k of `methods` is reported under label k.
methods = [ames_ols_all, ames_ridge, ames_lasso, ames_enet, ames_RF, ames_kNN]

# Targets and predictions are both raised to the power of ten before scoring
# (presumably undoing a log10 transform of the target - confirm).
test_rmse = [
    np.sqrt(mean_squared_error(10**ames_test_y, 10**fitted.predict(ames_test_X)))
    for fitted in methods
]
results = pd.DataFrame(test_rmse, index=rows, columns=columns)

results.round(2)