In [1]:
# Now we will begin to start implementing the ML algorithms. We will be eventually creating a pipeline for bagging and
# ensembling several estimators in order to create the most robust algorith.

In [2]:
# As usual, start out by importing the libraries we expect to use
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# importing pipeline, transformations, and CV/scoring

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.feature_selection import SelectKBest, f_regression

In [4]:
# setting up steps to import and split our data for the algorithm. However, we are going to scale our data in order to be more
# effective with the various ML algorithms
# next time set index to false for to_csv

df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
df['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']

# df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)
# old df, with corrected models we can now fit the extra patterns

In [6]:
# creating a quick transformer to unlog our predictions
def transformer(num):
    return np.exp(num)

In [7]:
# next time in original data exploration/transformation functions, incorporate dtype switch
# creating complete list of categorical variables, afterwards will re-merge df
categorical = []
for x in list(df.columns):
    if 'Non' in x:
        categorical.append(x)
    elif 'Has' in x:
        categorical.append(x)
    elif 'SubClass' in x:
        categorical.append(x)
    elif 'SaleConditionNor' in x:
        categorical.append(x)
    elif 'LandSlopeGtl' in x:
        categorical.append(x)
    elif 'LotShapeReg' in x:
        categorical.append(x)
    elif 'HeatingGas' in x:
        categorical.append(x)      
    elif 'CentralAir' in x:
        categorical.append(x)
    elif 'MoSold' in x:
        categorical.append(x)
    elif 'YrSold' in x:
        categorical.append(x)
    
for x in list(df.select_dtypes(include='object').columns):
    categorical.append(x)
categorical = list(set(categorical))

# merging converted dfs

df[categorical] = df[categorical].astype('object')
df_num = df.drop(categorical, axis = 1)
df = pd.concat([df[categorical],df_num], axis=1)

In [8]:
# creating the X and y variables

X = df.drop(['LogSale','SalePrice'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['LogSale']

# creating test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)

In [9]:
#importing regression models

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, RidgeCV, Lasso, Ridge, LinearRegression, ElasticNetCV, ElasticNet

In [10]:
# going to ignore warnings to ignore the deprecation warnings

import warnings
warnings.filterwarnings('ignore')

In [11]:
pipe_lasso = [('scaler', RobustScaler()),
             ('clf', LassoCV())]
clf_lasso = Pipeline(pipe_lasso)
params = {'clf__n_alphas': [1,100,500]}
grid = GridSearchCV(clf_lasso, param_grid = params, cv = 5)
best_lasso = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_lasso.fit(X_train,y_train)
lasso_preds = clf_lasso.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(lasso_preds))))
print(r2_score(transformer(y_test),transformer(lasso_preds)))

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.895655        0.007777          0.905028          0.00601
1         0.895655        0.007777          0.905028          0.00601
2         0.895655        0.007777          0.905028          0.00601
0   -0.009373
1   -0.009373
2   -0.009373
dtype: float64
0.1261680685815775
0.8827279650298142


In [12]:
pipe_lasso = [('scaler', RobustScaler()),
             ('clf', LassoCV(n_alphas = best_lasso['clf__n_alphas']))]
clf_lasso = Pipeline(pipe_lasso)
scores = cross_validate(clf_lasso, X_train, y_train, scoring = 'r2', return_train_score=True)
scores['train_score']-scores['test_score']

array([ 0.02484082,  0.01273519, -0.00208323])

In [13]:
pipe_elastic = [('scaler', RobustScaler()),
             ('clf', ElasticNetCV())]
clf_elastic = Pipeline(pipe_elastic)
params = {'clf__n_alphas': [1,100,500]}
grid = GridSearchCV(clf_elastic, param_grid = params, cv = 5)
best_elastic = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_elastic.fit(X_train,y_train)
elastic_preds = clf_elastic.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(elastic_preds))))
print(r2_score(transformer(y_test),transformer(elastic_preds)))

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.895125        0.007497          0.904397         0.006194
1         0.895125        0.007497          0.904397         0.006194
2         0.895125        0.007497          0.904397         0.006194
0   -0.009272
1   -0.009272
2   -0.009272
dtype: float64
0.12593926570931568
0.8830801435048478


In [14]:
pipe_elastic = [('scaler', RobustScaler()),
             ('clf', ElasticNetCV(n_alphas = best_elastic['clf__n_alphas']))]
clf_elastic = Pipeline(pipe_elastic)
scores = cross_validate(clf_elastic, X_train, y_train, scoring = 'r2', return_train_score=True)
scores['train_score']-scores['test_score']

array([ 0.02539887,  0.012705  , -0.00317776])

In [15]:
pipe_ridge = [('scaler', RobustScaler()),
             ('clf', RidgeCV())]
clf_ridge = Pipeline(pipe_ridge)
params = {'clf__cv': [3,5]}
grid = GridSearchCV(clf_ridge, param_grid = params, cv = 5)
best_ridge = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_ridge.fit(X_train,y_train)
ridge_preds = clf_ridge.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(ridge_preds))))
print(r2_score(transformer(y_test),transformer(ridge_preds)))

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.913286        0.002216          0.944489         0.000418
1         0.913286        0.002216          0.944489         0.000418
0   -0.031203
1   -0.031203
dtype: float64
0.11664547100297276
0.9095162264241541


In [16]:
best_ridge['clf__cv']

3

In [17]:
pipe_ridge = [('scaler', RobustScaler()),
             ('clf', RidgeCV(cv = best_ridge['clf__cv']))]
clf_ridge = Pipeline(pipe_ridge)
scores = cross_validate(clf_ridge, X_train, y_train, scoring = 'r2', return_train_score=True)
scores['train_score']-scores['test_score']

array([0.04234309, 0.04079464, 0.02094412])

In [18]:
# creating our transformer to deal with numerical data, have already dummied categorical variables so we don't need to address
# those variables. However, will address skewed numerical variables before putting through to the standard scaler for all 
# numerical variables.

# list of highly skewed numerical variables
numerical = list(X.select_dtypes(include = ['int64', 'float64']).columns)
skewed = X[numerical].skew(axis=0).reset_index()
skewed_vars = list(skewed[abs(skewed[0])>2]['index'])


# making the transformers
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'median')),
                                ('scaler', RobustScaler())])
skewed_transformer = Pipeline(steps = [('skew_scaler', PowerTransformer())])


# preprocessing function to be implemented into the pipeline
preprocess = ColumnTransformer(transformers=[('num', numerical_transformer, numerical)])

In [19]:
pipe_rf = [('preprocess', preprocess),
             ('clf', RandomForestRegressor())]
clf_rf = Pipeline(pipe_rf)
params = {'clf__max_depth': range(5,26,5),
         'clf__max_features': range(5,20,5),
          'clf__n_estimators': [10,15,20,25]}
grid = GridSearchCV(clf_rf, param_grid = params, cv = 5)
best_rf = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

    mean_test_score  std_test_score  mean_train_score  std_train_score
0          0.834681        0.027688          0.880553         0.002855
1          0.839129        0.020054          0.887750         0.002050
2          0.844494        0.024654          0.889248         0.004993
3          0.842844        0.020902          0.892781         0.002888
4          0.839532        0.017240          0.898908         0.002858
5          0.850406        0.021069          0.902509         0.002493
6          0.847872        0.022002          0.903336         0.003761
7          0.851918        0.020972          0.904531         0.001219
8          0.848599        0.020146          0.905151         0.002587
9          0.850507        0.014774          0.907382         0.002979
10         0.856578        0.016326          0.909372         0.002024
11         0.855356        0.020484          0.910799         0.002987
12         0.852089        0.015250          0.961767         0.002431
13    

In [20]:
print(best_rf)

{'clf__max_depth': 10, 'clf__max_features': 10, 'clf__n_estimators': 25}


In [21]:
pipe_rf = [('scaler', preprocess),
             ('clf', RandomForestRegressor(max_depth = best_rf['clf__max_depth'],
                                           max_features = best_rf['clf__max_features'],
                                           n_estimators = best_rf['clf__n_estimators']))]
clf_elastic = Pipeline(pipe_rf)
scores = cross_validate(clf_elastic, X_train, y_train, scoring = 'r2', return_train_score=True)
print('Train Scores')
print(scores['train_score'])
print()
print('Test Scores')
print(scores['test_score'])
print()
print('Train - Test Scores')
scores['train_score']-scores['test_score']

print()
print('Appears to be some overfitting with the random forest regressor, however, we will still try to use in our stacked model')

Train Scores
[0.97615269 0.97448584 0.97333396]

Test Scores
[0.85070971 0.87578994 0.88962127]

Train - Test Scores

Appears to be some overfitting with the random forest regressor, however, we will still try to use in our stacked model


In [22]:
# Overall, appears our models are doing well with not too high of variance and low bias. Going to create a stacked model with
# all four models

#going to reshuffle the train-test splot
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)

In [23]:
# setting up our first level predictions
clf_lasso = LassoCV(cv=5)
clf_elastic = ElasticNetCV(cv=5)
clf_ridge = RidgeCV(cv=5)

classifiers = [clf_lasso, clf_elastic, clf_ridge]
predictions = []
pred_cols = []

for x in classifiers:
    pipe_steps = [('preprocess', preprocess),
                 ('clf', x)]
    pipe = Pipeline(pipe_steps)
    pred_name = 'pred_' + str(x)[0: str(x).find('(')]
    pred_cols.append(pred_name)
    pipe.fit(X_train,y_train)
    predictions.append(transformer(pipe.predict(X_train)))
    
first_level_preds = pd.DataFrame({'pred_lasso': predictions[0],
                                 'pred_elastic': predictions[1],
                                 'pred_ridge': predictions[2]})

X_train_2nd = pd.concat([X_train.reset_index(),first_level_preds], axis=1).set_index(keys='index')

stacked = RidgeCV(cv = 5)
stacked.fit(X_train_2nd,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [24]:
# setting up our second level predictions

predictions = []
pred_cols = []

for x in classifiers:
    pipe_steps = [('preprocess', preprocess),
                 ('clf', x)]
    pipe = Pipeline(pipe_steps)
    pred_name = 'pred_' + str(x)[0: str(x).find('(')]
    pred_cols.append(pred_name)
    predictions.append(transformer(pipe.predict(X_test)))
    
test_level_preds = pd.DataFrame({'pred_lasso': predictions[0],
                                 'pred_elastic': predictions[1],
                                 'pred_ridge': predictions[2]})

X_test_1st = pd.concat([X_test.reset_index(),test_level_preds], axis=1).set_index(keys='index')

results = cross_val_score(stacked, X_train, y_train, scoring = 'r2', cv = 5)
print(results)

stacked_preds = stacked.predict(X_test_1st)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(stacked_preds))))
print(r2_score(transformer(y_test),transformer(stacked_preds)))

[0.91832222 0.92397493 0.93761154 0.88947616 0.91107905]
0.11670295122925003
0.9218382561389564


In [25]:
# There we go. As we can see, we were able to improve our model slightly by stacking our regressors. In the future, we could
# try to add some more regressors with high performance that were slightly less correlated with our other models to try improve
# the stacking process.

In [26]:
""" for if I decide to try with dropped variables, although no longer necessary 


drop_list = []
drop_list.append('Condition1')
drop_list.append('HasGarage')
drop_list.append('LotConfig')
drop_list.append('RoofStyle')
drop_list.append('BsmtFinType1')
drop_list.append('Functional')
drop_list.append('LandContour')
drop_list.append('GarageType')

for x in list(df.select_dtypes(include = ['int64', 'float64']).columns):
    if '*' in x:
        drop_list.append(x)
        
df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)
df = df.drop(drop_list, axis=1)
df['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']

categorical = []
for x in list(df.columns):
    if 'Non' in x:
        categorical.append(x)
    elif 'Has' in x:
        categorical.append(x)
    elif 'SubClass' in x:
        categorical.append(x)
    elif 'SaleConditionNor' in x:
        categorical.append(x)
    elif 'LandSlopeGtl' in x:
        categorical.append(x)
    elif 'LotShapeReg' in x:
        categorical.append(x)
    elif 'HeatingGas' in x:
        categorical.append(x)      
    elif 'CentralAir' in x:
        categorical.append(x)
    elif 'MoSold' in x:
        categorical.append(x)
    elif 'YrSold' in x:
        categorical.append(x)
    
for x in list(df.select_dtypes(include='object').columns):
    categorical.append(x)
categorical = list(set(categorical))

# merging converted dfs

df[categorical] = df[categorical].astype('object')
df_num = df.drop(categorical, axis = 1)
df = pd.concat([df[categorical],df_num], axis=1)

# creating the X and y variables

X = df.drop(['LogSale','SalePrice'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['LogSale']

numerical = list(X.select_dtypes(include = ['int64', 'float64']).columns)
skewed = X[numerical].skew(axis=0).reset_index()
skewed_vars = list(skewed[abs(skewed[0])>2]['index'])

# creating test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)"""

" for if I decide to try with dropped variables, although no longer necessary \n\n\ndrop_list = []\ndrop_list.append('Condition1')\ndrop_list.append('HasGarage')\ndrop_list.append('LotConfig')\ndrop_list.append('RoofStyle')\ndrop_list.append('BsmtFinType1')\ndrop_list.append('Functional')\ndrop_list.append('LandContour')\ndrop_list.append('GarageType')\n\nfor x in list(df.select_dtypes(include = ['int64', 'float64']).columns):\n    if '*' in x:\n        drop_list.append(x)\n        \ndf = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)\ndf = df.drop(drop_list, axis=1)\ndf['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']\n\ncategorical = []\nfor x in list(df.columns):\n    if 'Non' in x:\n        categorical.append(x)\n    elif 'Has' in x:\n        categorical.append(x)\n    elif 'SubClass' in x:\n        categorical.append(x)\n    elif 'SaleConditionNor' in x:\n        categorical.append(x)\n    elif 'LandSlopeGtl' in x:\n 