In [1]:
# Now we will begin to start implementing the ML algorithms. We will be eventually creating a pipeline for bagging and
# ensembling several estimators in order to create the most robust algorith.

In [2]:
# As usual, start out by importing the libraries we expect to use
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# importing pipeline, transformations, and CV/scoring

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.feature_selection import SelectKBest, f_regression

In [4]:
# setting up steps to import and split our data for the algorithm. However, we are going to scale our data in order to be more
# effective with the various ML algorithms
# next time set index to false for to_csv

df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
df['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']

# df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)
# old df, with corrected models we can now fit the extra patterns

In [5]:
# creating a quick transformer to unlog our predictions
def transformer(num):
    return np.exp(num)

In [6]:
# next time in original data exploration/transformation functions, incorporate dtype switch
# creating complete list of categorical variables, afterwards will re-merge df
categorical = []
for x in list(df.columns):
    if 'Non' in x:
        categorical.append(x)
    elif 'Has' in x:
        categorical.append(x)
    elif 'SubClass' in x:
        categorical.append(x)
    elif 'SaleConditionNor' in x:
        categorical.append(x)
    elif 'LandSlopeGtl' in x:
        categorical.append(x)
    elif 'LotShapeReg' in x:
        categorical.append(x)
    elif 'HeatingGas' in x:
        categorical.append(x)      
    elif 'CentralAir' in x:
        categorical.append(x)
    elif 'MoSold' in x:
        categorical.append(x)
    elif 'YrSold' in x:
        categorical.append(x)
    
for x in list(df.select_dtypes(include='object').columns):
    categorical.append(x)
categorical = list(set(categorical))

# merging converted dfs

df[categorical] = df[categorical].astype('object')
df_num = df.drop(categorical, axis = 1)
df = pd.concat([df[categorical],df_num], axis=1)

In [7]:
# creating the X and y variables

X = df.drop(['LogSale','SalePrice'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['LogSale']

# creating test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)

In [8]:
#importing regression models

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, RidgeCV, Lasso, Ridge, LinearRegression, ElasticNetCV, ElasticNet

In [9]:
# going to ignore warnings to ignore the deprecation warnings

import warnings
warnings.filterwarnings('ignore')

In [10]:
pipe_lasso = [('scaler', StandardScaler()),
             ('clf', Lasso())]
clf_lasso = Pipeline(pipe_lasso)
params = {'clf__alpha': [0.001,0.01,0.1,1,10]}
grid = GridSearchCV(clf_lasso, param_grid = params, cv = 3,return_train_score=True)
best_lasso = grid.fit(X_train,y_train).best_params_
print(best_lasso)
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

pipe_lasso = [('scaler', RobustScaler()),
             ('clf', Lasso(alpha = best_lasso['clf__alpha']))]
clf_lasso = Pipeline(pipe_lasso)
clf_lasso.fit(X_train,y_train)
lasso_preds = clf_lasso.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(lasso_preds))))
print(r2_score(transformer(y_test),transformer(lasso_preds)))

{'clf__alpha': 0.001}
   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.896585        0.007982          0.948758         0.001428
1         0.896350        0.006087          0.917371         0.003174
2         0.707029        0.023850          0.712811         0.011790
3        -0.011356        0.008091          0.000000         0.000000
4        -0.011356        0.008091          0.000000         0.000000
0   -0.052173
1   -0.021021
2   -0.005782
3   -0.011356
4   -0.011356
dtype: float64
0.11419909330804812
0.9330587801563293


In [11]:
pipe_lasso = [('scaler', RobustScaler()),
             ('clf', Lasso(alpha = best_lasso['clf__alpha']))]
clf_lasso = Pipeline(pipe_lasso)
scores = cross_validate(clf_lasso, X_train, y_train, scoring = 'r2', return_train_score=True)
print(scores['train_score']-scores['test_score'])
print(np.std(scores['train_score']-scores['test_score']))

[0.0472431  0.01810335 0.01352342]
0.014933621567095809


In [12]:
pipe_lasso = [('scaler', StandardScaler()),
             ('clf', LassoCV())]
clf_lasso = Pipeline(pipe_lasso)
params = {
         'clf__eps': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(clf_lasso, param_grid = params, cv = 3,return_train_score = True)
best_lasso = grid.fit(X_train,y_train).best_params_

print(best_lasso)
print()
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_lasso.fit(X_train,y_train)
lasso_preds = clf_lasso.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(lasso_preds))))
print(r2_score(transformer(y_test),transformer(lasso_preds)))

{'clf__eps': 0.0001}

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.903735        0.007802          0.938748         0.002718
1         0.903662        0.007755          0.938703         0.003153
2         0.903662        0.007755          0.938703         0.003153
0   -0.035013
1   -0.035041
2   -0.035041
dtype: float64
0.11466676747456364
0.9315844957002128


In [13]:
pipe_elastic = [('scaler', RobustScaler()),
             ('clf', ElasticNetCV())]
clf_elastic = Pipeline(pipe_elastic)
params = {'clf__n_alphas': [1,100,500]}
grid = GridSearchCV(clf_elastic, param_grid = params, cv = 5, return_train_score=True)
best_elastic = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_elastic.fit(X_train,y_train)
elastic_preds = clf_elastic.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(elastic_preds))))
print(r2_score(transformer(y_test),transformer(elastic_preds)))

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.895402         0.00681          0.909704         0.001928
1         0.895402         0.00681          0.909704         0.001928
2         0.895402         0.00681          0.909704         0.001928
0   -0.014302
1   -0.014302
2   -0.014302
dtype: float64
0.1217141476956355
0.9221578102806048


In [14]:
pipe_elastic = [('scaler', RobustScaler()),
             ('clf', ElasticNetCV(n_alphas = best_elastic['clf__n_alphas']))]
clf_elastic = Pipeline(pipe_elastic)
scores = cross_validate(clf_elastic, X_train, y_train, scoring = 'r2', return_train_score=True)
print(scores['train_score']-scores['test_score'])
print(np.std(scores['train_score']-scores['test_score']))

[0.03215956 0.01620559 0.00400211]
0.011529169796845113


In [15]:
pipe_ridge = [('scaler', RobustScaler()),
             ('clf', RidgeCV())]
clf_ridge = Pipeline(pipe_ridge)
params = {'clf__cv': [3,5]}
grid = GridSearchCV(clf_ridge, param_grid = params, cv = 5, return_train_score=True)
best_ridge = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

clf_ridge.fit(X_train,y_train)
ridge_preds = clf_ridge.predict(X_test)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(ridge_preds))))
print(r2_score(transformer(y_test),transformer(ridge_preds)))

   mean_test_score  std_test_score  mean_train_score  std_train_score
0         0.909231        0.006662          0.940464         0.001335
1         0.909231        0.006662          0.940464         0.001335
0   -0.031232
1   -0.031232
dtype: float64
0.11650900606033948
0.930053558349081


In [16]:
best_ridge['clf__cv']

3

In [17]:
pipe_ridge = [('scaler', RobustScaler()),
             ('clf', RidgeCV(cv = best_ridge['clf__cv']))]
clf_ridge = Pipeline(pipe_ridge)
scores = cross_validate(clf_ridge, X_train, y_train, scoring = 'r2', return_train_score=True)
print(scores['train_score']-scores['test_score'])
print(np.std(scores['train_score']-scores['test_score']))

[0.05626941 0.03339184 0.02572749]
0.01297405507175217


In [18]:
# creating our transformer to deal with numerical data, have already dummied categorical variables so we don't need to address
# those variables. In some cases, we would want to deal with the skewed variables by using the PowerTransformer() function from
# sci-kit learn, which uses the Yeo-Johnson method for normalizing skewed datasets. However, for the sake of using a single
# scaler/transformer, I use the robust scaler which helps with reducing the impact of outliers (which is particularly beneficial
# for the use of the Lasso and ElasticNet algorithms), as well as reducing the skew of the variables.

# list of highly skewed numerical variables
numerical = list(X.select_dtypes(include = ['int64', 'float64']).columns)
# skewed = X[numerical].skew(axis=0).reset_index()
# skewed_vars = list(skewed[abs(skewed[0])>2]['index'])


# making the transformers
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'median')),
                                ('scaler', RobustScaler())])
# skewed_transformer = Pipeline(steps = [('skew_scaler', PowerTransformer())])


# preprocessing function to be implemented into the pipeline
preprocess = ColumnTransformer(transformers=[('num', numerical_transformer, numerical)])

In [19]:
pipe_rf = [('preprocess', preprocess),
             ('clf', RandomForestRegressor())]
clf_rf = Pipeline(pipe_rf)
params = {'clf__max_depth': range(5,26,5),
         'clf__max_features': range(5,20,5),
          'clf__n_estimators': [10,15,20,25]}
grid = GridSearchCV(clf_rf, param_grid = params, cv = 5, return_train_score=True)
best_rf = grid.fit(X_train,y_train).best_params_
results = pd.DataFrame(grid.cv_results_)
print(results[['mean_test_score','std_test_score','mean_train_score','std_train_score']])
print(results['mean_test_score']-results['mean_train_score'])

    mean_test_score  std_test_score  mean_train_score  std_train_score
0          0.828215        0.017562          0.875399         0.007364
1          0.817103        0.009221          0.880399         0.004258
2          0.827053        0.011358          0.885590         0.003371
3          0.829639        0.011203          0.884693         0.004877
4          0.834234        0.021144          0.895071         0.002705
5          0.833713        0.012300          0.898321         0.004308
6          0.839345        0.013686          0.897966         0.003444
7          0.836889        0.012665          0.901731         0.002889
8          0.835346        0.015318          0.903685         0.003744
9          0.840393        0.012621          0.906239         0.005376
10         0.843796        0.011954          0.906259         0.007173
11         0.841718        0.007902          0.905979         0.003456
12         0.850543        0.015531          0.961402         0.001779
13    

In [20]:
print(best_rf)

{'clf__max_depth': 15, 'clf__max_features': 15, 'clf__n_estimators': 25}


In [21]:
pipe_rf = [('scaler', preprocess),
             ('clf', RandomForestRegressor(max_depth = best_rf['clf__max_depth'],
                                           max_features = best_rf['clf__max_features'],
                                           n_estimators = best_rf['clf__n_estimators']))]
clf_elastic = Pipeline(pipe_rf)
scores = cross_validate(clf_elastic, X_train, y_train, scoring = 'r2', return_train_score=True)
print('Train Scores')
print(scores['train_score'])
print()
print('Test Scores')
print(scores['test_score'])
print()
print('Train - Test Scores')
scores['train_score']-scores['test_score']

print()
print('Appears to be some overfitting with the random forest regressor, however, we will still try to use in our stacked model')

Train Scores
[0.97897937 0.97760002 0.97982132]

Test Scores
[0.86779439 0.85884143 0.85965356]

Train - Test Scores

Appears to be some overfitting with the random forest regressor, however, we will still try to use in our stacked model


In [22]:
# Overall, appears our models are doing well with not too high of variance and low bias. Going to create a stacked model with
# all four models

# going to reshuffle the train-test splot
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)

In [23]:
# setting up our first level predictions
clf_lasso = LassoCV(cv=5)
clf_elastic = ElasticNetCV(cv=5)
clf_ridge = RidgeCV(cv=5)

classifiers = [clf_lasso, clf_elastic, clf_ridge]
predictions = []
pred_cols = []

for x in classifiers:
    pipe_steps = [('preprocess', preprocess),
                 ('clf', x)]
    pipe = Pipeline(pipe_steps)
    pred_name = 'pred_' + str(x)[0: str(x).find('(')]
    pred_cols.append(pred_name)
    pipe.fit(X_train,y_train)
    predictions.append(transformer(pipe.predict(X_train)))
    
first_level_preds = pd.DataFrame({'pred_lasso': predictions[0],
                                 'pred_elastic': predictions[1],
                                 'pred_ridge': predictions[2]})

X_train_2nd = pd.concat([X_train.reset_index(),first_level_preds], axis=1).set_index(keys='index')

stacked = RidgeCV(cv = 5)
stacked.fit(X_train_2nd,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [24]:
# setting up our second level predictions

predictions = []
pred_cols = []

for x in classifiers:
    pipe_steps = [('preprocess', preprocess),
                 ('clf', x)]
    pipe = Pipeline(pipe_steps)
    pred_name = 'pred_' + str(x)[0: str(x).find('(')]
    pred_cols.append(pred_name)
    predictions.append(transformer(pipe.predict(X_test)))
    
test_level_preds = pd.DataFrame({'pred_lasso': predictions[0],
                                 'pred_elastic': predictions[1],
                                 'pred_ridge': predictions[2]})

X_test_1st = pd.concat([X_test.reset_index(),test_level_preds], axis=1).set_index(keys='index')

results = cross_val_score(stacked, X_train, y_train, scoring = 'r2', cv = 5)
print(results)
print(np.std(results))
print()

stacked_preds = stacked.predict(X_test_1st)
print(np.sqrt(mean_squared_log_error(transformer(y_test),transformer(stacked_preds))))
print(r2_score(transformer(y_test),transformer(stacked_preds)))

[0.8597938  0.91331485 0.91882004 0.86952702 0.94594065]
0.032175214648617834

0.11286465813262073
0.927311374211476


In [25]:
# There we go. As we can see, we were able to improve our model slightly by stacking our regressors. In the future, we could
# try to add some more regressors with high performance that were slightly less correlated with our other models to try improve
# the stacking process.

In [26]:
""" for if I decide to try with dropped variables, although no longer necessary 


drop_list = []
drop_list.append('Condition1')
drop_list.append('HasGarage')
drop_list.append('LotConfig')
drop_list.append('RoofStyle')
drop_list.append('BsmtFinType1')
drop_list.append('Functional')
drop_list.append('LandContour')
drop_list.append('GarageType')

for x in list(df.select_dtypes(include = ['int64', 'float64']).columns):
    if '*' in x:
        drop_list.append(x)
        
df = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)
df = df.drop(drop_list, axis=1)
df['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']

categorical = []
for x in list(df.columns):
    if 'Non' in x:
        categorical.append(x)
    elif 'Has' in x:
        categorical.append(x)
    elif 'SubClass' in x:
        categorical.append(x)
    elif 'SaleConditionNor' in x:
        categorical.append(x)
    elif 'LandSlopeGtl' in x:
        categorical.append(x)
    elif 'LotShapeReg' in x:
        categorical.append(x)
    elif 'HeatingGas' in x:
        categorical.append(x)      
    elif 'CentralAir' in x:
        categorical.append(x)
    elif 'MoSold' in x:
        categorical.append(x)
    elif 'YrSold' in x:
        categorical.append(x)
    
for x in list(df.select_dtypes(include='object').columns):
    categorical.append(x)
categorical = list(set(categorical))

# merging converted dfs

df[categorical] = df[categorical].astype('object')
df_num = df.drop(categorical, axis = 1)
df = pd.concat([df[categorical],df_num], axis=1)

# creating the X and y variables

X = df.drop(['LogSale','SalePrice'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['LogSale']

numerical = list(X.select_dtypes(include = ['int64', 'float64']).columns)
skewed = X[numerical].skew(axis=0).reset_index()
skewed_vars = list(skewed[abs(skewed[0])>2]['index'])

# creating test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33)"""

" for if I decide to try with dropped variables, although no longer necessary \n\n\ndrop_list = []\ndrop_list.append('Condition1')\ndrop_list.append('HasGarage')\ndrop_list.append('LotConfig')\ndrop_list.append('RoofStyle')\ndrop_list.append('BsmtFinType1')\ndrop_list.append('Functional')\ndrop_list.append('LandContour')\ndrop_list.append('GarageType')\n\nfor x in list(df.select_dtypes(include = ['int64', 'float64']).columns):\n    if '*' in x:\n        drop_list.append(x)\n        \ndf = pd.read_csv('ml_ready.csv').drop(['Unnamed: 0', 'Unnamed: 0.1', 'GrLivAreaNegSq', 'TotalBsmtSFNegSq'], axis=1)\ndf = df.drop(drop_list, axis=1)\ndf['YearBuiltSq'] = df['YearBuilt']*df['YearBuilt']\n\ncategorical = []\nfor x in list(df.columns):\n    if 'Non' in x:\n        categorical.append(x)\n    elif 'Has' in x:\n        categorical.append(x)\n    elif 'SubClass' in x:\n        categorical.append(x)\n    elif 'SaleConditionNor' in x:\n        categorical.append(x)\n    elif 'LandSlopeGtl' in x:\n 