# 03_model_exploration: Try different ML models on data

Date: 2022-06-13



## Load Packages and Data

In [37]:
import os
#os.chdir('..')
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [74]:
# Read in data sets.
# The cleaned_data/*.pkl files are loaded with pd.read_pickle; unpickling can
# execute arbitrary code, so only load pickles produced by this project.
training_cleaned = pd.read_pickle('cleaned_data/training_cleaned.pkl')
training_prices = pd.read_pickle('cleaned_data/training_prices.pkl')
# Raw test CSV; its 'Id' column is used further down to build the submission file.
test_orig = pd.read_csv('data/test.csv')
test = pd.read_pickle('cleaned_data/test_cleaned.pkl')

In [13]:
# Preview the first rows of the cleaned training features.
training_cleaned.head()

Unnamed: 0,x0_120,x0_160,x0_180,x0_190,x0_20,x0_30,x0_40,x0_45,x0_50,x0_60,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Remodeled
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0


In [14]:
# Preview the first training sale prices (the target, before the log transform).
training_prices.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [43]:
# (rows, columns) of the cleaned training features.
training_cleaned.shape

(1460, 317)

In [44]:
# Feature matrix as a plain NumPy array (column names are dropped).
X = training_cleaned.to_numpy()

# Log-transform the prices (base 10); predictions are mapped back with 10**y.
y = np.log10(training_prices)

## Try a couple of machine learning models 

### Decision Tree 

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

In [85]:
# Fix the seed so the fitted tree, and every score derived from it, is
# reproducible across kernel restarts. 42 matches the seed passed to
# xgboost.cv further down in this notebook.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X, y)

DecisionTreeRegressor()

In [86]:
# Score the fitted tree on the same data it was trained on (training error only).
housing_predictions = tree_reg.predict(X)
tree_mse = mean_squared_error(y_true=y, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

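This is the RMSE on the training data itself, so a value of 0 means the tree reproduces every training price exactly, which usually signals overfitting rather than a good model. At a maximum depth of 30, the training RMSE goes to 0. At 20, it is still non-zero.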

In [87]:
# 3-fold cross-validation. scikit-learn reports the negated MSE for this
# scorer, so flip the sign before taking the square root to get per-fold RMSE.
scores = cross_val_score(
    estimator=tree_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=3,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [88]:
# Mean RMSE over the 3 cross-validation folds (in log10-price units).
tree_rmse_scores.mean()

0.09031240435399633

In [89]:
# Spread of the per-fold RMSE values.
tree_rmse_scores.std()

0.0015265919063526064

In [75]:
# Predict on the test data.
# tree_reg was fitted on X = training_cleaned.to_numpy(), i.e. a plain array
# without column names, so pass the test features in the same form.
predictions = tree_reg.predict(np.asarray(test))

In [83]:
# function to put predicted data in submission format 
def submission_format(predictions, test_orig):
    """
    Build the submission table from log10-scale price predictions.

    predictions: array-like of predicted log10(SalePrice) values, e.g. the
        output of predict() from a fitted model. Lists, pandas Series and
        column vectors of shape (n, 1) are accepted and flattened to 1-D.
    test_orig: original test data; only its 'Id' column is used.

    returns: DataFrame with columns 'Id' and 'SalePrice', where SalePrice is
        10**prediction (undoing the log10 transform applied to the target).
        The frame keeps the index of test_orig['Id'].

    raises: ValueError if the number of predictions does not match the
        number of rows in test_orig.
    """
    # Normalise the input up front so the function does not depend on the
    # caller passing a NumPy array (the previous .tolist() call required one).
    log_prices = np.asarray(predictions, dtype=float).ravel()
    ids = test_orig['Id']

    if len(log_prices) != len(ids):
        raise ValueError(
            f"Got {len(log_prices)} predictions for {len(ids)} test rows"
        )

    # Values are assigned positionally, so an index carried by `predictions`
    # (e.g. a Series) cannot misalign prices with ids.
    predictions_df = pd.DataFrame({'Id': ids,
                                   'SalePrice': 10 ** log_prices})

    return predictions_df

In [84]:
# Convert the decision-tree predictions into the Id / SalePrice submission table
# and preview the first rows.
predictions_df = submission_format(predictions, test_orig)
predictions_df.head()

Unnamed: 0,Id,SalePrice
0,1461,128200.0
1,1462,155000.0
2,1463,190000.0
3,1464,175000.0
4,1465,184100.0


In [82]:
# Write the decision-tree submission without the DataFrame index column.
predictions_df.to_csv('results/prediction_dt.csv', index = False)

### Random Forest 

In [90]:
from sklearn.ensemble import RandomForestRegressor

In [92]:
# Random forests bootstrap rows and subsample features, so fix the seed to make
# the fit and the scores below reproducible. 42 matches the seed passed to
# xgboost.cv further down in this notebook.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X, y)

RandomForestRegressor()

In [94]:
# Training-set error of the random forest.
rf_predictions = forest_reg.predict(X)
rf_mse = mean_squared_error(y_true=y, y_pred=rf_predictions)
rf_rmse = np.sqrt(rf_mse)
# NOTE: the value displayed here is the MSE, not rf_rmse, so it is not on the
# same scale as the RMSE shown for the decision tree or the CV scores below.
rf_mse

0.0005255028133618755

In [95]:
# 3-fold cross-validation of the random forest; the scorer returns negated MSE,
# so negate it before the square root to obtain per-fold RMSE.
scores = cross_val_score(
    estimator=forest_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=3,
)
rf_rmse_scores = np.sqrt(np.negative(scores))

In [96]:
# Mean RMSE over the 3 cross-validation folds (in log10-price units).
rf_rmse_scores.mean()

0.0634144428784711

In [97]:
# Spread of the per-fold RMSE values.
rf_rmse_scores.std()

0.003281452313616673

### Gradient Boosted Decision Trees

In [113]:
import xgboost

In [114]:
# Fit XGBoost with its default hyperparameters, then predict on the training data.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X, y)
xgb_pred = xgb_reg.predict(X)

In [116]:
# Training-set error of the XGBoost model.
# NOTE: this is the MSE (no square root taken), unlike the RMSE values used
# for the cross-validation scores elsewhere in this notebook.
xgb_mse = mean_squared_error(y_true=y, y_pred=xgb_pred)
xgb_mse

2.361433594508727e-05

In [128]:
# Cross-validate with xgboost's native API, which takes the data as a DMatrix.
data_dmatrix = xgboost.DMatrix(data=X, label=y)
# Parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters
# NOTE(review): these are hand-picked settings, not necessarily those of the
# XGBRegressor() fitted above, so this table may not score that exact model.
param = {
    'max_depth': 30,
    'eta': 0.35,
    'objective': 'reg:squarederror',
}
xgboost.cv(param, dtrain=data_dmatrix, nfold=3, metrics='rmse', seed=42)

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3.075875,0.000787,3.075875,0.002966
1,2.003899,0.000535,2.003892,0.002586
2,1.307182,0.000381,1.30718,0.002615
3,0.854576,0.00038,0.854306,0.002388
4,0.560424,0.000389,0.561125,0.002476
5,0.369562,0.000437,0.371688,0.002874
6,0.245772,0.000253,0.250402,0.00217
7,0.165401,0.000402,0.173218,0.001266
8,0.113128,0.000387,0.126364,0.000505
9,0.078627,0.000315,0.099266,0.002146


### Support Vector Machines

In [98]:
from sklearn.svm import LinearSVR

**Linear SVM**

In [101]:
# Linear support vector regression on the log10 prices.
# NOTE(review): y is log10(price), so epsilon = 1.5 may exceed most of the spread
# in y; errors inside the epsilon tube are not penalized. Confirm this value is intended.
svm_reg = LinearSVR(epsilon = 1.5)
svm_reg.fit(X, y)

LinearSVR(epsilon=1.5)

In [102]:
# Training-set error of the linear SVM.
svm_predictions = svm_reg.predict(X)
svm_mse = mean_squared_error(y_true=y, y_pred=svm_predictions)
svm_rmse = np.sqrt(svm_mse)
# NOTE: the value displayed here is the MSE, not svm_rmse.
svm_mse

1.0932808836333117

In [103]:
# 3-fold cross-validation of the linear SVM; negate the (negative) MSE scores
# before the square root to obtain per-fold RMSE.
scores = cross_val_score(
    estimator=svm_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=3,
)
svm_rmse_scores = np.sqrt(np.negative(scores))

In [104]:
# Mean RMSE over the 3 cross-validation folds (in log10-price units).
svm_rmse_scores.mean()

1.0874242875562639

In [105]:
# Spread of the per-fold RMSE values.
svm_rmse_scores.std()

0.013885973880016009

**SVM Polynomial Kernel**

In [148]:
from sklearn.svm import SVR
# Polynomial-kernel SVR. This rebinds svm_reg, replacing the LinearSVR model
# fitted in the previous section.
svm_reg = SVR(kernel = "poly", degree = 5, C = 5, epsilon = 1)
svm_reg.fit(X, y)

SVR(C=5, degree=5, epsilon=1, kernel='poly')

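I tried changing the `degree`, `C`, and `epsilon` parameters, but none of them seems to affect the model fit. I need to better understand how SVM regression works. One thing to check: the target is log10(price), so `epsilon = 1` may be wider than most of the spread in `y`, in which case few points would incur any loss.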

In [149]:
# Training-set error of the polynomial-kernel SVR.
svm_predictions = svm_reg.predict(X)
svm_mse = mean_squared_error(y_true=y, y_pred=svm_predictions)
svm_rmse = np.sqrt(svm_mse)
# NOTE: the value displayed here is the MSE, not svm_rmse.
svm_mse

0.03020900002987626

In [141]:
# 3-fold cross-validation of the polynomial-kernel SVR (per-fold RMSE).
# NOTE(review): the recorded execution counts show this cell last ran before the
# SVR cell above was last run, so the stored scores may belong to different
# hyperparameters. Re-run top to bottom to confirm.
scores = cross_val_score(
    estimator=svm_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=3,
)
svm_rmse_scores = np.sqrt(np.negative(scores))

In [142]:
# Mean RMSE over the 3 cross-validation folds (in log10-price units).
svm_rmse_scores.mean()

0.17391681799442305

In [143]:
# Spread of the per-fold RMSE values.
svm_rmse_scores.std()

0.0046612603986081955

## Summary

Tree-based methods (random forest and GBM) look the most promising, so we will perform hyperparameter tuning on these next.