# Project Models - CART, Random Forest, Boosting

In [305]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, roc_curve, auc
from sklearn.utils import resample
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [177]:
def avg_error(y_pred, y_test):
    return np.average(abs((y_pred - y_test) / y_test))

In [178]:
def OSR2(model, X_test, y_test, y_train):
    
    y_pred = model.predict(X_test)
    SSE = np.sum((y_test - y_pred)**2)
    SST = np.sum((y_test - np.mean(y_train))**2)
                 
    return (1 - SSE/SST)

In [179]:
def get_metrics(y_predict, y_test):
    cm = confusion_matrix(y_test, y_predict)
    tn, fn, fp, tp = cm[0][0], cm[1][0], cm[0][1], cm[1][1]
    FPR = fp / (fp + tn)
    TPR = tp / (tp + fn)
    acc = (tp + tn) / (tp + tn + fp + fn)
    PRE = 0
    if (tp + fp != 0):
        PRE = tp / (tp + fp)
    return acc, TPR, FPR, PRE

In [176]:
ordered_data = pd.read_csv("../data.csv")
data = train_test_split(ordered_data)
train_data, test_data = data[0], data[1]

## Baseline

In [183]:
base_data = train_test_split(ordered_data[['Unnamed: 0', 'Previous Week Tesla Stock Price', 'Tesla Stock Price']])
base_data_train, base_data_test = base_data[0], base_data[1]

y_train_base, x_train_base = base_data_train['Tesla Stock Price'], base_data_train[['Previous Week Tesla Stock Price', 'Unnamed: 0']]
y_test_base, x_test_base = base_data_test['Tesla Stock Price'], base_data_test[['Previous Week Tesla Stock Price', 'Unnamed: 0']]

ols_base = sm.OLS(y_train_base, x_train_base).fit()
avg_error(ols_base.predict(x_test_base), y_test_base)

0.06746836296956246

## CART

In [352]:
y_train = train_data['Tesla Stock Price']
X_train = pd.get_dummies(train_data.drop(['Tesla Stock Price', 'since', 'until'], axis=1))

y_test = test_data['Tesla Stock Price']
X_test = pd.get_dummies(test_data.drop(['Tesla Stock Price', 'since', 'until'], axis=1))

In [353]:
grid_values = {'ccp_alpha': np.linspace(0, 0.001, 51)}

dtr = DecisionTreeRegressor(min_samples_leaf=5, min_samples_split=20, random_state=88)
cv = KFold(n_splits=5,random_state=1,shuffle=True) 
dtr_cv = GridSearchCV(dtr, param_grid=grid_values, scoring='r2', cv=cv, verbose=0)
dtr_cv.fit(X_train, y_train)
test_pred_cart, train_pred_cart = dtr_cv.predict(X_test), dtr_cv.predict(X_train)

In [354]:
# Model Evaluation
print('Cross-validated R2:', round(dtr_cv.best_score_, 5))
print('OSR2:', round(OSR2(dtr_cv, X_test, y_test, y_train), 5))
print('Average Percent Error: ' + str(avg_error(dtr_cv.predict(X_test), y_test)))

Cross-validated R2: 0.95473
OSR2: 0.95826
Average Percent Error: 0.07925061561883433


## RANDOM FORESTS

In [355]:
grid_values = {'max_features': np.linspace(1,5,5, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [500],
               'random_state': [88]} 

rf2 = RandomForestRegressor() 
cv = KFold(n_splits=5,random_state=333,shuffle=True) 
rf_cv = GridSearchCV(rf2, param_grid=grid_values, scoring='r2', cv=cv,verbose=2)
rf_cv.fit(X_train, y_train)
test_pred_rf, train_pred_rf = rf_cv.predict(X_test), rf_cv.predict(X_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END max_features=1, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=1, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=1, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=1, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=1, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=2, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=2, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=2, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=2, min_samples_leaf=5, n_estimators=500, random_state=88; total time=   0.6s
[CV] END max_features=2, min_samples_leaf=5, n_es

In [356]:
max_features = rf_cv.cv_results_['param_max_features'].data
R2_scores = rf_cv.cv_results_['mean_test_score']

In [357]:
print('Cross-validated R2:', round(rf_cv.best_score_, 5))
print('OSR2:', round(OSR2(rf_cv, X_test, y_test, y_train), 5))
print('Average Percent Error: ' + str(avg_error(rf_cv.predict(X_test), y_test)))

Cross-validated R2: 0.98535
OSR2: 0.97848
Average Percent Error: 0.05705805720101074


## GRADIENT BOOSTED TREES

In [358]:
reg = GradientBoostingRegressor(random_state=99)
reg.fit(X_train, y_train)
test_pred_reg, train_pred_reg = reg.predict(X_test), reg.predict(X_train)

In [359]:
print('OSR2:', round(OSR2(reg, X_test, y_test, y_train), 5))
print('Average Percent Error: ' + str(avg_error(reg.predict(X_test), y_test)))

OSR2: 0.99081
Average Percent Error: 0.06193615009596899


In [360]:
pd.DataFrame({'Feature' : X_train.columns, 
              'Importance score': 100*reg.feature_importances_}).round(1)

Unnamed: 0,Feature,Importance score
0,Unnamed: 0,68.3
1,S&P 500 Variance,0.0
2,Ford Stock Price,0.1
3,GM Stock Price,0.0
4,Toyota Stock Price,0.2
5,Nissan Stock Price,3.8
6,Tesla Wikipedia Page Views,0.5
7,Sentiment,0.0
8,Previous Week Tesla Stock Price,27.0


## Model Comparison

In [361]:
comparison_data = {'Decision Tree Regressor': ['{:.3f}'.format(OSR2(dtr_cv, X_test, y_test, y_train)),
                                               '{:.4f}'.format(mean_squared_error(y_test, dtr_cv.predict(X_test))),
                                               '{:.3f}'.format(mean_absolute_error(y_test, dtr_cv.predict(X_test)))],
                   'Random Forest': ['{:.3f}'.format(OSR2(rf_cv, X_test, y_test, y_train)),
                                     '{:.4f}'.format(mean_squared_error(y_test, rf_cv.predict(X_test))),
                                     '{:.3f}'.format(mean_absolute_error(y_test, rf_cv.predict(X_test)))], 
                   'Gradient Boosted Trees': ['{:.3f}'.format(OSR2(reg, X_test, y_test, y_train)),
                                              '{:.4f}'.format(mean_squared_error(y_test, reg.predict(X_test))),
                                              '{:.3f}'.format(mean_absolute_error(y_test, reg.predict(X_test)))]}

comparison_table = pd.DataFrame(data=comparison_data, index=['OSR2', 'Out-of-sample MSE', 'Out-of-sample MAE'])
comparison_table.style.set_properties(**{'font-size': '12pt',}).set_table_styles([{'selector': 'th', 'props': [('font-size', '10pt')]}])

Unnamed: 0,Decision Tree Regressor,Random Forest,Gradient Boosted Trees
OSR2,0.958,0.978,0.991
Out-of-sample MSE,140.2876,72.3227,30.8958
Out-of-sample MAE,3.517,2.494,1.788


## Ensemble Model Blending

In [362]:
y_train = train_data['Tesla Stock Price']
train = pd.DataFrame({'Tesla_Stock_Price': y_train, 'val_pred_cart': train_pred_cart, 'val_pred_rf': train_pred_rf, 'val_pred_reg': train_pred_reg})

y_test = test_data['Tesla Stock Price']
test = pd.DataFrame({'Tesla_Stock_Price': y_test, 'val_pred_cart': test_pred_cart, 'val_pred_rf': test_pred_rf, 'val_pred_reg': test_pred_reg})

In [363]:
ensemble_model = smf.ols(formula='Tesla_Stock_Price ~ val_pred_cart+val_pred_reg+val_pred_rf -1', data=train).fit()

In [364]:
print('Average Percent Error: ' + str(avg_error(ensemble_model.predict(test), y_test)))

Average Percent Error: 0.06261219111933379
