In [51]:
import numpy as np
import pickle
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import ElasticNet, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from load_data import load_and_split_data
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the workbook through the project helper and bind its first six slices, in
# order, to the names used by the rest of the notebook.
# NOTE(review): presumably these are the train/test features followed by the
# train/test values of the two targets -- confirm against load_and_split_data.
data_slices = load_and_split_data("ENB2012_data.xlsx")
X_train, X_test, Y1_train, Y1_test, Y2_train, Y2_test = (
    data_slices[position] for position in range(6)
)

# Join the two targets column-wise so that one multi-output model can be fit on both.
Y_train, Y_test = (
    pd.concat(target_pair, axis=1)
    for target_pair in ((Y1_train, Y2_train), (Y1_test, Y2_test))
)

#### Multitarget regression trees

In [58]:
# Grid-search a multi-output decision tree with k-fold cross-validation.
kfolds = 5
RANDOM_STATE = 0  # fixed seed: tree fitting permutes features, so unseeded runs can differ
scaler = StandardScaler()
scoring = 'neg_mean_squared_error'  # negated MSE, so a score closer to zero is better
# make_pipeline names every step after its lower-cased class name; that name is
# the prefix GridSearchCV expects on the parameter keys below.
model_name = "DecisionTreeRegressor".lower()
# NOTE(review): tree splits do not depend on feature scale, so the scaling step
# presumably exists only for parity with other model pipelines -- confirm.
pipeline = make_pipeline(scaler, DecisionTreeRegressor(random_state=RANDOM_STATE))

param_grid = {
    model_name + '__max_depth': np.arange(2, 12, 1),
    model_name + '__min_samples_split': np.arange(2, 12, 1),
    model_name + '__min_samples_leaf': np.arange(1, 12, 1),
}
grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=kfolds)

# Y_train holds both targets, so every candidate is scored on the joint error.
grid_search.fit(X_train, Y_train)
print(grid_search.best_score_)
grid_search.best_params_


-2.147313952599276


{'decisiontreeregressor__max_depth': 7,
 'decisiontreeregressor__min_samples_leaf': 5,
 'decisiontreeregressor__min_samples_split': 6}

In [59]:
# Score the refit best pipeline on the training and the held-out split.
# NOTE: `grid_search` is rebound by every search cell, so run this cell directly
# after the decision-tree search.
decisiontree_model = grid_search.best_estimator_
model = decisiontree_model  # original name, kept as an alias for any cell that still uses it

# mean_squared_error takes (y_true, y_pred) in that order.
decisiontree_train_mse = mean_squared_error(Y_train, decisiontree_model.predict(X_train))
decisiontree_test_mse = mean_squared_error(Y_test, decisiontree_model.predict(X_test))
print(decisiontree_train_mse)
print(decisiontree_test_mse)

1.4754754957735168
1.7452531402087217


#### ExtraTreeRegressor

In [49]:
# Grid-search a multi-output extremely randomized tree with k-fold cross-validation.
kfolds = 5
RANDOM_STATE = 0  # fixed seed: this estimator draws random splits, so unseeded runs differ
scaler = StandardScaler()
scoring = 'neg_mean_squared_error'  # negated MSE, so a score closer to zero is better
# make_pipeline names every step after its lower-cased class name; that name is
# the prefix GridSearchCV expects on the parameter keys below.
model_name = "ExtraTreeRegressor".lower()
# NOTE(review): tree splits do not depend on feature scale, so the scaling step
# presumably exists only for parity with other model pipelines -- confirm.
pipeline = make_pipeline(scaler, ExtraTreeRegressor(random_state=RANDOM_STATE))

param_grid = {
    model_name + '__max_depth': np.arange(2, 12, 1),
    model_name + '__min_samples_split': np.arange(2, 12, 1),
    model_name + '__min_samples_leaf': np.arange(1, 12, 1),
}
grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=kfolds)

# Y_train holds both targets, so every candidate is scored on the joint error.
grid_search.fit(X_train, Y_train)
print(grid_search.best_score_)
grid_search.best_params_

-2.0379360109254794


{'extratreeregressor__max_depth': 7,
 'extratreeregressor__min_samples_leaf': 2,
 'extratreeregressor__min_samples_split': 7}

In [50]:
# Score the refit best pipeline on the training and the held-out split.
# NOTE: `grid_search` is rebound by every search cell, so run this cell directly
# after the extra-tree search.
extratree_model = grid_search.best_estimator_

# mean_squared_error takes (y_true, y_pred) in that order.
extratree_train_mse = mean_squared_error(Y_train, extratree_model.predict(X_train))
extratree_test_mse = mean_squared_error(Y_test, extratree_model.predict(X_test))
print(extratree_train_mse)
print(extratree_test_mse)

1.6278476057645384
1.8239086909902256


#### RandomForestRegressor

In [52]:
# Grid-search a multi-output random forest with k-fold cross-validation.
kfolds = 5
RANDOM_STATE = 0  # fixed seed: bootstrap sampling makes unseeded forests differ run to run
scaler = StandardScaler()
scoring = 'neg_mean_squared_error'  # negated MSE, so a score closer to zero is better
# make_pipeline names every step after its lower-cased class name; that name is
# the prefix GridSearchCV expects on the parameter keys below.
model_name = "RandomForestRegressor".lower()
# NOTE(review): tree splits do not depend on feature scale, so the scaling step
# presumably exists only for parity with other model pipelines -- confirm.
pipeline = make_pipeline(scaler, RandomForestRegressor(random_state=RANDOM_STATE))

# min_samples_leaf is not searched here and stays at the estimator default.
# NOTE(review): presumably dropped to keep the forest search affordable -- confirm.
param_grid = {
    model_name + '__max_depth': np.arange(2, 8, 1),
    model_name + '__min_samples_split': np.arange(2, 8, 1),
}
grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=kfolds)

# Y_train holds both targets, so every candidate is scored on the joint error.
grid_search.fit(X_train, Y_train)
print(grid_search.best_score_)
grid_search.best_params_

-1.969110008910129


{'randomforestregressor__max_depth': 6,
 'randomforestregressor__min_samples_split': 3}

In [41]:
# Score the refit best pipeline on the training and the held-out split.
# NOTE: `grid_search` is rebound by every search cell, so run this cell directly
# after the random-forest search.
randomforest_model = grid_search.best_estimator_

# mean_squared_error takes (y_true, y_pred) in that order.
randomforest_train_mse = mean_squared_error(Y_train, randomforest_model.predict(X_train))
randomforest_test_mse = mean_squared_error(Y_test, randomforest_model.predict(X_test))
print(randomforest_train_mse)
print(randomforest_test_mse)

1.3116021685324841
1.5092382933334154


In [53]:
# Train the random forest on all data and pickle it for export to the server.

RANDOM_STATE = 0  # fixed seed so the shuffle and the exported forest are reproducible
# Hyperparameters are hard-coded to the grid-search winner reported above
# (max_depth=6, min_samples_split=3); update them if the search is re-run.
# `pipeline` is deliberately not rebound here, so the search pipeline stays intact.
final_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(max_depth=6, min_samples_split=3, random_state=RANDOM_STATE),
)
df = pd.read_excel("ENB2012_data.xlsx")
# shuffle data
df = df.sample(frac=1, random_state=RANDOM_STATE)
# The last two columns are used as the targets; every other column is a feature.
train_cols = list(df.columns)[:-2]
test_cols = list(df.columns)[-2:]  # target columns, despite the name
X = df[train_cols]
Y = df[test_cols]

final_pipeline.fit(X, Y)
# NOTE(review): "randomforrest.p" looks misspelled; it is left unchanged because
# whatever loads the file presumably expects this exact name -- confirm.
with open("randomforrest.p", "wb") as f:
    pickle.dump(final_pipeline, f)

In [57]:
# Smoke-test the exported pipeline on a single row of eight placeholder values.
sample_row = [list(range(1, 9))]
final_pipeline.predict(sample_row)

array([[27.69249294, 29.16560986]])