In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
from matplotlib import cm

In [2]:
#Load in the data
X = pd.read_csv("../Data/trainX.csv")
y = pd.read_csv("../Data/trainY.csv")

In [3]:
# create a 75/25 stratified split of the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7, test_size=0.25)

In [4]:
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (2249, 41) (2249, 1)
Test set: (750, 41) (750, 1)


In [14]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    if do_probabilities:
        pred = fitted_model.predict_proba(X_test_data)
    else:
        pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [20]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

## Random Forest

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns

In [27]:
rfr_base_model = RandomForestRegressor(random_state = 42)
rfr_base_model.fit(X_train, y_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [28]:
y_pred = rfr_base_model.predict(X_test)

In [29]:
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
rmse

2.148237538429347

In [30]:
# Calculate the absolute errors
errors = abs(y_pred - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 73.19 %


In [46]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rfr = RandomForestRegressor()
# Instantiate the grid search model
rfr, pred = algorithm_pipeline(X_train, X_test, y_train.values.ravel(), y_test.values, rfr, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-rfr.best_score_))
print(rfr.best_params_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 10.6min finished


2.254025478827444
{'bootstrap': True, 'max_depth': 80, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 1000}


In [47]:
print('Base Model:')
base_accuracy = evaluate(rfr_base_model, X_test, y_test.values)
print()
print('Model after Tuning:')
rfr_best_model = rfr.best_estimator_
best_accuracy = evaluate(rfr_best_model, X_test, y_test.values)

print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

Base Model:
Average Error: 2.8428 degrees.
Accuracy = 73.19%.

Model after Tuning:
Average Error: 2.6522 degrees.
Accuracy = 74.31%.
Improvement of 1.54%.


Base Model:
Model Performance
Average Error: 2.8720 degrees.
Accuracy = 73.01%.

Model after Tuning:
Model Performance
Average Error: 2.6519 degrees.
Accuracy = 74.33%.
Improvement of 1.81%.

## Extra Trees

In [48]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std

In [49]:
et_base_model = ExtraTreesRegressor()
et_base_model.fit(X_train, y_train.values.ravel());

In [50]:
y_pred = et_base_model.predict(X_test)

In [51]:

# Calculate the absolute errors
errors = abs(y_pred - y_test.values)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2.81


In [52]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 73.38 %.


In [76]:
et_model = ExtraTreesRegressor()
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

et_model, pred = algorithm_pipeline(X_train, X_test, y_train.values.ravel(), y_test.values, et_model, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-et_model.best_score_))
print(et_model.best_params_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  8.8min finished


2.3428684167217635
{'bootstrap': True, 'max_depth': 110, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}


In [77]:
base_model = ExtraTreesRegressor()
base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(base_model, X_test, y_test.values)

et_best_model = et_model.best_estimator_
best_accuracy = evaluate(et_best_model, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

Average Error: 2.8068 degrees.
Accuracy = 73.39%.
Average Error: 2.5956 degrees.
Accuracy = 74.66%.
Improvement of 1.73%.


Model Performance
Average Error: 2.8057 degrees.
Accuracy = 73.40%.
Model Performance
Average Error: 2.5961 degrees.
Accuracy = 74.66%.
Improvement of 1.71%.

## XGBoost

In [42]:
from numpy import loadtxt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

In [43]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [44]:
xgb_base_model = xgb.XGBRegressor()
xgb_base_model.fit(X_train, y_train.values.ravel())

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [60]:
xgb_model = xgb.XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 400],
    'colsample_bytree': [1.0, 1.1, 1.5],
    'max_depth': [1, 5, 100],
    'reg_alpha': [1.2, 1.3, 1.4],
    'reg_lambda': [ 1.2, 1.3],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_model, pred = algorithm_pipeline(X_train, X_test, y_train.values.ravel(), y_test.values, xgb_model, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-xgb_model.best_score_))
print(xgb_model.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 2080 tasks      | elapsed:  7.8min


2.1575986325294387
{'colsample_bytree': 1.0, 'max_depth': 1, 'n_estimators': 200, 'reg_alpha': 1.3, 'reg_lambda': 1.2, 'subsample': 0.9}


[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed:  7.8min finished


In [61]:
print('Base Model:')
base_accuracy = evaluate(xgb_base_model, X_test, y_test.values)
print()
print('Model after Tuning:')
xgb_best_model = xgb_model.best_estimator_
best_accuracy = evaluate(xgb_best_model, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))


Base Model:
Average Error: 2.9210 degrees.
Accuracy = 72.58%.

Model after Tuning:
Average Error: 2.8031 degrees.
Accuracy = 73.34%.
Improvement of 1.04%.


## LightGBM

In [62]:
# lightgbm for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot

# evaluate the model
lgbm_base_model = LGBMRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
lgbm_base_model = LGBMRegressor()
lgbm_base_model.fit(X_train, y_train)
# make a single prediction
row = [[2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381, 0.87616892, -0.50446586, 0.23009474, 0.76201118]]
yhat = lgbm_base_model.predict(X_test)
print('Prediction: %.3f' % yhat[0])

MAE: -1.416 (0.116)
Prediction: 13.543


In [64]:
mse = mean_squared_error(y_test,yhat)
rmse = np.sqrt(mse)
rmse

2.1904772793411964

In [65]:
# Calculate the absolute errors
errors = abs(yhat - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.92 %


In [68]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [100, 150],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'max_depth': [1, 2, 5],
    'reg_alpha': [1.0, 1.1, 1.2],
    'reg_lambda': [1.2, 1.3],
    'subsample': [0.3,0.35, 0.4]

}
# Create a based model
lgbm = LGBMRegressor()
lgbm, pred = algorithm_pipeline(X_train, X_test, y_train.values.ravel(), y_test.values, lgbm, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-lgbm.best_score_))
print(lgbm.best_params_)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 706 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 1272 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 2002 tasks      | elapsed:   56.3s


2.127612202661131
{'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 1.1, 'reg_lambda': 1.2, 'subsample': 0.3}


[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  1.1min finished


In [69]:
lgbm_base_model = LGBMRegressor()
lgbm_base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(lgbm_base_model, X_test, y_test.values)

lgbm_best_model = lgbm.best_estimator_
best_accuracy = evaluate(lgbm_best_model, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

Average Error: 2.8664 degrees.
Accuracy = 72.92%.
Average Error: 2.8347 degrees.
Accuracy = 73.13%.
Improvement of 0.29%.


Model Performance
Average Error: 2.8664 degrees.
Accuracy = 72.92%.
Model Performance
Average Error: 2.8347 degrees.
Accuracy = 73.13%.
Improvement of 0.29%.