In [217]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
from matplotlib import cm

In [218]:
#Load in the data
X = pd.read_csv("../Data/trainX.csv")
y = pd.read_csv("../Data/trainY.csv")

In [219]:
# create a 75/25 stratified split of the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7, test_size=0.25)

In [220]:
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (2249, 44) (2249, 1)
Test set: (750, 44) (750, 1)


In [None]:
# This is the started point you need and can input into models from here, 
# let me know if you have any questions or need any explanation/help!

In [221]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    if do_probabilities:
        pred = fitted_model.predict_proba(X_test_data)
    else:
        pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,make_scorer
from sklearn.ensemble import RandomForestRegressor
from hyperopt import tpe,hp,Trials
from hyperopt.fmin import fmin

In [20]:
import xgboost as xgb
from sklearn.metrics import r2_score,mean_squared_error
Xgb=xgb.XGBRegressor(random_state=1)
Xgb.fit(X_train,y_train.values.ravel())
pred_xgb=Xgb.predict(X_test)
score_xgb=mean_squared_error(y_test.values,pred_xgb)
print(score_xgb)

4.867200796560903


In [21]:
# Calculate the absolute errors
errors = abs(pred_xgb - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.58 %


In [26]:
seed=5
def objective2(params):
    est=int(params['n_estimators'])
    md=int(params['max_depth'])
    learning=params['learning_rate']
    
    
    model=xgb.XGBRegressor(n_estimators=est,max_depth=md,learning_rate=learning)
    model.fit(X_train,y_train.values.ravel())
    pred=model.predict(X_test)
    score=mean_squared_error(y_test.values,pred)
    return score

def optimize2(trial):
    params={'n_estimators':hp.uniform('n_estimators',100,500),
           'max_depth':hp.uniform('max_depth',5,20),
           'learning_rate':hp.uniform('learning_rate',0.01,0.1)}
    best2=fmin(fn=objective2,space=params,algo=tpe.suggest,trials=trial,max_evals=500,rstate=np.random.RandomState(seed))
    return best2

trial2=Trials()
best2=optimize2(trial2)

100%|██████████| 500/500 [12:34<00:00,  1.51s/trial, best loss: 4.389163945541297]


In [27]:
best2

{'learning_rate': 0.03794177053324573,
 'max_depth': 5.672380425486769,
 'n_estimators': 157.75615956646732}

In [28]:
xgb_opt=xgb.XGBRegressor(n_estimators=214,max_depth=10,learning_rate=0.042467)
xgb_opt.fit(X_train,y_train.values.ravel())
pred_xgb_opt=xgb_opt.predict(X_test)
score_xgb_opt=mean_squared_error(y_test,pred_xgb_opt)
print(score_xgb_opt)

4.601330729994544


In [29]:
# Calculate the absolute errors
errors = abs(pred_xgb_opt - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.94 %


In [33]:
rfr=RandomForestRegressor(random_state=1)
rfr.fit(X_train,y_train.values.ravel())
pred_rfr=rfr.predict(X_test)
score_rfr=mean_squared_error(pred_rfr,y_test.values)
print(score_rfr)

4.568202727217344


In [34]:
# Calculate the absolute errors
errors = abs(pred_rfr - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 73.24 %


In [35]:
seed=2
def objective(params):
    est=int(params['n_estimators'])
    md=int(params['max_depth'])
    msl=int(params['min_samples_leaf'])
    mss=int(params['min_samples_split'])
    model=RandomForestRegressor(n_estimators=est,max_depth=md,min_samples_leaf=msl,min_samples_split=mss)
    model.fit(X_train,y_train.values.ravel())
    pred=model.predict(X_test)
    score=mean_squared_error(y_test.values,pred)
    return score

def optimize(trial):
    params={'n_estimators':hp.uniform('n_estimators',100,500),
           'max_depth':hp.uniform('max_depth',5,20),
           'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
           'min_samples_split':hp.uniform('min_samples_split',2,6)}
    best=fmin(fn=objective,space=params,algo=tpe.suggest,trials=trial,max_evals=500,rstate=np.random.RandomState(seed))
    return best

trial=Trials()
best=optimize(trial)

100%|██████████| 500/500 [45:48<00:00,  5.50s/trial, best loss: 4.430056804266973]  


In [36]:
best

{'max_depth': 11.54615400621966,
 'min_samples_leaf': 1.3536205196725086,
 'min_samples_split': 2.87139419158255,
 'n_estimators': 115.45548646923564}

In [45]:
rfr_opt=RandomForestRegressor(n_estimators=115,max_depth=11,min_samples_split=2,min_samples_leaf=1)
rfr_opt.fit(X_train,y_train.values.ravel())
pred_rfr_opt=rfr_opt.predict(X_test)
score_rfr_opt=mean_squared_error(y_test.values,pred_rfr_opt)
print(score_rfr_opt)

4.441481538063418


In [46]:
# Calculate the absolute errors
errors = abs(pred_rfr_opt - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 73.27 %


## Random Forest

In [222]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns

In [223]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [224]:
rfr = RandomForestRegressor(n_estimators = 100)
rfr.fit(X_train,y_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [225]:
y_pred = rfr.predict(X_test)
y_pred[:20]

array([12.65271692, 13.81703962, 15.01293147, 15.63946223, 14.7063291 ,
       17.3729927 , 17.80834941, 17.8710349 , 14.69183986, 13.77477398,
       15.4627371 , 17.77228809, 16.50681529, 17.42377862, 17.17158779,
       14.54333621, 17.95255732, 13.82473269, 17.39537305, 18.15559177])

In [226]:
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
rmse

2.1397844829276282

In [227]:
# Calculate the absolute errors
errors = abs(y_pred - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 73.18 %


In [228]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train.values.ravel())
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min


In [None]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(base_model, X_test, y_test.values)

best_model = grid_search.best_estimator_
best_accuracy = evaluate(best_model, X_test, y_test.values)

print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 2.8720 degrees.
Accuracy = 73.01%.
Model Performance
Average Error: 2.6513 degrees.
Accuracy = 74.30%.
Improvement of 1.78%.

## Extra Trees

In [131]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std

In [None]:
base_model = ExtraTreesRegressor()
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(base_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
base_model.fit(X_train, y_train.values.ravel());

In [None]:
# Use the forest's predict method on the test data
predictions = base_model.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test.values)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
et = ExtraTreesRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = et, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test.values)

base_accuracy = evaluate(base_model, X_test, y_test.values)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [None]:
best_grid.feature_importances_

In [None]:
model = ExtraTreesRegressor()
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-model.best_score_))
print(model.best_params_)

In [None]:
base_model = ExtraTreesRegressor()
base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(base_model, X_test, y_test.values)

best_grid = model.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 2.8057 degrees.
Accuracy = 73.40%.
Model Performance
Average Error: 2.5961 degrees.
Accuracy = 74.66%.
Improvement of 1.71%.

## XGBoost

In [201]:
from numpy import loadtxt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

In [202]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [203]:
mean_train = np.mean(y_train.values)
# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train
# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 2.25


In [204]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [205]:
params['eval_metric'] = "mae"
num_boost_round = 999

In [206]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:10.81115
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:7.63496
[2]	Test-mae:5.43559
[3]	Test-mae:3.95736
[4]	Test-mae:2.97868
[5]	Test-mae:2.36992
[6]	Test-mae:1.97631
[7]	Test-mae:1.73865
[8]	Test-mae:1.60013
[9]	Test-mae:1.52662
[10]	Test-mae:1.48291
[11]	Test-mae:1.46120
[12]	Test-mae:1.45575
[13]	Test-mae:1.45888
[14]	Test-mae:1.45477
[15]	Test-mae:1.45569
[16]	Test-mae:1.44989
[17]	Test-mae:1.44719
[18]	Test-mae:1.44699
[19]	Test-mae:1.44974
[20]	Test-mae:1.44439
[21]	Test-mae:1.44673
[22]	Test-mae:1.44293
[23]	Test-mae:1.44529
[24]	Test-mae:1.44520
[25]	Test-mae:1.44413
[26]	Test-mae:1.44855
[27]	Test-mae:1.44819
[28]	Test-mae:1.44216
[29]	Test-mae:1.44218
[30]	Test-mae:1.44308
[31]	Test-mae:1.43975
[32]	Test-mae:1.43715
[33]	Test-mae:1.43600
[34]	Test-mae:1.43859
[35]	Test-mae:1.43994
[36]	Test-mae:1.44010
[37]	Test-mae:1.43969
[38]	Test-mae:1.43891
[39]	Test-mae:1.43759
[40]	Test-mae:1.44420
[41]	Test-mae:1.44620
[42]	Test-mae:1.44615
[43]	Te

In [207]:
y_pred = model.predict(dtest)
# Calculate the absolute errors
errors = abs(y_pred - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.73 %


In [None]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

In [None]:
cv_results['test-mae-mean'].min()

In [None]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [None]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
params['max_depth'] = 9
params['min_child_weight'] = 7

In [None]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [None]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
params['subsample'] = .9
params['colsample_bytree'] = 0.7

In [None]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv( params,dtrain, num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['mae'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

In [None]:
params['eta'] = .01

In [None]:
params

In [None]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

In [None]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

In [None]:
mean_absolute_error(best_model.predict(dtest), y_test)

In [None]:
y_pred = model.predict(dtest)
# Calculate the absolute errors
errors = abs(y_pred - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [118]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [119]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)

In [120]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 5.885185


In [None]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [None]:
cv_results.head()

In [None]:
print((cv_results["test-rmse-mean"]).tail(1))

In [None]:
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [None]:
import matplotlib.pyplot as plt

xgb.plot_tree(xg_reg,num_trees=0)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()

In [None]:
xgb.plot_importance(xg_reg)
plt.rcParams['figure.figsize'] = [10, 10]
plt.show()

In [212]:
model = xgb.XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 400],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'max_depth': [1, 5, 10, 15],
    'reg_alpha': [1.2, 1.3, 1.4, 1.5],
    'reg_lambda': [ 1.2, 1.3],
    'subsample': [0.9, 1.0]
}

model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(np.sqrt(-model.best_score_))
print(model.best_params_)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.5s


KeyboardInterrupt: 

In [None]:
base_model = xgb.XGBRegressor()
base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(base_model, X_test, y_test.values)

best_grid = model.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [None]:
Model Performance
Average Error: 2.9210 degrees.
Accuracy = 72.58%.
Model Performance
Average Error: 2.8031 degrees.
Accuracy = 73.34%.
Improvement of 1.04%.

## Gradient Boosting

In [135]:
# gradient boosting for regression in scikit-learn
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
# evaluate the model
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
print('Prediction: %.3f' % yhat[0])

MAE: -1.414 (0.116)


  y = column_or_1d(y, warn=True)


Prediction: 13.644


In [136]:
# Calculate the absolute errors
errors = abs(yhat - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 73.25 %.


In [137]:
from sklearn.experimental import enable_hist_gradient_boosting

In [138]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot

# evaluate the model
model = HistGradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)

yhat = model.predict(X_test)
print('Prediction: %.3f' % yhat[0])

MAE: -1.425 (0.113)


  y = column_or_1d(y, warn=True)


Prediction: 13.551


In [139]:
# Calculate the absolute errors
errors = abs(yhat - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 72.88 %.


In [146]:
model = HistGradientBoostingRegressor()
model.get_params()

{'l2_regularization': 0.0,
 'learning_rate': 0.1,
 'loss': 'least_squares',
 'max_bins': 255,
 'max_depth': None,
 'max_iter': 100,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'n_iter_no_change': None,
 'random_state': None,
 'scoring': None,
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

## LightGBM

In [61]:
# lightgbm for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot

# evaluate the model
model = LGBMRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = LGBMRegressor()
model.fit(X_train, y_train)
# make a single prediction
row = [[2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381, 0.87616892, -0.50446586, 0.23009474, 0.76201118]]
yhat = model.predict(X_test)
print('Prediction: %.3f' % yhat[0])

MAE: -1.416 (0.116)
Prediction: 13.543


In [64]:
model = LGBMRegressor()
model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [62]:
mse = mean_squared_error(y_test,yhat)
rmse = np.sqrt(mse)
rmse

2.1904772793411964

In [63]:
# Calculate the absolute errors
errors = abs(yhat - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.92 %


In [171]:
lgbm = LGBMRegressor()
lgbm

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [189]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [100, 200, 400],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'max_depth': [0, 5,],
    'reg_alpha': [0.8, 0.9, 1.0, 1.1],
    'reg_lambda': [1.1, 1.2],
    'subsample': [0.4, 0.5, 0.7,  0.8, 0.9]

}
# Create a based model
lgbm = LGBMRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lgbm, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [190]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train.values.ravel())

grid_search.best_params_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   11.2s finished


{'colsample_bytree': 0.7,
 'max_depth': 5,
 'n_estimators': 100,
 'reg_alpha': 1.1,
 'reg_lambda': 1.2,
 'subsample': 0.5}

In [191]:
best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train.values.ravel())

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              importance_type='split', learning_rate=0.1, max_depth=5,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=1.1, reg_lambda=1.2, silent=True,
              subsample=0.5, subsample_for_bin=200000, subsample_freq=0)

In [192]:
base_model = LGBMRegressor()
base_model.fit(X_train, y_train.values.ravel())
base_accuracy = evaluate(base_model, X_test, y_test.values)

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test.values)


print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 2.8664 degrees.
Accuracy = 72.92%.
Model Performance
Average Error: 2.8347 degrees.
Accuracy = 73.13%.
Improvement of 0.29%.


In [None]:
Model Performance
Average Error: 2.8664 degrees.
Accuracy = 72.92%.
Model Performance
Average Error: 2.8347 degrees.
Accuracy = 73.13%.
Improvement of 0.29%.

## CatBoost

In [74]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot

# evaluate the model
model = CatBoostRegressor(verbose=0, n_estimators=100)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = CatBoostRegressor(verbose=0, n_estimators=100)
model.fit(X_train, y_train)


yhat = model.predict(X_test)
print('Prediction: %.3f' % yhat[0])

MAE: -1.412 (0.128)
Prediction: 13.673


In [75]:
mse = mean_squared_error(y_test,yhat)
rmse = np.sqrt(mse)
rmse

2.137768244044379

In [76]:
# Calculate the absolute errors
errors = abs(yhat - y_test.values)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Accuracy: 72.95 %


AttributeError: 'CatBoostRegressor' object has no attribute 'params'