In [134]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
import tqdm
import pandas as pd
import numpy as np
from sklearn import tree
from matplotlib import pyplot as plt

**Data manipulation**

In [121]:
df = pd.read_csv('/GT_stage_data.csv')
df['climbs'] = df['cat_4_climb'] + df['cat_3_climb'] + df['HC_climb']
df = df.drop(columns=['cat_4_climb', 'cat_3_climb', 'cat_2_climb', 'cat_1_climb', 'HC_climb'],)

**Model**

In [122]:
X = df.iloc[:, [0, 1, 2, 3, 4, 6]]
y = df.iloc[:, 5].values

X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

X_train, X_val, y_train, y_val   = train_test_split(X_train,
                                                   y_train,
                                                   test_size=0.2,
                                                   random_state=42)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)


(108, 6) (27, 6) (34, 6) (108,) (27,) (34,)


Use the data (remember to split your data into a train, validation, and test data). Using your training and validation data, optimize the parameters of your RF. How well does your optimized model perform on the test data?

Let us start by ensuring we can just run an RF without any optimization.

In [123]:
rf_current = ensemble.RandomForestRegressor()
rf_current.fit(X_train, y_train)
y_val_hat = rf_current.predict(X_val)
mse = mean_squared_error(y_val, y_val_hat)

print(f'RF with default settings has validation MSE of {mse}.')


RF with default settings has validation MSE of 0.25234814814814815.


In [124]:
n_estimators_list = [2, 4, 6]
min_samples_split_list = [5, 8, 10]
min_samples_leaf_list = [6, 9, 11]

results = []

for n_estimators in n_estimators_list:
    for min_samples_split in min_samples_split_list:
        for min_samples_leaf in min_samples_leaf_list:
            rf_current = ensemble.RandomForestRegressor(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                )
            rf_current.fit(X_train, y_train)
            y_val_hat = rf_current.predict(X_val)
            mse = mean_squared_error(y_val, y_val_hat)

            results.append([mse, n_estimators, min_samples_split, min_samples_leaf])

results = pd.DataFrame(results)
results.columns = ['MSE', 'n_estimators', 'min_samples_split', 'min_samples_leaf']
print(results)


         MSE  n_estimators  min_samples_split  min_samples_leaf
0   0.260994             2                  5                 6
1   0.207062             2                  5                 9
2   0.195636             2                  5                11
3   0.243083             2                  8                 6
4   0.252614             2                  8                 9
5   0.290898             2                  8                11
6   0.217411             2                 10                 6
7   0.193917             2                 10                 9
8   0.213867             2                 10                11
9   0.236886             4                  5                 6
10  0.215538             4                  5                 9
11  0.211549             4                  5                11
12  0.252981             4                  8                 6
13  0.209115             4                  8                 9
14  0.220743             4              

In [130]:
min_idx = results['MSE'].idxmin()
n_estimators = int(results.loc[min_idx]['n_estimators'])
min_samples_split = int(results.loc[min_idx]['min_samples_split'])
min_samples_leaf = int(results.loc[min_idx]['min_samples_leaf'])

In [132]:
X_test_val_combined = np.concatenate([X_test, X_val])
y_test_val_combined = np.concatenate([y_test, y_val])

rf_current = ensemble.RandomForestRegressor(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                )

rf_current.fit(X_train, y_train)
y_test_hat = rf_current.predict(X_test_val_combine)
mse = mean_squared_error(y_test_val_combined, y_test_hat)

print(f'RF with default settings has validation MSE of {mse}.')

RF with default settings has validation MSE of 0.2577940626747744.




In [136]:
importances = rf_current.feature_importances_
names = X_train.columns

feature_importance = pd.DataFrame(zip(names, importances),
                                  columns=['Feature', 'Importance'])
feature_importance = feature_importance.sort_values('Importance', ascending=False).reset_index(drop=True)

print(feature_importance[:6])



           Feature  Importance
0    Elevation_max    0.547015
1    Elevation_min    0.139200
2    End_Elevation    0.110687
3           climbs    0.110551
4  Start_Elevation    0.083590
5      Distance_Km    0.008956
