In [44]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble
import tqdm
import pandas as pd
import numpy as np
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


**Data manipulation**

In [51]:
df = pd.read_csv('/content/GT_stage_data.csv')
df['climbs'] = df['cat_4_climb'] + df['cat_3_climb'] + df['HC_climb']
df = df.drop(columns=['cat_4_climb', 'cat_3_climb', 'cat_2_climb', 'cat_1_climb', 'HC_climb'],)
df.head(1)


Unnamed: 0,Distance_Km,Start_Elevation,End_Elevation,Elevation_min,Elevation_max,Breakaway_win,climbs
0,162.5,101,55,24.0,245,0,2


**Model**

In [53]:
X = df.iloc[:, [0, 1, 2, 3, 4, 6]]
y = df.iloc[:, 5].values

X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

X_train, X_val, y_train, y_val   = train_test_split(X_train,
                                                   y_train,
                                                   test_size=0.2,
                                                   random_state=42)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

(108, 6) (27, 6) (34, 6) (108,) (27,) (34,)


Use the data (remember to split your data into a train, validation, and test data). Using your training and validation data, optimize the parameters of your RF. How well does your optimized model perform on the test data?

Let us start by ensuring we can just run an RF without any optimization.

In [50]:
rf_current = ensemble.RandomForestClassifier()
rf_current.fit(X_train, y_train)
y_val_hat = rf_current.predict(X_val)

accuracy_default = accuracy_score(y_val_hat, y_val)

print(f'DT with default settings achieved {round(accuracy_default * 100, 1)}% accuracy.')


DT with default settings achieved 59.3% accuracy.


In [54]:
n_estimators_list = [100, 500, 1000]
min_samples_split_list = [5, 8, 10]
min_samples_leaf_list = [6, 9, 11]

results = []

for n_estimators in n_estimators_list:
    for min_samples_split in min_samples_split_list:
        for min_samples_leaf in min_samples_leaf_list:
            rf_current = ensemble.RandomForestClassifier(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                )
            rf_current.fit(X_train, y_train)
            y_val_hat = rf_current.predict(X_val)
            accuracy = accuracy_score(y_val_hat, y_val)
            results.append([accuracy, n_estimators, min_samples_split, min_samples_leaf])

results = pd.DataFrame(results)
results.columns = ['accuracy', 'n_estimators', 'min_samples_split', 'min_samples_leaf']
print(results)


    accuracy  n_estimators  min_samples_split  min_samples_leaf
0   0.592593           100                  5                 6
1   0.629630           100                  5                 9
2   0.666667           100                  5                11
3   0.703704           100                  8                 6
4   0.666667           100                  8                 9
5   0.666667           100                  8                11
6   0.629630           100                 10                 6
7   0.666667           100                 10                 9
8   0.629630           100                 10                11
9   0.592593           500                  5                 6
10  0.666667           500                  5                 9
11  0.629630           500                  5                11
12  0.592593           500                  8                 6
13  0.666667           500                  8                 9
14  0.629630           500              

In [55]:
max_idx = results['accuracy'].idxmax()
n_estimators = int(results.loc[max_idx]['n_estimators'])
min_samples_split = int(results.loc[max_idx]['min_samples_split'])
min_samples_leaf = int(results.loc[max_idx]['min_samples_leaf'])

In [57]:
X_test_val_combined = np.concatenate([X_test, X_val])
y_test_val_combined = np.concatenate([y_test, y_val])

rf_current = ensemble.RandomForestClassifier(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                )

rf_current.fit(X_train, y_train)
y_test_hat = rf_current.predict(X_test_val_combined)
accuracy = accuracy_score(y_test_val_combined, y_test_hat)

print(f'DT with default settings achieved {round(accuracy * 100, 1)}% accuracy.')

DT with default settings achieved 63.9% accuracy.




In [58]:
importances = rf_current.feature_importances_
names = X_train.columns

feature_importance = pd.DataFrame(zip(names, importances),
                                  columns=['Feature', 'Importance'])
feature_importance = feature_importance.sort_values('Importance', ascending=False).reset_index(drop=True)

print(feature_importance[:6])


           Feature  Importance
0    Elevation_max    0.356145
1    End_Elevation    0.253556
2    Elevation_min    0.168845
3  Start_Elevation    0.119543
4      Distance_Km    0.076238
5           climbs    0.025673
