In [55]:
import pandas as pd
import numpy as np
import warnings

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    # path = '~/Documents/GitHub/f1-analytics/'
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [56]:
data = pd.read_csv(path+'data/ml_input.csv')
merged = pd.read_csv(path+'data/merged.csv')

In [57]:
### Season to test results

N = 2018

In [58]:
df = data.copy()
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)

train = df[df.season < N]

scaler = StandardScaler()
X_train = train.drop(['podium'], axis=1)
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
# X_train = np.asarray(X_train)

y_train = np.asarray(train.podium.values)

In [59]:
comparison_dict ={'model':[],
                  'params': [],
                  'score': []}

params score

2021 (gini, 0.8, 27) 0.619048\
2020 (gini, sqrt, 41) 0.705882\
2019 (entropy, 0.8, 43)	0.454545\
2018 (gini, sqrt, 35)	0.523810


In [60]:
def score_classification(model):
    probabilities = []
    
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(['podium'], axis=1)
        y_test = test.podium

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop=True)
        prediction_df.reset_index(inplace=True, drop=True)

        probabilities += list(prediction_df['proba_1'].values)

    return probabilities

In [61]:
# Random Forest Classifier

params={'criterion': ['gini', 'entropy'],
        'max_features': [0.8, 'sqrt'],
        'max_depth': list(map(int,(np.linspace(5, 55, 26))))}

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            model_params = (criterion, max_features, max_depth)
            model = RandomForestClassifier(criterion = criterion, max_features = max_features, max_depth = max_depth)
            model.fit(X_train, y_train)
            
            probabilities = score_classification(model)

            test = merged[(merged.season == N)]
            test['prob'] = probabilities
            test = test[['round', 'driver', 'prob', 'grid', 'podium']]
            max_prob = test.groupby('round').agg({'prob': max}).reset_index()

            predict_wins = pd.merge(test, max_prob, how='inner', on=['round', 'prob']).drop('prob', axis=1)

            accuracy = (predict_wins.query('podium == 1').podium.count() / predict_wins['round'].count())
            
            comparison_dict['model'].append('random_forest_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(accuracy)

In [62]:
comparison_df = pd.DataFrame(comparison_dict)

comparison_df.sort_values('score', ascending=False)

Unnamed: 0,model,params,score
41,random_forest_classifier,"(gini, sqrt, 35)",0.523810
0,random_forest_classifier,"(gini, 0.8, 5)",0.476190
63,random_forest_classifier,"(entropy, 0.8, 27)",0.476190
26,random_forest_classifier,"(gini, sqrt, 5)",0.476190
27,random_forest_classifier,"(gini, sqrt, 7)",0.476190
...,...,...,...
71,random_forest_classifier,"(entropy, 0.8, 43)",0.380952
74,random_forest_classifier,"(entropy, 0.8, 49)",0.380952
77,random_forest_classifier,"(entropy, 0.8, 55)",0.380952
19,random_forest_classifier,"(gini, 0.8, 43)",0.380952
