In [96]:
import pandas as pd
import numpy as np
import warnings

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, precision_score

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    # path = '~/Documents/GitHub/f1-analytics/'
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [97]:
data = pd.read_csv(path+'data/ml_input.csv')
merged = pd.read_csv(path+'data/merged.csv')

In [98]:
### Season to test results

N = 2021

In [99]:
df = data.copy()
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)

train = df[df.season < N]

scaler = StandardScaler()
X_train = train.drop(['podium'], axis=1)
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

y_train = np.asarray(train.podium.values)

In [100]:
comparison_dict ={'model':[],
                  'params': [],
                  'score': []}

params score

2022 (entropy, sqrt, 35) 0.523810\
2021 (gini, 0.8, 27) 0.619048\
2020 (gini, sqrt, 41) 0.705882\
2019 (entropy, 0.8, 43)	0.454545\
2018 (gini, sqrt, 35)	0.523810

In [101]:
def score_regression(model):
    correct_predictions = 0
    
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(['podium'], axis=1)
        y_test = test.podium

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['podium'] = y_test.reset_index(drop=True)
        prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', inplace=True)
        prediction_df.reset_index(inplace=True, drop=True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        correct_predictions += precision_score(prediction_df.actual, prediction_df.predicted)

    return correct_predictions / df[df.season == N]['round'].nunique()

In [102]:
# Random Forest Regressor

params={'criterion': ['squared_error'],
        'max_features': [0.8, 1.0],
        'max_depth': list(map(int, (np.linspace(5, 55, 26))))
        }

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion = criterion, max_features = max_features, max_depth = max_depth)
            model.fit(X_train, y_train)
            
            score = score_regression(model)

            # test = merged[(merged.season == N)]
            # test['prob'] = probabilities
            # test = test[['round', 'driver', 'prob', 'grid', 'podium']]
            # max_prob = test.groupby('round').agg({'prob': max}).reset_index()

            # predict_wins = pd.merge(test, max_prob, how='inner', on=['round', 'prob']).drop('prob', axis=1)

            # accuracy = (predict_wins.query('podium == 1').podium.count() / predict_wins['round'].count())
            
            comparison_dict['model'].append('random_forest_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(score)

In [103]:
comparison_df = pd.DataFrame(comparison_dict)

comparison_df.sort_values('score', ascending=False)

Unnamed: 0,model,params,score
23,random_forest_regressor,"(squared_error, 0.8, 51)",0.619048
22,random_forest_regressor,"(squared_error, 0.8, 49)",0.619048
26,random_forest_regressor,"(squared_error, 1.0, 5)",0.571429
16,random_forest_regressor,"(squared_error, 0.8, 37)",0.571429
49,random_forest_regressor,"(squared_error, 1.0, 51)",0.571429
40,random_forest_regressor,"(squared_error, 1.0, 33)",0.571429
37,random_forest_regressor,"(squared_error, 1.0, 27)",0.571429
35,random_forest_regressor,"(squared_error, 1.0, 23)",0.571429
33,random_forest_regressor,"(squared_error, 1.0, 19)",0.571429
28,random_forest_regressor,"(squared_error, 1.0, 9)",0.571429
