In [92]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sys import platform

# Locate the local checkout of the f1-analytics repository. `path` is used as
# a string prefix by later cells, so it must end with a slash.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
else:
    # macOS ("darwin") and every other platform use the home-relative
    # checkout, so `path` is always defined (previously it was left unset on
    # anything other than win32/darwin, which raised NameError further down).
    path = '~/Documents/GitHub/f1-analytics/'

# Hide RuntimeWarning-category warnings and switch off pandas'
# chained-assignment warning for the rest of the notebook.
warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [93]:
# Read the model-ready feature table and the merged race table from the
# repository's data folder.
ml_input_csv = path + 'data/ml_input.csv'
merged_csv = path + 'data/merged.csv'

data = pd.read_csv(ml_input_csv)
merged = pd.read_csv(merged_csv)

In [94]:
# Season held out for testing: rows from earlier seasons form the training
# set, and rows from season N are the ones predictions are made for.

N = 2021

In [95]:
# Work on a copy so the frame loaded from disk stays untouched.
df = data.copy()

# Every season before N is training data.
is_training_season = df['season'] < N
train = df.loc[is_training_season]

# Identifier and outcome columns that are not used as model inputs.
non_feature_columns = [
    'season',
    'round',
    'podium',
    'driver_points_from_race',
    'constructor_points_from_race',
    'qualifying_pos',
]
train_features = train.drop(columns=non_feature_columns)

# Fit the scaler on the training features only; it is reused later to
# transform the test rows.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(train_features)
X_train = pd.DataFrame(scaled_values, columns=train_features.columns)

# Regression target: points the driver scored in the race.
y_train = np.asarray(train['driver_points_from_race'])

In [96]:
X_train

Unnamed: 0,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,starting_grid,driver_adrian_sutil,driver_alexander_albon,driver_alexander_rossi,...,circuit_id_rodriguez,circuit_id_sepang,circuit_id_shanghai,circuit_id_silverstone,circuit_id_sochi,circuit_id_spa,circuit_id_suzuka,circuit_id_villeneuve,circuit_id_yas_marina,circuit_id_zandvoort
0,-0.679535,-0.314453,-1.595994,-0.693558,-0.361653,-1.697284,-1.294514,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,-0.230289,0.0
1,-0.679535,-0.314453,-1.595994,-0.693558,-0.361653,-1.697284,-1.122576,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,-0.230289,0.0
2,-0.679535,-0.314453,-1.595994,-0.693558,-0.361653,-1.697284,-0.090952,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,-0.230289,0.0
3,-0.679535,-0.314453,-1.595994,-0.693558,-0.361653,-1.697284,-0.950639,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,-0.230289,0.0
4,-0.679535,-0.314453,-1.595994,-0.693558,-0.361653,-1.697284,0.768734,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,-0.230289,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2755,-0.621236,-0.314453,1.067971,-0.634057,-0.361653,0.856971,0.596797,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,4.342363,0.0
2756,-0.679535,-0.314453,1.694786,-0.693558,-0.361653,1.495535,1.284546,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,4.342363,0.0
2757,-0.664960,-0.314453,1.538082,-0.671245,-0.361653,1.176253,1.628421,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,4.342363,0.0
2758,-0.679535,-0.314453,2.008193,-0.671245,-0.361653,1.176253,1.112609,-0.083257,-0.116567,-0.042601,...,-0.191871,-0.174985,-0.21227,-0.248069,-0.232895,-0.232895,-0.214128,-0.21227,4.342363,0.0


In [97]:
def get_predictions(X_train, y_train, model, data=None, season=None, feature_scaler=None):
    """Predict every round of one season, refitting the model after each round.

    For each distinct ``round`` of ``season`` (in order of first appearance in
    ``data``), the rows of that round are scaled with ``feature_scaler``,
    scored with ``model.predict``, and then appended to the training set,
    after which ``model`` is refit on the enlarged set.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Already-scaled training features. The caller's frame is not modified;
        the enlarged training set only exists inside this function.
    y_train : array-like
        Training targets aligned with ``X_train``.
    model : estimator
        Object exposing ``predict(X)`` and ``fit(X, y)``. It must already be
        fitted, and it is refit in place after every round (including the
        last one), so the caller's object changes.
    data : pandas.DataFrame, optional
        Frame holding ``season``, ``round``, the feature columns and the
        outcome columns. Defaults to the notebook-level ``df``.
    season : optional
        Season to predict. Defaults to the notebook-level ``N``.
    feature_scaler : optional
        Fitted object exposing ``transform(X)``. Defaults to the
        notebook-level ``scaler``.

    Returns
    -------
    tuple of (list, list)
        ``(winner_flags, predicted_points)`` -- in that order. Both lists have
        one entry per row of the season, in the order the rounds were
        processed. ``winner_flags`` is 1 for the row(s) with the highest
        predicted points within their round and 0 otherwise;
        ``predicted_points`` holds the raw model outputs.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # keep working unchanged.
    data = df if data is None else data
    season = N if season is None else season
    feature_scaler = scaler if feature_scaler is None else feature_scaler

    # Identifier and outcome columns that are not model inputs.
    non_feature_cols = ['season', 'round', 'podium', 'driver_points_from_race',
                        'constructor_points_from_race', 'qualifying_pos']

    winner_flags = []
    predicted_points = []

    season_rows = data[data.season == season]
    for race_round in season_rows['round'].unique():
        race = season_rows[season_rows['round'] == race_round]
        X_test = race.drop(columns=non_feature_cols)
        y_test = race.driver_points_from_race

        # Scale with the already-fitted scaler (transform only, no refit).
        X_test = pd.DataFrame(feature_scaler.transform(X_test), columns=X_test.columns)

        # Score the round and flag the top prediction(s); the max is computed
        # once per round instead of once per row.
        race_points = pd.Series(np.ravel(model.predict(X_test)))
        race_flags = (race_points == race_points.max()).astype(int)

        winner_flags.extend(race_flags.to_numpy())
        predicted_points.extend(race_points.to_numpy())

        # Add the finished round to the training set and refit, so the next
        # round is predicted by a model that has seen this one.
        X_train = pd.concat([X_train, X_test])
        y_train = np.append(y_train, y_test)
        model.fit(X_train, y_train)

    return winner_flags, predicted_points

In [99]:
# Random Forest Regressor

# Fixed seed so the forest -- and every refit inside get_predictions -- gives
# the same result on each run of the notebook.
SEED = 42

# Human-readable rows of season N that the predictions get attached to.
test = merged.copy()
test = test[['season', 'round', 'driver', 'starting_grid', 'podium', 'driver_points_from_race']].query('season ==@N')

# Hyper-parameter grid; each list currently holds a single candidate.
params={'criterion': ['squared_error'],
        'max_features': [1.0],
        'max_depth': [7]
        }

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            model = RandomForestRegressor(criterion=criterion, max_features=max_features,
                                          max_depth=max_depth, random_state=SEED)
            model.fit(X_train, y_train)

            # get_predictions returns the 0/1 winner flags first and the
            # predicted points second.
            winner_flags, predicted_pts = get_predictions(X_train, y_train, model)

            # NOTE(review): the column names are swapped relative to their
            # contents -- 'predicted_points' holds the 0/1 winner flag and
            # 'predicted_winner' holds the predicted points. They are left
            # as-is because the result query below filters on
            # `predicted_points == 1`; rename both together when fixing.
            # The lists are assigned by position, which assumes the season-N
            # rows of `merged` are in the same order as those of `df`
            # -- TODO confirm.
            test['predicted_points'] = winner_flags
            test['predicted_winner'] = predicted_pts

In [104]:
# Rows flagged with 1 in 'predicted_points' whose driver finished on the
# podium (position 3 or better).
podium_hits = 'predicted_points == 1 & podium <= 3'
test.query(podium_hits)

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from_race,predicted_points,predicted_winner
2760,2021,1,lewis_hamilton,2,1,25.0,1,18.459459
2781,2021,2,lewis_hamilton,1,2,19.0,1,22.007723
2799,2021,3,lewis_hamilton,2,1,25.0,1,20.798858
2819,2021,4,lewis_hamilton,1,1,25.0,1,22.035526
2839,2021,5,max_verstappen,2,1,25.0,1,16.201242
2877,2021,7,lewis_hamilton,2,2,18.0,1,18.367224
2894,2021,8,max_verstappen,1,1,25.0,1,18.566575
2914,2021,9,max_verstappen,1,1,26.0,1,18.756401
2934,2021,10,lewis_hamilton,2,1,27.0,1,19.402093
2955,2021,11,lewis_hamilton,1,2,18.0,1,20.626886
