In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sys import platform

# Resolve the repository root for the current operating system. Every data
# file in this notebook is read relative to `path`.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'
    # path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux) used to leave `path` undefined, which
    # surfaced later as a NameError at the first read_csv call. Fall back to
    # the current working directory.
    # NOTE(review): assumes the notebook is launched from the repository root
    # on such machines - adjust if the data lives elsewhere.
    path = './'

# Silence numeric RuntimeWarnings and pandas' chained-assignment warning.
warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [2]:
# Read both input tables from the repository's data folder:
#   data      - the table used to build the model features and target
#   processed - the table later used to present results per driver
data, processed = (
    pd.read_csv(path + csv_name)
    for csv_name in ('data/ml_input.csv', 'data/processed.csv')
)

In [3]:
### Season held out for testing: rows with season < N train the model, rows with season == N are predicted

N = 2022

In [4]:
# Work on a copy so the table loaded from disk is left untouched.
df = data.copy()

# Every season strictly before N is training data.
train = df.loc[df.season < N]

# The scaler is fitted on the training features only; the same fitted
# instance is reused later to transform each test race.
scaler = StandardScaler()

# Drop the identifier columns, the target and the other listed
# outcome/points columns so only model features remain.
X_train = train.drop(columns=[
    'season', 'round',
    'podium',
    'driver_points_from_race',
    'constructor_points_from_race',
    'driver_points_before_race',
])
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Regression target: points scored by the driver in the race.
y_train = train['driver_points_from_race'].to_numpy()

In [5]:
# Preview the scaled training features (pandas truncates the rendered table).
X_train

Unnamed: 0,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,q_delta,starting_grid,driver_points_per_race,constructor_points_per_race,...,circuit_id_rodriguez,circuit_id_sepang,circuit_id_shanghai,circuit_id_silverstone,circuit_id_sochi,circuit_id_spa,circuit_id_suzuka,circuit_id_villeneuve,circuit_id_yas_marina,circuit_id_zandvoort
0,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.939483,-1.293114,-0.814282,-0.848059,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
1,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.939483,-1.120475,-0.814282,-0.848059,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
2,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.849924,-0.084641,-0.814282,-0.848059,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
3,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.939483,-0.947836,-0.814282,-0.848059,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
4,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,0.763012,0.778554,-0.814282,-0.848059,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3183,2.064609,0.364443,-0.969673,3.446258,3.579659,-1.058870,-0.575160,-1.120475,0.907091,1.788468,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3184,-0.586521,-0.318192,1.079332,-0.534595,-0.372396,0.864776,0.029583,0.951193,-0.750528,-0.739112,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3185,-0.644469,-0.318192,1.236948,-0.608864,-0.372396,1.185384,-0.008241,0.605915,-0.786390,-0.786841,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3186,-0.456138,-0.318192,0.764100,-0.534595,-0.372396,0.864776,0.066537,1.123832,-0.668842,-0.739112,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354


In [6]:
def get_predictions(X_train, y_train, model, data=None, season=None, feature_scaler=None):
    """Predict every race of one season round by round, refitting after each race.

    For each round of the evaluated season the already-fitted ``model``
    predicts the points of every row in that round. The round's scaled
    features and actual points are then appended to the training set and the
    model is refitted (also after the final round), so later rounds are
    predicted with knowledge of earlier ones.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Scaled training features. Not modified; an extended copy is built
        locally.
    y_train : array-like
        Training targets aligned with ``X_train``.
    model : estimator with ``predict`` and ``fit``
        Must already be fitted. It is refitted in place after every round.
    data : pandas.DataFrame, optional
        Table holding all seasons. Defaults to the notebook-level ``df``.
    season : int, optional
        Season to predict. Defaults to the notebook-level ``N``.
    feature_scaler : object with ``transform``, optional
        Fitted scaler applied to each round's features. Defaults to the
        notebook-level ``scaler``.

    Returns
    -------
    tuple of (list, list)
        ``(winner_flags, predicted_points)``, in that order, with one entry
        per row of the season in round order. ``winner_flags`` is 1 for the
        row(s) with the highest predicted points in their round and 0
        otherwise; ``predicted_points`` is the raw model output.

        NOTE: the original code accumulated these under swapped local names
        (``points`` held the flags, ``winners`` held the points). The return
        order is kept unchanged because existing callers depend on it.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # behave exactly as before.
    data = df if data is None else data
    season = N if season is None else season
    feature_scaler = scaler if feature_scaler is None else feature_scaler

    # Identifier, target and other outcome/points columns that are not features.
    non_feature_cols = [
        'season', 'round',
        'podium',
        'driver_points_from_race',
        'constructor_points_from_race',
        'driver_points_before_race']

    winner_flags = []
    predicted_points = []

    season_rows = data[data.season == season]

    for race_round in season_rows['round'].unique():

        race_rows = season_rows[season_rows['round'] == race_round]
        X_test = race_rows.drop(non_feature_cols, axis=1)
        y_test = race_rows.driver_points_from_race

        # Scale with the already-fitted scaler (transform only, never refit).
        X_test = pd.DataFrame(feature_scaler.transform(X_test), columns=X_test.columns)

        # Predict, then flag the row(s) holding the round's maximum. The max
        # is computed once per round instead of once per row.
        round_points = pd.Series(model.predict(X_test))
        round_flags = (round_points == round_points.max()).astype(int)

        winner_flags += round_flags.tolist()
        predicted_points += round_points.tolist()

        # Retrain model with this round's results included.
        X_train = pd.concat([X_train, X_test])
        y_train = np.append(y_train, y_test)

        model.fit(X_train, y_train)

    return winner_flags, predicted_points

In [7]:
# Random Forest Regressor

# Fixed seed: a random forest is stochastic, so without it every run of this
# cell produces different predictions.
SEED = 42

# Season-N rows of the processed table; predictions are attached below by
# position (a list assigned to a column).
# NOTE(review): assumes these rows are in the same order as the season-N rows
# that get_predictions iterates over - confirm.
test = processed.copy()
test = test[['season', 'round', 'driver', 'starting_grid', 'podium', 'driver_points_from_race', 'q_delta']].query('season ==@N')

# Hyper-parameter grid. With more than one combination, each iteration
# overwrites the prediction columns, so only the last combination survives.
params={'criterion': ['squared_error'],
        'max_features': [1.0],
        'max_depth': [7]
        }

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion=criterion, max_features=max_features, max_depth=max_depth, random_state=SEED)
            model.fit(X_train, y_train)
            
            # NOTE: get_predictions returns the 0/1 winner flags first and the
            # predicted points second, so `points` holds the flags and
            # `winner` holds the points. The column labels below are therefore
            # swapped relative to their contents; they are kept as-is because
            # the result query that follows filters on predicted_points == 1.
            points, winner = get_predictions(X_train, y_train, model)

            test['predicted_points'] = points
            test['predicted_winner'] = winner

In [8]:
# Rows where the model's top pick for the round also has podium == 1
# (presumably a race win - confirm how `podium` is encoded).
# NOTE: in this notebook the 'predicted_points' column holds the 0/1 winner
# flag and 'predicted_winner' holds the predicted points, because the first
# list returned by get_predictions (the flags) is stored under 'predicted_points'.
test.query('predicted_points == 1 & podium == 1')

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from_race,q_delta,predicted_points,predicted_winner
3188,2022,1,charles_leclerc,1,1,26.0,0.0,1,16.90212
3226,2022,3,charles_leclerc,1,1,26.0,0.0,1,18.876247
3245,2022,4,max_verstappen,1,1,34.0,0.0,1,16.111293
3342,2022,9,max_verstappen,1,1,25.0,0.0,1,18.19518
3362,2022,10,carlos_sainz,1,1,25.0,0.0,1,16.540293
3441,2022,14,max_verstappen,14,1,26.0,0.0,1,16.467065
3461,2022,15,max_verstappen,1,1,26.0,0.0,1,17.249057
3481,2022,16,max_verstappen,7,1,25.0,0.145,1,19.131068
3520,2022,18,max_verstappen,1,1,25.0,0.0,1,21.06805
3539,2022,19,max_verstappen,2,1,25.0,0.092,1,19.105547
