In [187]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'
    # path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [188]:
data = pd.read_csv(path+'data/ml_input.csv')
merged = pd.read_csv(path+'data/merged.csv')

In [189]:
### Season to test results

N = 2022

In [190]:
df = data.copy()

train = df[df.season < N]

scaler = StandardScaler()

X_train = train.drop(['season', 'round', 'podium', 'driver_points_from_race', 'constructor_points_from_race', 'qualifying_pos'], axis=1)
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

y_train = np.asarray(train.driver_points_from_race.values)

In [191]:
X_train

Unnamed: 0,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,starting_grid,stage_q1,stage_q2,stage_q3,...,circuit_id_rodriguez,circuit_id_sepang,circuit_id_shanghai,circuit_id_silverstone,circuit_id_sochi,circuit_id_spa,circuit_id_suzuka,circuit_id_villeneuve,circuit_id_yas_marina,circuit_id_zandvoort
0,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-1.293114,-0.587493,-0.588942,1.028641,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
1,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-1.120475,-0.587493,-0.588942,1.028641,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
2,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.084641,-0.587493,1.697960,-0.972157,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
3,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,-0.947836,-0.587493,-0.588942,1.028641,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
4,-0.687930,-0.318192,-1.600137,-0.705415,-0.372396,-1.700086,0.778554,-0.587493,-0.588942,1.028641,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,-0.228353,-0.075354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3183,2.064609,0.364443,-0.969673,3.446258,3.579659,-1.058870,-1.120475,-0.587493,-0.588942,1.028641,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3184,-0.586521,-0.318192,1.079332,-0.534595,-0.372396,0.864776,0.951193,1.702149,-0.588942,-0.972157,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3185,-0.644469,-0.318192,1.236948,-0.608864,-0.372396,1.185384,0.605915,-0.587493,1.697960,-0.972157,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354
3186,-0.456138,-0.318192,0.764100,-0.534595,-0.372396,0.864776,1.123832,1.702149,-0.588942,-0.972157,...,-0.196052,-0.162482,-0.196913,-0.244623,-0.230625,-0.231379,-0.198626,-0.196913,4.379180,-0.075354


In [192]:
def get_predictions(X_train, y_train, model):
    predictions = []
    
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(['season','round','podium', 'driver_points_from_race', 'constructor_points_from_race', 'qualifying_pos'], axis=1)
        y_test = test.driver_points_from_race

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict(X_test), columns=['predicted_points'])
        prediction_df['actual_points'] = y_test.reset_index(drop=True)

        predictions += list(prediction_df.predicted_points.values)

        # Retrain model
        X_train = pd.concat([X_train, X_test])
        y_train = np.append(y_train, y_test)

        model.fit(X_train, y_train)
        

    return predictions

In [193]:
# Random Forest Regressor

test = merged.copy()
test = test[['season', 'round', 'driver', 'starting_grid', 'podium', 'driver_points_from_race']].query('season == 2022'
)

params={'criterion': ['squared_error'],
        'max_features': [0.8, 1.0],
        'max_depth': [7, 43]
        }

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion=criterion, max_features=max_features, max_depth=max_depth)
            model.fit(X_train, y_train)
            
            predictions = get_predictions(X_train, y_train, model)
            test['predicted_points_{}'.format(max_depth)] = predictions

0.8
7
43
1.0
7
43


In [225]:
test.query('round == 21').sort_values('predicted_points_7', ascending=False)

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from_race,predicted_points_7,predicted_points_43
3579,2022,21,george_russell,1,1,34.0,15.933595,12.915
3584,2022,21,max_verstappen,3,6,13.0,14.192317,8.95
3580,2022,21,lewis_hamilton,2,2,24.0,12.517562,8.54
3582,2022,21,charles_leclerc,5,4,15.0,11.133578,10.045
3585,2022,21,sergio_perez,4,7,10.0,10.189806,7.615
3596,2022,21,lando_norris,6,18,2.0,9.335313,11.815
3581,2022,21,carlos_sainz,7,3,22.0,8.829945,8.21
3598,2022,21,daniel_ricciardo,11,20,0.0,3.751188,5.37
3583,2022,21,fernando_alonso,17,5,10.0,3.331607,2.92
3586,2022,21,esteban_ocon,16,8,4.0,3.215961,2.04
