In [17]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'
    # path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [18]:
data = pd.read_csv(path+'data/ml_input.csv')
processed = pd.read_csv(path+'data/processed.csv')
parameters = pd.read_csv(path+'parameters/rf_regressor.csv')

In [19]:
### Season to test results

N = 2021

In [20]:
params_to_drop = ['season', 'round', 'driver', 'constructor', 'circuit_id', 'podium', 'driver_points_from']

In [21]:
df = data.copy()

train = df[df.season < N]

scaler = StandardScaler()

X_train = train.drop(params_to_drop, axis=1)
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

y_train = np.asarray(train.driver_points_from.values)

In [22]:
def get_predictions(X_train, y_train, model):
    points = []
    winners = []
    
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(params_to_drop, axis=1)
        y_test = test.driver_points_from

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict(X_test), columns=['predicted_points'])
        prediction_df['actual_points'] = y_test.reset_index(drop=True)

        prediction_df['predicted_winner'] = prediction_df.predicted_points.map(lambda x: 1 if x == prediction_df.predicted_points.max() else 0)
        prediction_df['actual_winner'] = prediction_df.actual_points.map(lambda x: 1 if x == prediction_df.actual_points.max() else 0)

        points += list(prediction_df.predicted_points.values)
        winners += list(prediction_df.predicted_winner.values)

        # Retrain model
        # X_train = pd.concat([X_train, X_test])
        # y_train = np.append(y_train, y_test)

        # model.fit(X_train, y_train)
        

    return points, winners

In [23]:
parameters.head()

Unnamed: 0,model,criterion,max_features,max_depth,score
0,random_forest_regressor,squared_error,0.8,51,0.736842
1,random_forest_regressor,squared_error,0.8,5,0.684211
2,random_forest_regressor,squared_error,0.8,45,0.684211
3,random_forest_regressor,squared_error,1.0,47,0.684211
4,random_forest_regressor,squared_error,1.0,41,0.684211


In [24]:
params = parameters.iloc[0]
criterion = params.criterion
max_features = params.max_features	
max_depth = params.max_depth

params

model           random_forest_regressor
criterion                 squared_error
max_features                        0.8
max_depth                            51
score                          0.736842
Name: 0, dtype: object

In [25]:
# Random Forest Regressor

test = processed.copy()
test = test[['season', 'round', 'driver', 'starting_grid', 'podium', 'driver_points_from', 'q_delta']].query('season ==@N')

model_params = (criterion, max_features, max_depth)
model = RandomForestRegressor(criterion=criterion, max_features=max_features, max_depth=max_depth)
model.fit(X_train, y_train)

points, winner = get_predictions(X_train, y_train, model)

test['predicted_points'] = points
test['predicted_winner'] = winner

In [26]:
test.query('predicted_winner == 1')

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from,q_delta,predicted_points,predicted_winner
2742,2021,1,lewis_hamilton,2,1,25.0,0.388,17.862803,1
2763,2021,2,lewis_hamilton,1,2,19.0,0.0,21.6,1
2783,2021,3,valtteri_bottas,1,3,16.0,0.0,18.528333,1
2801,2021,4,lewis_hamilton,1,1,25.0,0.0,19.53,1
2839,2021,5,charles_leclerc,1,20,0.0,0.0,16.476,1
2853,2021,6,lewis_hamilton,2,15,0.0,0.0,16.0,1
2858,2021,7,max_verstappen,1,1,26.0,0.0,22.9,1
2876,2021,8,max_verstappen,1,1,25.0,0.0,17.51,1
2896,2021,9,max_verstappen,1,1,26.0,0.0,19.06,1
2917,2021,11,lewis_hamilton,1,2,18.0,0.0,16.09,1


In [29]:
test.query('round == 7')

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from,q_delta,predicted_points,predicted_winner
2858,2021,7,max_verstappen,1,1,26.0,0.0,22.9,1
2859,2021,7,lewis_hamilton,2,2,18.0,0.0,17.63,0
2860,2021,7,sergio_perez,4,3,15.0,3.633,10.35,0
2861,2021,7,valtteri_bottas,3,4,12.0,0.0,12.37,0
2862,2021,7,lando_norris,8,5,10.0,4.44,3.6,0
2863,2021,7,daniel_ricciardo,10,6,8.0,1.392,3.81,0
2864,2021,7,pierre_gasly,6,7,6.0,4.056,7.88,0
2865,2021,7,fernando_alonso,9,8,4.0,4.528,3.53,0
2866,2021,7,sebastian_vettel,12,9,2.0,1.777,4.91,0
2867,2021,7,carlos_sainz,5,11,0.0,4.028,7.38,0


In [28]:
rounds = len(test['round'].unique())
correct = len(test.query('predicted_winner == 1 & podium == 1'))

(correct / rounds) * 100

63.1578947368421