In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'

warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [2]:
data = pd.read_csv(path+'data/ml_input.csv')
merged = pd.read_csv(path+'data/merged.csv')
parameters = pd.read_csv(path+'parameters/neutralnetworkclassifier.csv')

In [4]:
### Season to test results

N = 2021

In [73]:
data.columns

Index(['season', 'round', 'podium', 'driver_wins_after_race',
       'driver_standings_pos_after_race', 'driver_points', 'driver_wins',
       'driver_standings_pos', 'constructor_wins_after_race',
       'constructor_standings_pos_after_race', 'constructor_points',
       'constructor_wins', 'constructor_standings_pos', 'average_pace', 'grid',
       'qual_time', 'circuit_id_albert_park', 'circuit_id_americas',
       'circuit_id_bahrain', 'circuit_id_baku', 'circuit_id_catalunya',
       'circuit_id_hockenheimring', 'circuit_id_hungaroring',
       'circuit_id_imola', 'circuit_id_interlagos', 'circuit_id_istanbul',
       'circuit_id_jeddah', 'circuit_id_losail', 'circuit_id_marina_bay',
       'circuit_id_monaco', 'circuit_id_monza', 'circuit_id_mugello',
       'circuit_id_nurburgring', 'circuit_id_portimao',
       'circuit_id_red_bull_ring', 'circuit_id_ricard', 'circuit_id_rodriguez',
       'circuit_id_sepang', 'circuit_id_shanghai', 'circuit_id_silverstone',
       'circuit_id

In [36]:
df = data.copy()
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)

train = df[df.season != N]
X_train = train.drop(['podium', 'average_pace','driver_wins_after_race','driver_standings_pos_after_race','constructor_standings_pos_after_race'], axis=1)
y_train = train.podium

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [30]:
def score_classification(model):
    score = 0
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(['podium', 'average_pace'], axis = 1)
        y_test = test.podium
        
        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop=True)
        prediction_df.sort_values('proba_1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace=True, drop=True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        score += precision_score(prediction_df.actual, prediction_df.predicted)

    model_score = score / df[df.season == N]['round'].unique().max()
    return model_score

In [7]:
comparison_dict ={'model':[],
                  'hidden_layer_sizes': [],
                  'activation': [],
                  'solver': [],
                  'alpha': [],
                  'score': []
                  }

In [8]:
# Neural network

params={'hidden_layer_sizes': [(80,20,40,5), (75,25,50,10)], 
        'activation': ['identity', 'logistic', 'tanh', 'relu'], 
        'solver': ['lbfgs', 'sgd', 'adam'], 
        'alpha': np.logspace(-4,2,20)} 

parms = parameters


for hidden_layer_sizes in params['hidden_layer_sizes']:
    for activation in params['activation']:
        for solver in params['solver']:
            for alpha in params['alpha']:
                model_params = (hidden_layer_sizes, activation, solver, alpha)
                model = MLPClassifier(
                    hidden_layer_sizes=hidden_layer_sizes,
                    activation=activation, 
                    solver=solver, alpha=alpha, 
                    random_state=1)

                model.fit(X_train, y_train)

                model_score = score_classification(model)

                comparison_dict['model'].append('neural_network_classifier')
                comparison_dict['hidden_layer_sizes'].append(hidden_layer_sizes)
                comparison_dict['activation'].append(activation)
                comparison_dict['solver'].append(solver)
                comparison_dict['alpha'].append(alpha)
                comparison_dict['score'].append(model_score)

KeyboardInterrupt: 

In [9]:
comparison_df = pd.DataFrame(comparison_dict)

comparison_df.sort_values('score', ascending=False)

Unnamed: 0,model,hidden_layer_sizes,activation,solver,alpha,score
0,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.0001,0.727273
9,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.069519,0.727273
15,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,5.455595,0.727273
14,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,2.636651,0.727273
13,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,1.274275,0.727273
12,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.615848,0.727273
11,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.297635,0.727273
10,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.143845,0.727273
8,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.033598,0.727273
1,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.000207,0.727273


In [10]:
chosen_inputs = comparison_df.query('score == 0.7272727272727273')

# chosen_inputs.to_csv(path+'parameters/neutralnetworkclassifier.csv')

In [11]:
parameters.head(1)

Unnamed: 0.1,Unnamed: 0,model,hidden_layer_sizes,activation,solver,alpha,score
0,0,neural_network_classifier,"(80, 20, 40, 5)",identity,lbfgs,0.0001,0.727273


In [68]:
def get_predictions(model):
    predictions = []
    for circuit in df[df.season == N]['round'].unique():

        test = df[(df.season == N) & (df['round'] == circuit)]
        X_test = test.drop(['podium', 'average_pace'], axis = 1)
        y_test = test.podium
        
        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop=True)
        # prediction_df.sort_values('proba_1', ascending=False, inplace=True)
        # prediction_df.reset_index(inplace=True, drop=True)
        # prediction_df['predicted'] = prediction_df.index
        # prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        predictions += list(prediction_df.proba_1.values)

    return predictions

In [69]:
N = 2021

In [70]:
hidden_layer_sizes= [(80,20,40,5), (75, 25, 50, 10)][0]
activation='identity'
solver='lbfgs'
alpha=0.00001

model = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation, 
    solver=solver, alpha=alpha, 
    random_state=1)

model.fit(X_train, y_train)

model_score = score_classification(model)
predictions = get_predictions(model)

print(model_score)

0.7272727272727273


In [71]:
test = merged.query('season ==@N')
test['probability'] = predictions
test = test[['season', 'round', 'circuit_id', 'driver', 'grid', 'podium', 'predicted']]

In [72]:
test.query('predicted == 1')

Unnamed: 0,season,round,circuit_id,driver,grid,podium,predicted
2379,2021,6,baku,perez,7,1,1.0
2415,2021,8,red_bull_ring,max_verstappen,1,1,1.0
2434,2021,9,red_bull_ring,max_verstappen,1,1,1.0
2505,2021,14,monza,ricciardo,5,1,1.0
2543,2021,16,istanbul,bottas,2,1,1.0
2601,2021,19,interlagos,hamilton,1,1,1.0
