In [1]:
import sqlite3

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Lift pandas' display truncation so wide / long frames render in full.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Season held out for evaluation; every earlier season is used for training.
year_test = 2023

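## Connect to Database
Load the feature table and the race-info table from `races.db`, then build the scaled training set (all seasons before `year_test`). The test season is split off round by round inside the scoring functions below.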

In [2]:
# Read both tables inside try/finally so the SQLite handle is always released,
# even when one of the queries raises (e.g. a missing table).
connection = sqlite3.connect('races.db')
try:
    # Feature table used for modelling.
    table_name = 'i_df_dum_table'
    query = f'SELECT * FROM "{table_name}"'
    data = pd.read_sql_query(query, connection)

    # Race-info table; later cells read season / round / circuit_id from it.
    table_name = 'h_final_cleaned_table'
    query = f'SELECT * FROM "{table_name}"'
    info = pd.read_sql_query(query, connection)
finally:
    connection.close()

data = data.reset_index(drop=True)

# Work on a copy so `data` keeps the original `podium` values, which the
# prediction cells below still need.
df = data.copy()
# Binary target: 1 where podium == 1, otherwise 0 (vectorized, no per-row lambda).
df['podium'] = (df['podium'] == 1).astype(int)

# Training set: every season strictly before the held-out test season.
train = df[df.season < year_test]
X_train = train.drop(['driver', 'podium'], axis = 1)
y_train = train.podium

# Fit the scaler on training rows only; the scoring functions reuse this fitted
# scaler to transform each test round.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

## Use the Best Parameters Found
Fit the final model and calculate the model score.

In [3]:
# scoring function for classification

def score_classification(model):
    """Average, over the rounds of the test season, the precision of the top pick.

    For each round of ``year_test`` the drivers are ranked by the model's
    class-1 probability; only the highest-ranked driver is labelled 1 and the
    rest 0. The precision of that labelling against the binary ``podium``
    target is accumulated and divided by the number of rounds.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two output columns.

    Returns
    -------
    float
        Mean per-round precision.
    """
    season_rows = df[df.season == year_test]
    rounds = season_rows['round'].unique()

    total = 0
    for race_round in rounds:
        race = season_rows[season_rows['round'] == race_round]
        features = race.drop(columns=['driver', 'podium'])

        # Scale with the scaler fitted on the training data.
        scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)

        ranking = pd.DataFrame(model.predict_proba(scaled), columns=['proba_0', 'proba_1'])
        ranking['actual'] = race.podium.reset_index(drop=True)
        ranking = ranking.sort_values('proba_1', ascending=False).reset_index(drop=True)

        # After sorting, row 0 is the most likely driver: flag it as the single positive.
        ranking['predicted'] = (ranking.index == 0).astype(int)

        total += precision_score(ranking.actual, ranking.predicted)

    return total / len(rounds)

In [4]:
# Fit the final classifier on the entire (scaled) training set, using the
# hyperparameters selected as best.
best_params = {
    'hidden_layer_sizes': (75, 25, 50, 10),
    'activation': 'identity',
    'solver': 'lbfgs',
    'alpha': 0.01623776739188721,
    'random_state': 1,  # fixed seed so the fit is reproducible
}
final_model = MLPClassifier(**best_params)
final_model.fit(X_train, y_train)

In [5]:
# NOTE: alternative hyperparameter set, kept commented out for reference only.
# The active configuration is the MLPClassifier fitted in the preceding cell;
# uncommenting this block would overwrite `final_model`.
# # Train the final model with the best parameters on the entire training dataset
# final_model = MLPClassifier(hidden_layer_sizes=(80, 20, 40, 5),
#                             activation='identity',
#                             solver='lbfgs',
#                             alpha=0.007847599703514606,
#                             random_state=1)
# final_model.fit(X_train, y_train)

In [6]:
# Convenience wrapper used to evaluate the final model
def evaluate_final_model(model):
    """Return the test-season score of ``model`` as computed by ``score_classification``."""
    return score_classification(model)

In [7]:
# Score the fitted final model on the held-out test season
final_model_score = evaluate_final_model(final_model)

# Report the score; `final_model_score` stays available to later cells
print(f"Final Model Score: {final_model_score}")

Final Model Score: 0.8


## Run Prediction

In [8]:
def score_classification_with_predictions(model):
    """Rank the drivers of every test-season round by predicted win probability.

    Despite its name, this function computes no score; it only returns the
    per-driver predictions.

    For each round of ``year_test`` the rows are scaled with the fitted
    ``scaler``, scored with ``model.predict_proba`` and sorted by descending
    class-1 probability. The 1-based position in that ordering is stored as
    ``predicted``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two output columns.

    Returns
    -------
    pandas.DataFrame
        One row per driver per round, with columns ``season``, ``round``,
        ``circuit_id``, ``driver``, ``grid``, ``podium`` (value from the
        unmodified ``data`` table), ``actual`` (binary target from ``df``),
        ``predicted``, ``proba_0`` and ``proba_1``.
    """
    season_rows = df[df.season == year_test]

    # Resolve circuit ids by round label instead of pasting `info` next to the
    # predictions by row position: a positional concat silently mislabels rows
    # (or appends NaN rows) whenever `info` has a different row count or order
    # than the prediction rows for this season.
    season_info = info.loc[info['season'] == year_test, ['round', 'circuit_id']]
    circuit_by_round = season_info.drop_duplicates('round').set_index('round')['circuit_id']

    predictions_df_list = []  # one ranked frame per round

    for circuit in season_rows['round'].unique():
        # `data` still holds the original podium values; `df` holds the binary target.
        podium_pos = data[(data.season == year_test) & (data['round'] == circuit)]
        test = season_rows[season_rows['round'] == circuit]
        X_test = test.drop(['driver', 'podium'], axis=1)

        # Scale with the scaler fitted on the training data.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = test.podium.reset_index(drop=True)
        prediction_df['driver'] = test.driver.reset_index(drop=True)
        prediction_df['grid'] = test.grid.reset_index(drop=True)
        prediction_df['podium'] = podium_pos.podium.reset_index(drop=True)
        # Attach the race identifiers here so they can never drift out of step
        # with the rows they describe.
        prediction_df['season'] = year_test
        prediction_df['round'] = circuit

        prediction_df = prediction_df.sort_values('proba_1', ascending=False).reset_index(drop=True)
        # 1-based rank within the round (1 = most likely).
        prediction_df['predicted'] = range(1, len(prediction_df) + 1)

        predictions_df_list.append(prediction_df)

    # Stack all rounds into a single frame with a fresh 0..n-1 index.
    predictions_df_final = pd.concat(predictions_df_list, ignore_index=True)
    predictions_df_final['circuit_id'] = predictions_df_final['round'].map(circuit_by_round)

    return predictions_df_final[['season', 'round', 'circuit_id',
                                 'driver', 'grid', 'podium', 'actual',
                                 'predicted', 'proba_0', 'proba_1']]

In [9]:
# Build the per-driver predictions DataFrame for the test season
# (the function returns only the predictions; no score is computed here)
final_predictions_df = score_classification_with_predictions(final_model)

In [10]:
# Score on the whole grid: share of rows whose predicted rank equals the
# recorded `podium` value exactly.
exact_match = final_predictions_df['podium'] == final_predictions_df['predicted']

grid_score = pd.DataFrame({
    'actual': final_predictions_df['podium'],
    'predicted': final_predictions_df['predicted'],
})
grid_score['actual_y'] = 1
grid_score['pred_y'] = exact_match.astype(int)  # 1 on an exact match, else 0

b = grid_score['pred_y']
score = int(b.sum())  # number of exact matches
score / len(b)
    

0.17666666666666667

In [11]:
# Show the row-by-row comparison behind the whole-grid score
# (pred_y is 1 where `predicted` equals `actual`).
grid_score

Unnamed: 0,actual,predicted,actual_y,pred_y
0,1,1,1,1
1,2,2,1,1
2,4,3,1,0
3,3,4,1,0
4,19,5,1,0
5,5,6,1,0
6,7,7,1,1
7,6,8,1,0
8,17,9,1,0
9,18,10,1,0


In [12]:
# Inspect the predicted ranking for round 15, hiding the binary `actual` column
final_predictions_df[final_predictions_df['round'] == 15].drop(columns=['actual'])

Unnamed: 0,season,round,circuit_id,driver,grid,podium,predicted,proba_0,proba_1
280,2023,15,marina_bay,sainz,1.0,1,1,0.830047,0.1699529
281,2023,15,marina_bay,russell,2.0,2,2,0.887084,0.1129164
282,2023,15,marina_bay,hamilton,5.0,5,3,0.932986,0.06701386
283,2023,15,marina_bay,max_verstappen,11.0,11,4,0.945788,0.05421228
284,2023,15,marina_bay,leclerc,3.0,3,5,0.961106,0.03889444
285,2023,15,marina_bay,norris,4.0,4,6,0.962401,0.03759886
286,2023,15,marina_bay,alonso,7.0,7,7,0.971143,0.02885726
287,2023,15,marina_bay,lawson,10.0,10,8,0.997445,0.002555358
288,2023,15,marina_bay,ocon,8.0,8,9,0.997585,0.002414547
289,2023,15,marina_bay,perez,13.0,13,10,0.997799,0.002201417


In [13]:
# Rows whose recorded `podium` value is 1, to see where the model ranked them.
# (The commented-out variant below also includes the model's top pick; it is run in the next cell.)
# final_predictions_df[(final_predictions_df['podium'] == 1) | (final_predictions_df['predicted'] == 1)]
final_predictions_df[(final_predictions_df['podium'] == 1)]


Unnamed: 0,season,round,circuit_id,driver,grid,podium,actual,predicted,proba_0,proba_1
0,2023,1,bahrain,max_verstappen,1.0,1,1,1,0.470859,0.529141
21,2023,2,jeddah,perez,1.0,1,1,2,0.729352,0.270648
40,2023,3,albert_park,max_verstappen,1.0,1,1,1,0.515203,0.484797
61,2023,4,baku,perez,3.0,1,1,2,0.857264,0.142736
83,2023,5,miami,max_verstappen,9.0,1,1,4,0.948267,0.051733
100,2023,6,monaco,max_verstappen,1.0,1,1,1,0.39992,0.60008
120,2023,7,catalunya,max_verstappen,1.0,1,1,1,0.377069,0.622931
140,2023,8,villeneuve,max_verstappen,1.0,1,1,1,0.348683,0.651317
160,2023,9,red_bull_ring,max_verstappen,1.0,1,1,1,0.29661,0.70339
180,2023,10,silverstone,max_verstappen,1.0,1,1,1,0.299021,0.700979


In [14]:
# Rows where either the recorded `podium` value is 1 or the model's rank is 1,
# so hits and misses of the top pick can be compared side by side.
final_predictions_df[(final_predictions_df['podium'] == 1) | (final_predictions_df['predicted'] == 1)]


Unnamed: 0,season,round,circuit_id,driver,grid,podium,actual,predicted,proba_0,proba_1
0,2023,1,bahrain,max_verstappen,1.0,1,1,1,0.470859,0.529141
20,2023,2,jeddah,alonso,2.0,3,0,1,0.695258,0.304742
21,2023,2,jeddah,perez,1.0,1,1,2,0.729352,0.270648
40,2023,3,albert_park,max_verstappen,1.0,1,1,1,0.515203,0.484797
60,2023,4,baku,max_verstappen,2.0,2,0,1,0.523038,0.476962
61,2023,4,baku,perez,3.0,1,1,2,0.857264,0.142736
80,2023,5,miami,perez,1.0,2,0,1,0.668703,0.331297
83,2023,5,miami,max_verstappen,9.0,1,1,4,0.948267,0.051733
100,2023,6,monaco,max_verstappen,1.0,1,1,1,0.39992,0.60008
120,2023,7,catalunya,max_verstappen,1.0,1,1,1,0.377069,0.622931


In [15]:
# Full prediction table without the binary `actual` flag and the `grid` column
final_predictions_df.drop(columns=['actual', 'grid'])

Unnamed: 0,season,round,circuit_id,driver,podium,predicted,proba_0,proba_1
0,2023,1,bahrain,max_verstappen,1,1,0.470859,0.5291409
1,2023,1,bahrain,perez,2,2,0.807419,0.1925809
2,2023,1,bahrain,sainz,4,3,0.877821,0.122179
3,2023,1,bahrain,alonso,3,4,0.880402,0.1195981
4,2023,1,bahrain,leclerc,19,5,0.903393,0.09660673
5,2023,1,bahrain,hamilton,5,6,0.938939,0.06106109
6,2023,1,bahrain,russell,7,7,0.944428,0.05557156
7,2023,1,bahrain,stroll,6,8,0.98033,0.01966955
8,2023,1,bahrain,norris,17,9,0.994658,0.005342405
9,2023,1,bahrain,ocon,18,10,0.994943,0.005056505
