In [85]:
import sqlite3

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Display settings: remove pandas' limits on shown columns, rows and cell
# width so DataFrames are rendered without truncation.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
)

# Season that is held out for evaluation; earlier seasons are used for training.
year_test = 2024

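## Connect to Database
Load the tables from the SQLite database and build the training data (the test data is built per round in the scoring functions below).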

In [86]:
# Read both tables from the SQLite database. The try/finally guarantees the
# connection is closed even if one of the queries raises.
connection = sqlite3.connect('races.db')
try:
    # Table used to build the model features and the target.
    table_name = 'i_df_dum_table'
    query = f'SELECT * FROM "{table_name}"'
    data = pd.read_sql_query(query, connection)

    # Table used later to look up season / round / circuit_id for the report.
    table_name = 'h_final_cleaned_table'
    query = f'SELECT * FROM "{table_name}"'
    info = pd.read_sql_query(query, connection)
finally:
    connection.close()

data = data.reset_index(drop=True)

# Work on a copy so `data` keeps the original `podium` values, while `df`
# carries a binary target: 1 where podium == 1, otherwise 0.
df = data.copy()
df['podium'] = (df['podium'] == 1).astype(int)

# Every season before the held-out one is used for training.
train = df[df.season < year_test]
X_train = train.drop(['driver', 'podium'], axis=1)
y_train = train.podium

# Fit the scaler on the training features only; the same fitted scaler is
# reused (transform only) when scoring the test season.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

## Use the Best Parameters Found
Fit the final model and calculate the model score.

In [87]:
# scoring function for classification

def score_classification(model):
    """Return the average per-round precision of the model's top pick.

    Uses the module-level ``df``, ``year_test`` and the already fitted
    ``scaler``. For each round of season ``year_test`` the rows are scaled,
    ranked by the second column of ``model.predict_proba`` and only the
    highest-ranked row is labelled positive; ``precision_score`` is then
    computed against the binary ``podium`` target.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two columns.

    Returns
    -------
    float
        Sum of the per-round precisions divided by the number of rounds.
    """
    season_rows = df[df.season == year_test]
    rounds = season_rows['round'].unique()

    total = 0
    for rnd in rounds:
        round_rows = season_rows[season_rows['round'] == rnd]
        features = round_rows.drop(['driver', 'podium'], axis=1)
        labels = round_rows.podium

        # Scale with the scaler fitted on the training data.
        features = pd.DataFrame(scaler.transform(features), columns=features.columns)

        # Rank the rows of this round by the probability of the positive class.
        ranked = (
            pd.DataFrame(model.predict_proba(features), columns=['proba_0', 'proba_1'])
            .assign(actual=labels.reset_index(drop=True))
            .sort_values('proba_1', ascending=False)
            .reset_index(drop=True)
        )
        # Only the top-ranked row (position 0) is predicted as positive.
        ranked['predicted'] = (ranked.index == 0).astype(int)

        total += precision_score(ranked.actual, ranked.predicted)

    return total / len(rounds)

In [88]:
# Train the final model with the best parameters on the entire training dataset
# (X_train / y_train hold every season before year_test, already scaled).
# random_state is fixed so the fit is repeatable.
# NOTE(review): activation='identity' applies no non-linearity between the
# hidden layers — confirm this is the intended result of the parameter search.
final_model = MLPClassifier(hidden_layer_sizes=(75, 25, 50, 10),
                            activation='identity',
                            solver='lbfgs',
                            alpha=0.01623776739188721,
                            random_state=1)
final_model.fit(X_train, y_train)

In [89]:
# # Alternative hyperparameter set, kept commented out for reference (not executed):
# final_model = MLPClassifier(hidden_layer_sizes=(80, 20, 40, 5),
#                             activation='identity',
#                             solver='lbfgs',
#                             alpha=0.007847599703514606,
#                             random_state=1)
# final_model.fit(X_train, y_train)

In [90]:
# Define a function to evaluate the final model
def evaluate_final_model(model):
    """Score ``model`` with ``score_classification`` and return that score unchanged."""
    return score_classification(model)

In [91]:
# Evaluate the final model on the held-out season (year_test) and keep the score
final_model_score = evaluate_final_model(final_model)

# Report the score returned by score_classification for the held-out season
print(f"Final Model Score: {final_model_score}")

Final Model Score: 0.6666666666666666


## Run Prediction

In [92]:
def score_classification_with_predictions(model):
    """Rank the rows of every test-season round by predicted probability.

    Reads the module-level ``df`` (binary ``podium`` target), ``data``
    (original ``podium`` values), ``info`` (``season`` / ``round`` /
    ``circuit_id`` lookup), the already fitted ``scaler`` and ``year_test``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two columns.

    Returns
    -------
    pandas.DataFrame
        One row per test-season row of ``df``, grouped by round (in order of
        first appearance in ``df``) and sorted by descending ``proba_1``
        inside each round. Columns: ``season``, ``round``, ``circuit_id``,
        ``driver``, ``grid``, ``podium`` (value from ``data``), ``actual``
        (binary target from ``df``), ``predicted`` (1-based rank within the
        round), ``proba_0``, ``proba_1``. Empty (columns only) when the test
        season has no rows.
    """
    output_columns = ['season', 'round', 'circuit_id', 'driver', 'grid',
                      'podium', 'actual', 'predicted', 'proba_0', 'proba_1']

    season_rows = df[df.season == year_test]
    predictions_df_list = []  # one ranked DataFrame per round

    for circuit in season_rows['round'].unique():
        podium_pos = data[(data.season == year_test) & (data['round'] == circuit)]
        test = season_rows[season_rows['round'] == circuit]
        X_test = test.drop(['driver', 'podium'], axis=1)

        # Scale with the scaler fitted on the training data.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = test.podium.reset_index(drop=True)
        prediction_df['driver'] = test.driver.reset_index(drop=True)
        prediction_df['grid'] = test.grid.reset_index(drop=True)
        prediction_df['podium'] = podium_pos.podium.reset_index(drop=True)
        # Carry the keys with every row so the circuit lookup below is a real
        # join instead of relying on `info` having the same row order/count.
        prediction_df['season'] = test.season.reset_index(drop=True)
        prediction_df['round'] = test['round'].reset_index(drop=True)

        prediction_df = (
            prediction_df
            .sort_values('proba_1', ascending=False)
            .reset_index(drop=True)
        )
        # 1-based rank inside the round (1 = highest proba_1).
        prediction_df['predicted'] = np.arange(1, len(prediction_df) + 1)
        predictions_df_list.append(prediction_df)

    if not predictions_df_list:
        # No rows for the test season: return an empty frame with the usual
        # columns instead of letting pd.concat fail on an empty list.
        return pd.DataFrame(columns=output_columns)

    # One circuit_id per (season, round); duplicates in `info` are collapsed so
    # the left join cannot multiply prediction rows.
    circuits = (
        info.loc[info['season'] == year_test, ['season', 'round', 'circuit_id']]
        .drop_duplicates(subset=['season', 'round'])
    )

    predictions_df_final = (
        pd.concat(predictions_df_list, ignore_index=True)
        .merge(circuits, on=['season', 'round'], how='left')
    )
    return predictions_df_final[output_columns]

In [93]:
# Build the per-round ranked predictions DataFrame for the held-out season
# (the function returns only the DataFrame, not a score).
final_predictions_df = score_classification_with_predictions(final_model)

In [94]:
# Score on the whole grid: share of rows where the predicted rank equals the
# `podium` value stored in final_predictions_df.
grid_score = pd.DataFrame({
    'actual': final_predictions_df['podium'],
    'predicted': final_predictions_df['predicted'],
})
grid_score['actual_y'] = 1
# 1 where the predicted rank matches the recorded value, otherwise 0.
grid_score['pred_y'] = (grid_score['actual'] == grid_score['predicted']).astype(int)

b = grid_score['pred_y']
score = int((b == 1).sum())  # number of exact matches
score / len(b)
    

0.09166666666666666

In [95]:
# Display the comparison table: `actual` (podium value), `predicted` (rank),
# and `pred_y` = 1 where the two are equal.
grid_score

Unnamed: 0,actual,predicted,actual_y,pred_y
0,1,1,1,1
1,4,2,1,0
2,5,3,1,0
3,2,4,1,0
4,3,5,1,0
5,7,6,1,0
6,13,7,1,0
7,9,8,1,0
8,17,9,1,0
9,8,10,1,0


In [96]:
# Ranked predictions for round 6 only, without the binary `actual` column.
# NOTE(review): `podium` is 0 for every row shown in the output — presumably
# no result was recorded for this round yet; confirm, since the model score
# counts such a round as a miss.
final_predictions_df[final_predictions_df['round'] == 6].drop(columns=['actual'])

Unnamed: 0,season,round,circuit_id,driver,grid,podium,predicted,proba_0,proba_1
100,2024,6,miami,max_verstappen,1,0,1,0.314896,0.6851039
101,2024,6,miami,perez,3,0,2,0.857386,0.1426145
102,2024,6,miami,leclerc,2,0,3,0.886209,0.1137908
103,2024,6,miami,ricciardo,4,0,4,0.956562,0.04343784
104,2024,6,miami,sainz,5,0,5,0.975867,0.02413328
105,2024,6,miami,hamilton,12,0,6,0.992807,0.007192954
106,2024,6,miami,russell,11,0,7,0.997947,0.002052942
107,2024,6,miami,ocon,13,0,8,0.998713,0.001286666
108,2024,6,miami,alonso,8,0,9,0.999817,0.0001832365
109,2024,6,miami,piastri,6,0,10,0.999865,0.0001350777


In [97]:
# Rows whose recorded `podium` value is 1, with the rank the model gave them.
# Disabled variant that also includes the model's top pick per round:
# final_predictions_df[(final_predictions_df['podium'] == 1) | (final_predictions_df['predicted'] == 1)]
final_predictions_df[(final_predictions_df['podium'] == 1)]


Unnamed: 0,season,round,circuit_id,driver,grid,podium,actual,predicted,proba_0,proba_1
0,2024,1,bahrain,max_verstappen,1,1,1,1,0.429029,0.570971
20,2024,2,jeddah,max_verstappen,1,1,1,1,0.326217,0.673783
41,2024,3,albert_park,sainz,2,1,1,2,0.947385,0.052615
60,2024,4,suzuka,max_verstappen,1,1,1,1,0.439853,0.560147
80,2024,5,shanghai,max_verstappen,1,1,1,1,0.360707,0.639293


In [98]:
# Rows where either the recorded `podium` value is 1 or the model ranked the
# row first (`predicted` == 1) — shows hits and misses side by side.
final_predictions_df[(final_predictions_df['podium'] == 1) | (final_predictions_df['predicted'] == 1)]


Unnamed: 0,season,round,circuit_id,driver,grid,podium,actual,predicted,proba_0,proba_1
0,2024,1,bahrain,max_verstappen,1,1,1,1,0.429029,0.570971
20,2024,2,jeddah,max_verstappen,1,1,1,1,0.326217,0.673783
40,2024,3,albert_park,max_verstappen,1,19,0,1,0.436427,0.563573
41,2024,3,albert_park,sainz,2,1,1,2,0.947385,0.052615
60,2024,4,suzuka,max_verstappen,1,1,1,1,0.439853,0.560147
80,2024,5,shanghai,max_verstappen,1,1,1,1,0.360707,0.639293
100,2024,6,miami,max_verstappen,1,0,0,1,0.314896,0.685104


In [99]:
# Full ranked predictions for the held-out season, without `actual` and `grid`.
final_predictions_df.drop(columns=['actual', 'grid'])

Unnamed: 0,season,round,circuit_id,driver,podium,predicted,proba_0,proba_1
0,2024,1,bahrain,max_verstappen,1,1,0.429029,0.5709706
1,2024,1,bahrain,leclerc,4,2,0.891343,0.1086571
2,2024,1,bahrain,russell,5,3,0.932965,0.06703452
3,2024,1,bahrain,perez,2,4,0.953107,0.046893
4,2024,1,bahrain,sainz,3,5,0.964458,0.0355424
5,2024,1,bahrain,hamilton,7,6,0.966864,0.03313598
6,2024,1,bahrain,ricciardo,13,7,0.998756,0.001244001
7,2024,1,bahrain,alonso,9,8,0.999406,0.0005936418
8,2024,1,bahrain,ocon,17,9,0.999795,0.0002054157
9,2024,1,bahrain,piastri,8,10,0.999928,7.169454e-05
