In [12]:
import sqlite3

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Lift pandas' display limits so frames render in full (every column, every
# row, untruncated cell contents).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

# Season whose rounds are selected for scoring and prediction further down.
year_test = 2024

## Connect to Database
Load the tables from the SQLite database and create the train and test data.

In [13]:
# Load both tables from the SQLite database. The try/finally guarantees the
# connection is closed even when one of the queries raises.
connection = sqlite3.connect('races.db')
try:
    # Modelling table: read below as feature columns plus 'driver' and 'podium'.
    table_name = 'i_df_dum_table'
    query = f'SELECT * FROM "{table_name}"'
    data = pd.read_sql_query(query, connection)

    # Companion table; later cells read season / round / circuit_id from it.
    table_name = 'h_final_cleaned_table'
    query = f'SELECT * FROM "{table_name}"'
    info = pd.read_sql_query(query, connection)
finally:
    connection.close()

data = data.reset_index(drop=True)

# Binary target: 1 where podium == 1, 0 for every other value (missing included).
# `data` keeps the raw podium values; `df` carries the binarised copy.
df = data.copy()
df['podium'] = (df['podium'] == 1).astype(int)

# Train on everything except the last 20 rows, which are held out for prediction.
# NOTE(review): 20 is presumably the number of entries in one race - confirm.
train = df.iloc[:-20]
drivers_X = train['driver']
X_train = train.drop(['driver', 'podium'], axis=1)
y_train = train['podium']

# Fit the scaler on the training rows only; the same fitted scaler is reused
# to transform the held-out rows when scoring and predicting.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

In [14]:
# Prediction set: just one race, taken as the final 20 rows of each frame.
# NOTE(review): assumes `info`, `data` and `df` are row-aligned - confirm.
pred_info, pred_data, pred_df = (frame.iloc[-20:] for frame in (info, data, df))

## Use the Best Parameters Found
Fit the final model and calculate the model score.

In [15]:
# scoring function for classification

def score_classification(model):
    """Return the mean per-round precision of the model's top-ranked pick.

    For every round of season ``year_test`` present in ``pred_df``, the rows
    are scaled with the already-fitted ``scaler``, ranked by the predicted
    probability of the positive class, and only the highest-ranked row is
    labelled positive. The precision of that labelling against the true
    ``podium`` target is averaged over all rounds.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two classes.

    Returns
    -------
    float
        Average precision across the scored rounds.

    Raises
    ------
    ValueError
        If ``pred_df`` holds no rows for ``year_test`` (nothing to average).
    """
    # Filter the season once and reuse it, instead of re-filtering per round.
    season_rows = pred_df[pred_df.season == year_test]
    rounds = season_rows['round'].unique()
    if len(rounds) == 0:
        raise ValueError(f'pred_df has no rows for season {year_test}; nothing to score')

    score = 0
    for round_number in rounds:
        test = season_rows[season_rows['round'] == round_number]
        X_test = test.drop(['driver', 'podium'], axis=1)
        y_test = test.podium

        # scaling (transform only: the scaler was fitted on the training rows)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # make predictions and rank the rows by positive-class probability
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop=True)
        prediction_df = prediction_df.sort_values('proba_1', ascending=False).reset_index(drop=True)

        # Only the top-ranked row (index 0 after sorting) is predicted positive.
        prediction_df['predicted'] = (prediction_df.index == 0).astype(int)

        score += precision_score(prediction_df.actual, prediction_df.predicted)

    return score / len(rounds)

In [16]:
# Fit the final classifier on the full (scaled) training set using the
# hyper-parameters selected earlier. 'identity' is scikit-learn's no-op
# activation (f(x) = x); random_state pins the run for reproducibility.
final_model = MLPClassifier(
    solver='lbfgs',
    activation='identity',
    hidden_layer_sizes=(75, 25, 50, 10),
    alpha=0.01623776739188721,
    random_state=1,
)
final_model.fit(X_train, y_train)

In [17]:
# Thin wrapper used to score the final model
def evaluate_final_model(model):
    """Return the score that ``score_classification`` computes for ``model``."""
    return score_classification(model)

In [18]:
# Score the fitted final model on the held-out prediction rows.
final_model_score = evaluate_final_model(final_model)

# Report the score.
# NOTE(review): the recorded output is 0.0, and the grid_score table shown
# further down lists podium == 0 for every held-out row - this may reflect
# missing race results rather than model quality; confirm.
print(f"Final Model Score: {final_model_score}")

Final Model Score: 0.0


## Run Prediction

In [19]:
def score_classification_with_predictions(model):
    """Return the ranked per-round predictions for season ``year_test``.

    Despite its name, this function computes no score: it returns one row per
    held-out entry, ranked within each round by the predicted probability of
    the positive class.

    For each round found in ``pred_df`` the rows are scaled with the fitted
    ``scaler``, passed through ``model.predict_proba`` and sorted by
    ``proba_1`` in descending order; ``predicted`` is the resulting 1-based
    rank. Season, round and circuit_id are attached per round, so the labels
    cannot drift out of step with the sorted prediction rows.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` with two classes.

    Returns
    -------
    pandas.DataFrame
        Columns: season, round, circuit_id, driver, grid, podium, actual,
        predicted, proba_0, proba_1. ``podium`` is the raw value from
        ``pred_data``; ``actual`` is the binarised target from ``pred_df``.
        Empty (same columns) when ``pred_df`` has no rows for ``year_test``.
    """
    output_columns = ['season', 'round', 'circuit_id',
                      'driver', 'grid', 'podium', 'actual', 'predicted', 'proba_0', 'proba_1']

    # Filter each frame to the season once instead of once per round.
    season_rows = pred_df[pred_df.season == year_test]
    season_raw = pred_data[pred_data.season == year_test]
    season_info = pred_info[pred_info['season'] == year_test]

    predictions_df_list = []  # To store predictions for each round

    for round_number in season_rows['round'].unique():
        test = season_rows[season_rows['round'] == round_number]
        podium_pos = season_raw[season_raw['round'] == round_number]
        round_info = season_info[season_info['round'] == round_number]

        # Scaling (transform only: the scaler was fitted on the training rows)
        X_test = test.drop(['driver', 'podium'], axis=1)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Make predictions; the companion columns are aligned positionally
        # before sorting, so each value stays with its own row.
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = test.podium.reset_index(drop=True)
        prediction_df['driver'] = test.driver.reset_index(drop=True)
        prediction_df['grid'] = test.grid.reset_index(drop=True)
        prediction_df['podium'] = podium_pos.podium.reset_index(drop=True)

        # Rank by positive-class probability; rank 1 is the most likely.
        prediction_df = prediction_df.sort_values('proba_1', ascending=False).reset_index(drop=True)
        prediction_df['predicted'] = prediction_df.index + 1

        # Label the rows with this round's metadata (constant within a round).
        prediction_df['season'] = year_test
        prediction_df['round'] = round_number
        prediction_df['circuit_id'] = round_info['circuit_id'].iloc[0] if len(round_info) else None

        predictions_df_list.append(prediction_df)

    if not predictions_df_list:
        # Nothing to predict for this season: return an empty, well-formed frame.
        return pd.DataFrame(columns=output_columns)

    # Concatenate all per-round frames into one with a fresh 0..n-1 index.
    return pd.concat(predictions_df_list, ignore_index=True)[output_columns]

In [20]:
# Build the ranked predictions for the final model, then discard the
# binarised 'actual' column (the raw 'podium' column is kept).
final_predictions_df = score_classification_with_predictions(final_model).drop(columns=['actual'])

In [21]:
# score on whole grid
# One row per held-out entry: 'actual' is the recorded podium value,
# 'predicted' the model's rank, and 'pred_y' flags rows where the two match
# exactly. 'actual_y' is a constant column of ones.
grid_score = pd.DataFrame({
    'actual': final_predictions_df['podium'],
    'predicted': final_predictions_df['predicted'],
    'actual_y': 1,
    'pred_y': (final_predictions_df['podium'] == final_predictions_df['predicted']).astype(int),
})

# Share of rows whose predicted rank equals the recorded value (vectorized
# mean instead of a manual counting loop). float() keeps the displayed
# output a plain Python number.
float(grid_score['pred_y'].mean())

0.0

In [22]:
# Presentation copy of the predictions: drop the 'podium' column and round
# the positive-class probability to three decimals.
s = final_predictions_df.drop(columns=['podium'])
s = s.assign(proba_1=s['proba_1'].round(3))
s

Unnamed: 0,season,round,circuit_id,driver,grid,predicted,proba_0,proba_1
0,2024,15,zandvoort,max_verstappen,2,1,0.390803,0.609
1,2024,15,zandvoort,norris,1,2,0.816037,0.184
2,2024,15,zandvoort,russell,4,3,0.948228,0.052
3,2024,15,zandvoort,piastri,3,4,0.968464,0.032
4,2024,15,zandvoort,perez,5,5,0.969151,0.031
5,2024,15,zandvoort,leclerc,6,6,0.985201,0.015
6,2024,15,zandvoort,hamilton,12,7,0.995751,0.004
7,2024,15,zandvoort,sainz,11,8,0.996637,0.003
8,2024,15,zandvoort,albon,8,9,0.998099,0.002
9,2024,15,zandvoort,gasly,10,10,0.999336,0.001


In [12]:
# Display the per-row comparison table behind the whole-grid score.
grid_score

Unnamed: 0,actual,predicted,actual_y,pred_y
0,0,1,1,0
1,0,2,1,0
2,0,3,1,0
3,0,4,1,0
4,0,5,1,0
5,0,6,1,0
6,0,7,1,0
7,0,8,1,0
8,0,9,1,0
9,0,10,1,0


In [13]:
# Show only the prediction rows whose circuit_id is 'shanghai'.
# NOTE(review): the recorded output is empty; the predictions displayed
# earlier list only 'zandvoort' rows - confirm this filter is still wanted.
final_predictions_df.loc[final_predictions_df['circuit_id'].eq('shanghai')]

Unnamed: 0,season,round,circuit_id,driver,grid,podium,predicted,proba_0,proba_1
