In [1]:
import sqlite3

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Lift pandas' display truncation so frames render in full: every column,
# every row, and untruncated cell text.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Season held out for evaluation: training uses the seasons before it and
# the scoring functions evaluate on exactly this season.
year_test = 2023

## Connect to Database
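Load the feature and info tables from SQLite and create the training data (the test data for `year_test` is built per round inside the scoring functions).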

In [2]:
# Read both tables inside try/finally so the SQLite handle is released even
# when one of the queries raises (for example on a missing table).
connection = sqlite3.connect('races.db')
try:
    # Feature table. This notebook reads `season`, `round`, `driver`,
    # `podium` and `grid` by name; every column except `driver` and `podium`
    # is fed to the model.
    table_name = 'i_df_dum_table'
    query = f'SELECT * FROM "{table_name}"'
    data = pd.read_sql_query(query, connection)

    # Info table, used later for its `season`, `round` and `circuit_id` columns.
    table_name = 'h_final_cleaned_table'
    query = f'SELECT * FROM "{table_name}"'
    info = pd.read_sql_query(query, connection)
finally:
    connection.close()

data = data.reset_index(drop=True)

# `data` keeps the raw `podium` values; `df` carries the binary target
# (1 where podium == 1, otherwise 0) that the classifier is trained on.
df = data.copy()
df['podium'] = df['podium'].eq(1).astype('int64')

# Training rows are every season strictly before the held-out one.
train = df[df.season < year_test]
X_train = train.drop(['driver', 'podium'], axis=1)
y_train = train.podium

# The scaler is fitted on training rows only and reused as-is by the scoring
# functions. The original row index is kept so X_train and y_train stay
# index-aligned, not just position-aligned.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

## Use best Parameters found
Fit the final model and calculate the model score.

In [3]:
# scoring function for classification

def score_classification(model):
    """Return the share of `year_test` rounds whose top-ranked row is a true positive.

    For every round of the `year_test` season in `df`, the rows are scaled
    with the already-fitted global `scaler` and ranked by the model's
    positive-class probability. Only the highest-ranked row is marked as the
    predicted positive, so the per-round `precision_score` is 1.0 when that
    row's binary `podium` label is 1 and 0.0 otherwise. The per-round values
    are averaged over all rounds.

    Parameters
    ----------
    model : fitted binary classifier exposing `predict_proba`
        Its two output columns are treated as `proba_0` and `proba_1`.

    Returns
    -------
    float
        Mean per-round precision.

    Raises
    ------
    ValueError
        If `df` contains no rows for `year_test`, since there is nothing to
        average over.
    """
    # Filter the season once; it does not change between rounds.
    season_rows = df[df.season == year_test]
    rounds = season_rows['round'].unique()
    if len(rounds) == 0:
        raise ValueError(f"no rows for season {year_test} in df; nothing to score")

    total_precision = 0
    for round_number in rounds:
        test = season_rows[season_rows['round'] == round_number]
        X_test = test.drop(['driver', 'podium'], axis=1)
        y_test = test.podium

        # Scale with the scaler fitted on the training data.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Rank the round's rows by positive-class probability, best first.
        ranked = (
            pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
            .assign(actual=y_test.reset_index(drop=True))
            .sort_values('proba_1', ascending=False)
            .reset_index(drop=True)
        )
        # Only the top-ranked row (position 0 after sorting) is predicted positive.
        ranked['predicted'] = (ranked.index == 0).astype(int)

        total_precision += precision_score(ranked.actual, ranked.predicted)

    return total_precision / len(rounds)

In [7]:
# Final-model configuration trained with SGD: four hidden layers, linear
# ('identity') activation and a large `alpha` regularisation term, fitted on
# the full training set.
# NOTE(review): this cell's execution counter (In [7]) is later than the next
# cell's (In [4]) and both cells assign `final_model`. On a top-to-bottom run
# the next cell's model replaces this one -- confirm which configuration
# produced the reported score and keep only that one.
sgd_params = dict(
    hidden_layer_sizes=(75, 25, 50, 10),
    activation='identity',
    solver='sgd',
    alpha=5.455594781168514,
    random_state=1,
)
final_model = MLPClassifier(**sgd_params)
final_model.fit(X_train, y_train)

In [4]:
# Final-model configuration trained with L-BFGS: four hidden layers, linear
# ('identity') activation and a small `alpha` regularisation term, fitted on
# the full training set.
# NOTE(review): this cell was executed as In [4], i.e. before the preceding
# cell (In [7]), yet it sits below it and also assigns `final_model`. On a
# top-to-bottom run this is the model that gets evaluated -- confirm that is
# the intended configuration.
lbfgs_params = dict(
    hidden_layer_sizes=(80, 20, 40, 5),
    activation='identity',
    solver='lbfgs',
    alpha=0.007847599703514606,
    random_state=1,
)
final_model = MLPClassifier(**lbfgs_params)
final_model.fit(X_train, y_train)

In [8]:
# Define a function to evaluate the final model
def evaluate_final_model(model):
    """Score `model` on the held-out season.

    Thin wrapper: forwards `model` to `score_classification` and returns its
    result unchanged.
    """
    return score_classification(model)

In [9]:
# Score whichever `final_model` was fitted last on the held-out season
final_model_score = evaluate_final_model(final_model)

# Show the score in the cell output
print(f"Final Model Score: {final_model_score}")

Final Model Score: 0.9285714285714286


## Run Prediction

In [10]:
def score_classification_with_predictions(model):
    """Build the ranked per-round prediction table for the `year_test` season.

    Despite the name, no score is computed; only the predictions are
    returned. For each round of `year_test` in `df`, the rows are scaled with
    the already-fitted global `scaler` and ranked by the model's
    positive-class probability, rank 1 being the most likely.

    Parameters
    ----------
    model : fitted binary classifier exposing `predict_proba`
        Its two output columns are treated as `proba_0` and `proba_1`.

    Returns
    -------
    pandas.DataFrame
        One row per test row, rounds in order of first appearance in `df`
        and rows sorted by descending `proba_1` within a round, on a fresh
        RangeIndex. Columns: `season`, `round`, `circuit_id`, `driver`,
        `grid`, `podium` (raw value from `data`), `actual` (binary label
        from `df`), `predicted` (1-based rank), `proba_0`, `proba_1`.

    Raises
    ------
    ValueError
        Raised by `pd.concat` when `df` has no rows for `year_test`.
    """
    season_rows = df[df.season == year_test]

    # Circuit label keyed by round. A keyed lookup is used instead of gluing
    # `info`'s rows next to the predictions by position, which would silently
    # mislabel rows whenever `info` and `df` differ in row count or round order.
    season_info = info[info['season'] == year_test]
    circuit_by_round = dict(zip(season_info['round'], season_info['circuit_id']))

    per_round_frames = []
    for round_number in season_rows['round'].unique():
        test = season_rows[season_rows['round'] == round_number]
        # `data` still holds the un-binarised podium values for this round.
        podium_pos = data[(data.season == year_test) & (data['round'] == round_number)]
        X_test = test.drop(['driver', 'podium'], axis=1)

        # Scale with the scaler fitted on the training data.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = test.podium.reset_index(drop=True)
        prediction_df['driver'] = test.driver.reset_index(drop=True)
        prediction_df['grid'] = test.grid.reset_index(drop=True)
        prediction_df['podium'] = podium_pos.podium.reset_index(drop=True)

        # Best probability first; the 1-based position is the predicted rank.
        prediction_df = prediction_df.sort_values('proba_1', ascending=False).reset_index(drop=True)
        prediction_df['predicted'] = range(1, len(prediction_df) + 1)

        # Label the rows with the key they were selected by.
        prediction_df['season'] = year_test
        prediction_df['round'] = round_number
        prediction_df['circuit_id'] = circuit_by_round.get(round_number)

        per_round_frames.append(prediction_df)

    column_order = [
        'season', 'round', 'circuit_id',
        'driver', 'grid', 'podium', 'actual', 'predicted', 'proba_0', 'proba_1',
    ]
    return pd.concat(per_round_frames, ignore_index=True)[column_order]

In [11]:
# Build the ranked per-round predictions DataFrame for the final model
# (the function returns predictions only, no score)
final_predictions_df = score_classification_with_predictions(final_model)

In [12]:
# Exact-position accuracy over the whole grid: a row counts as a hit when
# its predicted rank equals the value in `podium`.
rank_matches = final_predictions_df['podium'] == final_predictions_df['predicted']
grid_score = pd.DataFrame({
    'actual': final_predictions_df['podium'],
    'predicted': final_predictions_df['predicted'],
    'actual_y': 1,
    'pred_y': np.where(rank_matches, 1, 0),
})

# Share of rows flagged as hits.
b = grid_score['pred_y']
score = int((b == 1).sum())
score / len(b)
    

0.17142857142857143

In [14]:
# Row-by-row view of the comparison table built in the previous cell
# (display.max_rows is set to None above, so every row is rendered)
grid_score

Unnamed: 0,actual,predicted,actual_y,pred_y
0,1,1,1,1
1,5,2,1,0
2,7,3,1,0
3,4,4,1,1
4,19,5,1,0
5,2,6,1,0
6,3,7,1,0
7,17,8,1,0
8,15,9,1,0
9,6,10,1,0


In [13]:
# Ranked predictions for round 14 only; the binary `actual` column is hidden
final_predictions_df[final_predictions_df['round'] == 14].drop(columns=['actual'])

Unnamed: 0,season,round,circuit_id,driver,grid,podium,predicted,proba_0,proba_1
260,2023,14,monza,max_verstappen,2.0,1,1,0.248032,0.751968
261,2023,14,monza,perez,5.0,2,2,0.804393,0.195607
262,2023,14,monza,russell,4.0,5,3,0.930717,0.069283
263,2023,14,monza,hamilton,8.0,6,4,0.93181,0.06819
264,2023,14,monza,sainz,1.0,3,5,0.936672,0.063328
265,2023,14,monza,leclerc,3.0,4,6,0.952348,0.047652
266,2023,14,monza,norris,9.0,8,7,0.96257,0.03743
267,2023,14,monza,alonso,10.0,9,8,0.966392,0.033608
268,2023,14,monza,piastri,7.0,12,9,0.974673,0.025327
269,2023,14,monza,albon,6.0,7,10,0.976813,0.023187


In [None]:
# Full prediction table without the `actual` and `grid` columns
final_predictions_df.drop(columns=['actual', 'grid'])

In [None]:
# Rows whose raw `podium` value is 1, together with the rank the model gave
# them (the broader podium-or-predicted filter is in the following cell)
final_predictions_df[(final_predictions_df['podium'] == 1)]


In [None]:
# Rows where either the raw `podium` value or the predicted rank is 1
final_predictions_df[(final_predictions_df['podium'] == 1) | (final_predictions_df['predicted'] == 1)]
