# Is podium


## Dependencies


The dependencies used are as follows:


In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier, XGBRegressor, XGBRanker

import sys

# Add the parent directory to the module search path before the project import
# below (presumably so the project-level `utils` package resolves — confirm).
sys.path.append("..")

from utils.visualization import model_prediction

import pandas as pd
import numpy as np

# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the estimators used below.
warnings.simplefilter("ignore")

## Initialization


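We continue with the model that predicts whether each driver finishes on the podium (top three). The training data covers races from 2019 onwards, and a separate testing file is used for evaluation.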


In [2]:
# Training results, restricted to races from 2019 onwards.
df_train = pd.read_csv("../assets/data/processed/final_model.csv")
df_train = df_train[df_train["raceYear"] >= 2019]

# Per race (year, round): comma-joined driverRefs of the drivers that finished
# in positions 1-3.
podiums = df_train[df_train["positionFinal"].isin([1, 2, 3])][
    ["raceYear", "raceRound", "driverRef"]
]
podiums = podiums.groupby(by=["raceYear", "raceRound"]).agg({"driverRef": ",".join})

# Feature matrix: keep the trailing rows so it has as many rows as df_train.
# NOTE(review): assumes the >= 2019 rows are the last rows of
# final_model_X.csv — confirm.
# .copy() detaches the slice from the frame it was cut from, so columns can be
# added to X_train later without writing into a view.
X_train = pd.read_csv("../assets/data/processed/final_model_X.csv")
X_train = X_train.iloc[-len(df_train) :].copy()

# Binary target: True when the row's driver is one of that race's podium
# finishers.
y_train = df_train.merge(
    podiums, how="left", on=["raceYear", "raceRound"], suffixes=("", "Podium")
)
# Compare against the split list, not the joined string: a plain `in` on the
# string is a substring test and would also match a driverRef that is merely
# contained in another one. The isinstance guard maps races without any podium
# row (NaN after the left merge) to False instead of raising TypeError.
y_train = y_train.apply(
    lambda x: isinstance(x["driverRefPodium"], str)
    and x["driverRef"] in x["driverRefPodium"].split(","),
    axis=1,
)

In [3]:
# Held-out results used to evaluate every model below.
df_test = pd.read_csv("../assets/data/processed/testing.csv")

# Identifying columns handed to model_prediction through its `columns` argument.
columns = {"raceRound": df_test["raceRound"], "driverRef": df_test["driverRef"]}

# Per race (year, round): comma-joined driverRefs of the drivers that finished
# in positions 1-3.
podiums = df_test[df_test["positionFinal"].isin([1, 2, 3])][
    ["raceYear", "raceRound", "driverRef"]
]
podiums = podiums.groupby(by=["raceYear", "raceRound"]).agg({"driverRef": ",".join})

X_test = pd.read_csv("../assets/data/processed/testing_X.csv")

# Binary target: True when the row's driver is one of that race's podium
# finishers.
y_test = df_test.merge(
    podiums, how="left", on=["raceYear", "raceRound"], suffixes=("", "Podium")
)
# Compare against the split list, not the joined string: a plain `in` on the
# string is a substring test and would also match a driverRef that is merely
# contained in another one. The isinstance guard maps races without any podium
# row (NaN after the left merge) to False instead of raising TypeError.
y_test = y_test.apply(
    lambda x: isinstance(x["driverRefPodium"], str)
    and x["driverRef"] in x["driverRefPodium"].split(","),
    axis=1,
)

## K-Nearest Neighbors


In [4]:
# k-nearest-neighbours classifier: 16 neighbours, cosine metric, votes weighted
# by distance.
knn = KNeighborsClassifier(weights="distance", metric="cosine", n_neighbors=16)

# Feature subset used by this model, built from two groups (order preserved).
knn_context_features = [
    "carNumber",
    "positionGrid",
    "driverNationality",
    "raceRound",
    "circuitLocation",
    "driverWins",
    "constructorWins",
    "weatherCold",
    "weatherWet",
    "circuitType",
    "circuitDirection",
]
knn_driver_features = [
    "driverPos7",
    "driverPos9",
    "driverPos10",
    "driverPos11",
    "driverPos12",
    "driverPos13",
    "driverPos14",
    "driverPos15",
    "driverPos16",
    "driverPos17",
    "driverRac",
]
attributes = knn_context_features + knn_driver_features

knn_train_features = X_train[attributes]
knn_test_features = X_test[attributes]

# Last expression of the cell, so the returned table is displayed.
model_prediction(
    knn,
    knn_train_features,
    y_train,
    knn_test_features,
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,1
2,1,sainz,True,0
3,1,leclerc,False,0
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,1
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## Decision Tree


In [5]:
# Depth-1 tree (a single split), chosen with the entropy criterion.
tree = DecisionTreeClassifier(splitter="best", criterion="entropy", max_depth=1)

# Feature subset used by this model.
attributes = ["positionGrid", "driverNationality", "grandPrix", "driverWins"]
attributes += ["circuitLaps", "driverAgeStarted"]
attributes += ["driverPos2", "driverPos4", "driverPos9"]

tree_train_features = X_train[attributes]
tree_test_features = X_test[attributes]

# Last expression of the cell, so the returned table is displayed.
model_prediction(
    tree,
    tree_train_features,
    y_train,
    tree_test_features,
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,1
2,1,sainz,True,1
3,1,leclerc,False,1
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## Random Forest


In [6]:
# Feature subset used by the random forest.
attributes = [
    "positionGrid",
    "driverNumber",
    "constructorRef",
    "raceYear",
    "driverWins",
    "constructorWins",
    "weatherDry",
    "driverAgeStarted",
    "driverPodiums",
    "driverPos2",
    "driverPos9",
    "driverPos10",
    "driverPos11",
    "driverPos12",
    "driverPos17",
    "driverExp",
    "driverRac",
]
# A random forest draws bootstrap samples and random feature subsets; without a
# fixed random_state every run of this cell trains a different forest and the
# table below cannot be reproduced. Pin the seed so Restart & Run All is stable.
random_forest = RandomForestClassifier(
    max_depth=8, n_estimators=436, criterion="gini", random_state=42
)
# Last expression of the cell, so the returned table is displayed.
model_prediction(
    random_forest,
    X_train[attributes],
    y_train,
    X_test[attributes],
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,0
2,1,sainz,True,0
3,1,leclerc,False,1
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## Multilayer Perceptron


In [7]:
# Feature subset used by the multilayer perceptron.
attributes = [
    "positionGrid",
    "driverRef",
    "driverNationality",
    "constructorNationality",
    "grandPrix",
    "circuitRef",
    "circuitLocation",
    "circuitLat",
    "constructorWins",
    "q1",
    "q2",
    "q3",
    "driverAgeToday",
    "weatherDry",
    "weatherWet",
    "circuitLength",
    "qMax",
    "qAvg",
    "driverPos6",
    "driverPos7",
    "driverPos10",
    "driverPos11",
    "driverPos15",
    "driverPos16",
    "driverExp",
    "driverRac",
    "driverPac",
]
# Five hidden layers with logistic activations. Weight initialisation and
# mini-batch sampling are random, so the seed is pinned; otherwise every run of
# this cell trains a different network and the table below is not reproducible.
nn = MLPClassifier(
    activation="logistic",
    hidden_layer_sizes=(33, 31, 41, 35, 38),
    random_state=42,
)
# Last expression of the cell, so the returned table is displayed.
model_prediction(
    nn,
    X_train[attributes],
    y_train,
    X_test[attributes],
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,0
2,1,sainz,True,0
3,1,leclerc,False,0
4,1,russell,False,0
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## XGBClassifier


In [8]:
# Hyper-parameters of the gradient-boosted classifier, gathered in one mapping.
xgb_classifier_params = {
    "objective": "binary:logistic",
    "learning_rate": 0.18294957954812197,
    "n_estimators": 351,
    "max_depth": 8,
    "min_child_weight": 4,
    "gamma": 0.29050118255571056,
    "subsample": 0.5806111623793364,
    "colsample_bytree": 0.7819840716854337,
    "reg_alpha": 3,
    "reg_lambda": 88,
}
model = XGBClassifier(**xgb_classifier_params)

# Feature subset used by this model.
attributes = ["carNumber", "positionGrid", "driverNumber", "constructorRef"]
attributes += ["circuitLat", "circuitLng", "circuitAlt", "circuitDist"]
attributes += ["driverPodiums", "driverPos2", "driverPos15", "driverPos17"]
attributes += ["driverOvr"]

xgb_train_features = X_train[attributes]
xgb_test_features = X_test[attributes]

# Last expression of the cell, so the returned table is displayed.
model_prediction(
    model,
    xgb_train_features,
    y_train,
    xgb_test_features,
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,0
2,1,sainz,True,0
3,1,leclerc,False,1
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## XGBRegressor


In [9]:
# Feature subset used by the gradient-boosted regressor.
attributes = [
    "positionGrid",
    "constructorRef",
    "driverPos9",
    "driverOvr",
]
# Squared-error regression. "reg:squarederror" is the current name of the
# objective formerly spelled "reg:linear"; the old spelling is deprecated and
# its warning is hidden by the notebook-wide warnings filter.
model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.1295066190445028,
    n_estimators=382,
    max_depth=19,
    min_child_weight=233,
    gamma=0.649204254388373,
    subsample=0.6713490871313639,
    colsample_bytree=0.5178145572350884,
    reg_alpha=0,
    reg_lambda=36,
)
# Last expression of the cell, so the returned table is displayed.
model_prediction(
    model,
    X_train[attributes],
    y_train,
    X_test[attributes],
    y_test,
    columns=columns,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,1
2,1,sainz,True,1
3,1,leclerc,False,1
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## XGBRanker


In [10]:
# Highest round number of each season, ordered by year (groupby sorts its keys).
season_rounds = df_train.groupby("raceYear")["raceRound"].max()
# Running total of rounds before each season; the last entry is the grand total.
races_per_year = np.cumsum([0] + season_rounds.to_list())
# Offset to add to a round number, keyed by the season itself. Looking the
# season up directly (instead of indexing by `year - 2019`) keeps the ids
# correct if the first training year changes or a season is missing.
year_offset = dict(zip(season_rounds.index, races_per_year[:-1]))


def set_id(y, r):
    """Return the query id of round ``r`` in season ``y``.

    The id is the round number plus the rounds of all earlier seasons, so each
    race gets a distinct id that grows chronologically.
    """
    return r + year_offset[y]


# One query id per race; vectorised equivalent of calling set_id on every row.
X_train["qid"] = df_train["raceRound"] + df_train["raceYear"].map(year_offset)
# Test races continue the numbering after the last training race.
X_test["qid"] = df_test["raceRound"] + X_train["qid"].max()

# Feature subset used by the ranker; "qid" is included in the frame that is
# passed to model_prediction.
attributes = [
    "carNumber",
    "positionGrid",
    "driverNumber",
    "driverNationality",
    "constructorRef",
    "constructorNationality",
    "raceYear",
    "raceRound",
    "circuitLocation",
    "circuitLat",
    "circuitLng",
    "constructorWins",
    "q2",
    "q3",
    "driverAgeToday",
    "driverAgeAtRace",
    "weatherWarm",
    "weatherCold",
    "weatherDry",
    "circuitLaps",
    "qMax",
    "driverExpYear",
    "driverExpRace",
    "driverPodiums",
    "driverPos2",
    "driverPos3",
    "driverPos4",
    "driverPos6",
    "driverPos14",
    "driverPos15",
    "driverPos17",
    "qid",
]
model = XGBRanker(
    objective="rank:pairwise",
    learning_rate=0.1283629986012046,
    n_estimators=391,
    max_depth=4,
    min_child_weight=42,
    gamma=0.8067325416889871,
    subsample=0.805170010669889,
    colsample_bytree=0.749726439873576,
    reg_alpha=5,
    reg_lambda=70,
)
# Last expression of the cell, so the returned table is displayed.
model_prediction(
    model,
    X_train[attributes],
    y_train,
    X_test[attributes],
    y_test,
    columns=columns,
    rank=True,
    multiclass=False,
)

Unnamed: 0,raceRound,driverRef,positionFinal,positionPred
0,1,max_verstappen,True,1
1,1,perez,True,0
2,1,sainz,True,1
3,1,leclerc,False,0
4,1,russell,False,1
5,1,norris,False,0
6,1,hamilton,False,0
7,1,piastri,False,0
8,1,alonso,False,0
9,1,stroll,False,0


## Results


We can also observe that, in general, the percentage is met.
