# Is podium


## Dependencies


The dependencies used are as follows


In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

import sys

sys.path.append("..")

from utils.custom_cvs import VariableTimeSeriesSplit
from utils.custom_scorers import balanced_accuracy_score

import textwrap
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.simplefilter("ignore")

## Initialization


We continue with the tuning of the model that predicts the podium


In [2]:
df = pd.read_csv("../assets/data/processed/adding_data.csv")

mid_rc = df.groupby("raceYear")["raceRound"].max().to_numpy() // 2
get_half = lambda x: f'{x["raceYear"]}{x["raceRound"] <= mid_rc[x["raceYear"] - 2006]}'
instances_per_half = df.apply(get_half, axis=1).value_counts(sort=False).to_numpy()

n_splits = len(instances_per_half) - 10
max_train_size = [instances_per_half[i : 10 + i].sum() for i in range(n_splits)]
test_size = instances_per_half[10:].tolist()
tscv = VariableTimeSeriesSplit(
    n_splits=n_splits, max_train_size=max_train_size, test_size=test_size
)

podiums = df[df["positionFinal"].isin([1, 2, 3])][
    ["raceYear", "raceRound", "driverRef"]
]
podiums = podiums.groupby(by=["raceYear", "raceRound"]).agg({"driverRef": ",".join})

X = pd.read_csv("../assets/data/processed/adding_data_X.csv")
y = df.merge(podiums, how="left", on=["raceYear", "raceRound"], suffixes=("", "Podium"))
y = y.apply(lambda x: int(x["driverRef"] in x["driverRefPodium"]), axis=1)

## K-Nearest Neighbors


In [3]:
grid = dict(
    n_neighbors=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 21, 31, 51, 101],
    weights=["uniform", "distance", None],
    metric=["euclidean", "manhattan", "cosine"],
)
search = GridSearchCV(
    KNeighborsClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"KNeighborsClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

KNeighborsClassifier: 0.7380751972783101 with {'metric': 'cosine', 'n_neighbors': 6,
	'weights': 'distance'}


## Decision Tree


In [4]:
grid = dict(
    criterion=["gini", "entropy", "log_loss"],
    splitter=["best", "random"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
search = GridSearchCV(
    DecisionTreeClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"DecisionTreeClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

DecisionTreeClassifier: 0.783119412032717 with {'criterion': 'entropy', 'max_depth': 4,
	'splitter': 'best'}


## Random Forest


In [5]:
grid = dict(
    n_estimators=[10, 30, 50, 100, 200],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
search = GridSearchCV(
    RandomForestClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"RandomForestClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

RandomForestClassifier: 0.7665285291056751 with {'criterion': 'gini', 'max_depth': 10,
	'n_estimators': 200}


## Multilayer Perceptron


In [6]:
grid = dict(
    hidden_layer_sizes=[(100,), (50, 25), (50, 20, 5)],
    activation=["relu", "logistic"],
)
search = GridSearchCV(
    MLPClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"MLPClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

MLPClassifier: 0.7845751288896926 with {'activation': 'logistic', 'hidden_layer_sizes':
	(50, 20, 5)}


## Results


After viewing several runs, the hyperparameters for each algorithm are as follows

- KNeighborsClassifier: 0.7380751972783101 with {'metric': 'cosine', 'n_neighbors': 6, 'weights': 'distance'}
- DecisionTreeClassifier: 0.783119412032717 with {'criterion': 'entropy', 'max_depth': 4, 'splitter': 'best'}
- RandomForestClassifier: 0.7665285291056751 with {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 200}
- MLPClassifier: 0.7845751288896926 with {'activation': 'logistic', 'hidden_layer_sizes': (50, 20, 5)}
