# Is podium


## Dependencies


The dependencies used are as follows


In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

from xgboost import XGBClassifier, XGBRegressor, XGBRanker

import sys

sys.path.append("..")

from utils.custom_cvs import VariableTimeSeriesSplit
from utils.custom_scorers import balanced_accuracy_score, balanced_accuracy_ranker

import textwrap
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.simplefilter("ignore")

## Initialization


We continue with the tuning of the model that predicts the podium


In [2]:
df = pd.read_csv("../assets/data/processed/other_models.csv")

mid_rc = df.groupby("raceYear")["raceRound"].max().to_numpy() // 2
get_half = lambda x: f'{x["raceYear"]}{x["raceRound"] <= mid_rc[x["raceYear"] - 2006]}'
instances_per_half = df.apply(get_half, axis=1).value_counts(sort=False).to_numpy()

n_splits = len(instances_per_half) - 10
max_train_size = [instances_per_half[i : 10 + i].sum() for i in range(n_splits)]
test_size = instances_per_half[10:].tolist()
tscv = VariableTimeSeriesSplit(
    n_splits=n_splits, max_train_size=max_train_size, test_size=test_size
)

podiums = df[df["positionFinal"].isin([1, 2, 3])][
    ["raceYear", "raceRound", "driverRef"]
]
podiums = podiums.groupby(by=["raceYear", "raceRound"]).agg({"driverRef": ",".join})

X = pd.read_csv("../assets/data/processed/other_models_X.csv")
y = df.merge(podiums, how="left", on=["raceYear", "raceRound"], suffixes=("", "Podium"))
y = y.apply(lambda x: int(x["driverRef"] in x["driverRefPodium"]), axis=1)

## XGBClassifier


In [3]:
grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBClassifier(objective="binary:logistic"),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
    n_iter=30,
).fit(X, y)
output = f"XGBClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBClassifier: 0.7874969518034617 with {'subsample': 0.75, 'reg_lambda': 10,
	'reg_alpha': 3, 'n_estimators': 75, 'min_child_weight': 5, 'max_depth': 3,
	'learning_rate': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.75}


## XGBRegressor


In [4]:
grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBRegressor(objective="reg:linear"),
    grid,
    scoring=make_scorer(balanced_accuracy_score),
    cv=tscv,
    n_jobs=-1,
    n_iter=50,
).fit(X, y)
output = f"XGBRegressor: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBRegressor: 0.7959400063566847 with {'subsample': 0.75, 'reg_lambda': 3, 'reg_alpha':
	0, 'n_estimators': 150, 'min_child_weight': 200, 'max_depth': 10, 'learning_rate': 0.1,
	'gamma': 0.75, 'colsample_bytree': 0.9}


## XGBRanker


In [5]:
grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBRanker(objective="rank:pairwise"),
    grid,
    scoring=balanced_accuracy_ranker,
    cv=tscv,
    n_jobs=-1,
    n_iter=50,
).fit(X, y)
output = f"XGBRanker: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBRanker: 0.8133024369850297 with {'subsample': 0.9, 'reg_lambda': 3, 'reg_alpha': 10,
	'n_estimators': 75, 'min_child_weight': 200, 'max_depth': 5, 'learning_rate': 0.01,
	'gamma': 0, 'colsample_bytree': 0.75}


## Results


After viewing several runs, the hyperparameters for each algorithm are as follows

- XGBClassifier: 0.7874969518034617 with {'subsample': 0.75, 'reg_lambda': 10, 'reg_alpha': 3, 'n_estimators': 75, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.75}
- XGBRegressor: 0.7959400063566847 with {'subsample': 0.75, 'reg_lambda': 3, 'reg_alpha': 0, 'n_estimators': 150, 'min_child_weight': 200, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.75, 'colsample_bytree': 0.9}
- XGBRanker: 0.8133024369850297 with {'subsample': 0.9, 'reg_lambda': 3, 'reg_alpha': 10, 'n_estimators': 75, 'min_child_weight': 200, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.75}
