# Final position in variable interval


## Dependencies


The dependencies used are as follows


In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

from xgboost import XGBClassifier, XGBRegressor, XGBRanker

import sys

sys.path.append("..")

from utils.custom_cvs import VariableTimeSeriesSplit
from utils.custom_scorers import (
    balanced_accuracy_1interval_score,
    balanced_accuracy_1interval_ranker,
)

import textwrap
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.simplefilter("ignore")

## Initialization


First we will do the tuning of the model that predicts the final position of each driver at a ±1 interval.


In [2]:
df = pd.read_csv("../assets/data/processed/final_model.csv")

mid_rc = df.groupby("raceYear")["raceRound"].max().to_numpy() // 2
get_half = lambda x: f'{x["raceYear"]}{x["raceRound"] <= mid_rc[x["raceYear"] - 2006]}'
instances_per_half = df.apply(get_half, axis=1).value_counts(sort=False).to_numpy()

n_splits = len(instances_per_half) - 10
max_train_size = [instances_per_half[i : 10 + i].sum() for i in range(n_splits)]
test_size = instances_per_half[10:].tolist()
tscv = VariableTimeSeriesSplit(
    n_splits=n_splits, max_train_size=max_train_size, test_size=test_size
)

X = pd.read_csv("../assets/data/processed/final_model_X.csv")
y = df["positionFinal"]

## K-Nearest Neighbors


In [3]:
grid = dict(
    n_neighbors=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 21, 31, 51, 101],
    weights=["uniform", "distance", None],
    metric=["euclidean", "manhattan", "cosine"],
)
search = GridSearchCV(
    KNeighborsClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"KNeighborsClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

KNeighborsClassifier: 0.3356910953253832 with {'metric': 'manhattan', 'n_neighbors': 15,
	'weights': 'distance'}


## Decision Tree


In [4]:
grid = dict(
    criterion=["gini", "entropy", "log_loss"],
    splitter=["best", "random"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
search = GridSearchCV(
    DecisionTreeClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"DecisionTreeClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

DecisionTreeClassifier: 0.3792287733670603 with {'criterion': 'gini', 'max_depth': 4,
	'splitter': 'best'}


## Random Forest


In [5]:
grid = dict(
    n_estimators=[10, 30, 50, 100, 200],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
search = GridSearchCV(
    RandomForestClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"RandomForestClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

RandomForestClassifier: 0.40066126402168717 with {'criterion': 'log_loss', 'max_depth':
	6, 'n_estimators': 200}


## Multilayer Perceptron


In [6]:
grid = dict(
    hidden_layer_sizes=[(100,), (50, 25), (50, 20, 5)],
    activation=["relu", "logistic"],
)
search = GridSearchCV(
    MLPClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
output = f"MLPClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

MLPClassifier: 0.40441382956282335 with {'activation': 'logistic', 'hidden_layer_sizes':
	(50, 25)}


## XGBClassifier


In [7]:
grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBClassifier(objective="multi:softmax"),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
    n_iter=30,
).fit(X, y - 1)
output = f"XGBClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBClassifier: 0.40849538130046387 with {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha':
	3, 'n_estimators': 150, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1,
	'gamma': 0.5, 'colsample_bytree': 0.75}


## XGBRegressor


In [8]:
grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBRegressor(objective="reg:linear"),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
    n_iter=50,
).fit(X, y - 1)
output = f"XGBRegressor: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBRegressor: 0.390913353764876 with {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha':
	0, 'n_estimators': 50, 'min_child_weight': 200, 'max_depth': 5, 'learning_rate': 0.2,
	'gamma': 0.75, 'colsample_bytree': 0.9}


## XGBRanker


In [9]:
races_per_year = np.cumsum([0] + df.groupby("raceYear")["raceRound"].max().to_list())
set_id = lambda y, r: r + (races_per_year[y - 2006])

X["qid"] = df.apply(lambda x: set_id(x["raceYear"], x["raceRound"]), axis=1)

grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 75, 150],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 15, 200],
    gamma=[0, 0.5, 0.75, 0.9],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
    reg_alpha=[0, 3, 10],
    reg_lambda=[0, 3, 10],
)
search = RandomizedSearchCV(
    XGBRanker(objective="rank:pairwise"),
    grid,
    scoring=balanced_accuracy_1interval_ranker,
    cv=tscv,
    n_jobs=-1,
    n_iter=30,
).fit(X, y)
output = f"XGBRanker: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

XGBRanker: 0.5374805117864973 with {'subsample': 0.75, 'reg_lambda': 3, 'reg_alpha': 3,
	'n_estimators': 75, 'min_child_weight': 15, 'max_depth': 5, 'learning_rate': 0.1,
	'gamma': 0.9, 'colsample_bytree': 0.9}


## Results


After viewing several runs, the hyperparameters for each algorithm are as follows

- KNeighborsClassifier: 0.3356910953253832 with {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
- DecisionTreeClassifier: 0.3792287733670603 with {'criterion': 'gini', 'max_depth': 4, 'splitter': 'best'}
- RandomForestClassifier: 0.40066126402168717 with {'criterion': 'log_loss', 'max_depth': 6, 'n_estimators': 200}
- MLPClassifier: 0.40441382956282335 with {'activation': 'logistic', 'hidden_layer_sizes': (50, 25)}
- XGBClassifier: 0.40849538130046387 with {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 3, 'n_estimators': 150, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.75}
- XGBRegressor: 0.390913353764876 with {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 200, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0.75, 'colsample_bytree': 0.9}
- XGBRanker: 0.5374805117864973 with {'subsample': 0.75, 'reg_lambda': 3, 'reg_alpha': 3, 'n_estimators': 75, 'min_child_weight': 15, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.9, 'colsample_bytree': 0.9}
