# Final position in variable interval


## Dependencies


The dependencies used are as follows:


In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

import sys

# Add the parent directory to the module search path so that the `utils`
# package imported just below can be resolved. The entry is relative, so it
# depends on the kernel's current working directory.
sys.path.append("..")

from utils.custom_cvs import VariableTimeSeriesSplit
from utils.custom_scorers import (
    balanced_accuracy_1interval_score,
    mean_absolute_1interval_error,
)

import textwrap
import numpy as np
import pandas as pd

# Lift pandas' display truncation: show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

# Silence every warning category from here on. Note that this also hides
# any warnings emitted by the estimators fitted in the cells below.
warnings.simplefilter("ignore")

## Initialization


First, we tune the model that predicts each driver's final position within a ±1 interval.


In [2]:
# Load the processed dataset; `raceYear` drives the fold sizes computed below.
df = pd.read_csv("../assets/data/processed/base_model.csv")

# Number of consecutive half-seasons summed into each fold's training size
# (two halves per season, so 10 halves == 5 seasons). The same number of
# leading half-seasons is therefore never used as a test fold.
N_TRAIN_HALVES = 10

# Row count per season, unsorted by count.
# NOTE(review): the fold sizes below are only meaningful if the CSV rows are
# ordered chronologically by raceYear -- confirm against the data pipeline.
instances_per_year = df["raceYear"].value_counts(sort=False)

# Split every season's row count into (floor(n / 2), ceil(n / 2)) and flatten
# the pairs into one sequence: [y0_first, y0_second, y1_first, y1_second, ...].
instances_per_half = (
    np.array(
        list(zip(np.floor(instances_per_year / 2), np.ceil(instances_per_year / 2)))
    )
    .flatten()
    .astype(np.int32)
)

# One fold per half-season after the initial N_TRAIN_HALVES. Fold i is given
# the row count of half-seasons [i, i + N_TRAIN_HALVES) as its maximum
# training size and the row count of half-season i + N_TRAIN_HALVES as its
# test size.
n_splits = len(instances_per_half) - N_TRAIN_HALVES
max_train_size = [
    instances_per_half[i : N_TRAIN_HALVES + i].sum() for i in range(n_splits)
]
test_size = instances_per_half[N_TRAIN_HALVES:].tolist()
tscv = VariableTimeSeriesSplit(
    n_splits=n_splits, max_train_size=max_train_size, test_size=test_size
)

# Feature matrix comes from its own file; the target is the final position.
# NOTE(review): assumes base_model_X.csv is row-aligned with base_model.csv.
X = pd.read_csv("../assets/data/processed/base_model_X.csv")
y = df["positionFinal"]

## K-Nearest Neighbors


In [3]:
# Hyperparameter grid for k-nearest neighbours.
# `weights=None` is deliberately not listed: scikit-learn treats it as uniform
# weighting, so it only duplicated every "uniform" candidate (135 candidates
# instead of 90) without exploring anything new.
grid = dict(
    n_neighbors=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 21, 31, 51, 101],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan", "cosine"],
)
# Exhaustive search over the grid, scored with
# `balanced_accuracy_1interval_score`, cross-validated on the folds produced
# by `tscv`, and run in parallel on all available cores (n_jobs=-1).
search = GridSearchCV(
    KNeighborsClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
# Report the best mean CV score and its parameters, wrapped at 88 columns.
output = f"KNeighborsClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

KNeighborsClassifier: 0.28718245979609613 with {'metric': 'manhattan', 'n_neighbors':
	101, 'weights': 'distance'}


## Decision Tree


In [4]:
# Hyperparameter grid for the decision tree.
# "log_loss" is deliberately not listed: in scikit-learn it is the same
# criterion as "entropy" (both are the Shannon information gain), so keeping
# both just re-fitted identical models under a second name.
grid = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
# Exhaustive search over the grid, scored with
# `balanced_accuracy_1interval_score`, cross-validated on the folds produced
# by `tscv`, and run in parallel on all available cores (n_jobs=-1).
# The estimator is created without a `random_state`, so scores (and possibly
# the selected parameters) can differ from one run to the next.
search = GridSearchCV(
    DecisionTreeClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
# Report the best mean CV score and its parameters, wrapped at 88 columns.
output = f"DecisionTreeClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

DecisionTreeClassifier: 0.3389971896790078 with {'criterion': 'entropy', 'max_depth': 5,
	'splitter': 'random'}


## Random Forest


In [5]:
# Hyperparameter grid for the random forest.
# "log_loss" is deliberately not listed: in scikit-learn it is the same
# criterion as "entropy" (both are the Shannon information gain), so keeping
# both re-fitted identical forests under a second name and made the most
# expensive search in this notebook 50% larger for no new information.
grid = dict(
    n_estimators=[10, 30, 50, 100, 200],
    criterion=["gini", "entropy"],
    max_depth=[2, 3, 4, 5, 6, 10, 20],
)
# Exhaustive search over the grid, scored with
# `balanced_accuracy_1interval_score`, cross-validated on the folds produced
# by `tscv`, and run in parallel on all available cores (n_jobs=-1).
# The estimator is created without a `random_state`, so scores (and possibly
# the selected parameters) can differ from one run to the next.
search = GridSearchCV(
    RandomForestClassifier(),
    grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
).fit(X, y)
# Report the best mean CV score and its parameters, wrapped at 88 columns.
output = f"RandomForestClassifier: {search.best_score_} with {search.best_params_}"
print("\n".join(textwrap.wrap(output, 88, subsequent_indent="\t")))

RandomForestClassifier: 0.34932197852652397 with {'criterion': 'gini', 'max_depth': 5,
	'n_estimators': 50}


## Multilayer Perceptron


In [6]:
# Candidate network shapes and activation functions for the perceptron.
grid = {
    "hidden_layer_sizes": [(100,), (50, 25), (50, 20, 5)],
    "activation": ["relu", "logistic"],
}

# Exhaustive search scored with `balanced_accuracy_1interval_score` on the
# folds produced by `tscv`, using every available core. MLPClassifier is
# created with default settings and no `random_state`, so results can vary
# between runs.
search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=grid,
    scoring=make_scorer(balanced_accuracy_1interval_score),
    cv=tscv,
    n_jobs=-1,
)
search.fit(X, y)

# Print the winning score and parameters, wrapped at 88 columns with
# continuation lines indented by a tab.
output = f"MLPClassifier: {search.best_score_} with {search.best_params_}"
print(textwrap.fill(output, width=88, subsequent_indent="\t"))

MLPClassifier: 0.35292989628216903 with {'activation': 'logistic', 'hidden_layer_sizes':
	(50, 25)}


## Results


After reviewing several runs, the selected hyperparameters for each algorithm, together with the best cross-validated score obtained, are as follows:

- KNeighborsClassifier: 0.2863537094218912 with {'metric': 'manhattan', 'n_neighbors': 101, 'weights': 'uniform'}
- DecisionTreeClassifier: 0.34112213333804237 with {'criterion': 'gini', 'max_depth': 4, 'splitter': 'best'}
- RandomForestClassifier: 0.3479073809187445 with {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 200}
- MLPClassifier: 0.3586966547762002 with {'activation': 'logistic', 'hidden_layer_sizes': (50, 25)}
