In [4]:
import pandas as pd
import numpy as np

from metrics import print_metrics

from sklearn import tree
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import ParameterGrid

# Fixed seed; passed as `random_state` to AdaBoostClassifier in the grid search.
random_seed = 42

In [2]:
# Load the train / validation CSVs; the training 'date' column is parsed to
# datetimes (the validation one is left exactly as read).
train = pd.read_csv("../data/featured/train.csv", sep=',').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
val = pd.read_csv("../data/featured/val.csv", sep=',')

# Split each frame into its feature columns and the 'teamA_win' target.
Xtrain, ytrain = train.drop(columns=['teamA_win']), train['teamA_win']
Xval, yval = val.drop(columns=['teamA_win']), val['teamA_win']

# Show the (rows, columns) of both feature frames.
display(Xtrain.shape, Xval.shape)

(4830, 55)

(209, 55)

In [3]:
# Recency weights: linearly rescale each training row's date so the earliest
# date maps to 0 and the latest date maps to 1.
# Dates are read from `train` rather than `Xtrain` so that this cell can be
# re-run after 'date' has been removed from the feature frames (the original
# raised KeyError: 'date' on a second run).
min_date = train['date'].min()
max_date = train['date'].max()
date_span = max_date - min_date

if date_span == pd.Timedelta(0):
    # Every row shares the same date: the rescaling would be 0/0 = NaN, so
    # fall back to uniform weights instead.
    sample_weight = pd.Series(1.0, index=train.index)
else:
    sample_weight = (train['date'] - min_date) / date_span

# Rebuild the feature frames from the loaded data (target and date removed)
# instead of dropping from Xtrain/Xval in place; the result is the same on a
# first run and the cell stays idempotent. A missing column still raises.
Xtrain = train.drop(columns=['teamA_win', 'date'])
Xval = val.drop(columns=['teamA_win', 'date'])

In [None]:
# Candidate hyperparameters: number of boosting rounds, learning rate, and the
# depth of the decision tree used as the base estimator.
param_grid = {
    'n_estimators': range(10, 101, 20),
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}

# Bound the sample weights to [1e-6, 1.0] before fitting.
sample_weight = np.clip(sample_weight, 1e-6, 1.0)

# Track the best validation accuracy seen so far and the params that gave it.
best_accuracy, best_params = 0, None

for params in ParameterGrid(param_grid):
    base_tree = DecisionTreeClassifier(max_depth=params['max_depth'])
    clf = AdaBoostClassifier(
        estimator=base_tree,
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        random_state=random_seed,
    )
    clf.fit(Xtrain, ytrain, sample_weight=sample_weight)

    # Score this candidate on the validation split.
    val_accuracy = accuracy_score(yval, clf.predict(Xval))

    # Only a strict improvement replaces the incumbent, so ties keep the
    # earlier candidate.
    if val_accuracy <= best_accuracy:
        continue
    best_accuracy, best_params = val_accuracy, params

print("Best Parameters:", best_params)
print("Best Validation Accuracy:", best_accuracy)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 70}
Best Validation Accuracy: 0.6985645933014354
