In [1]:
import math

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

random_seed = 42

In [2]:
data_ws = pd.read_csv("../data/featured/winter_spring.csv", sep=';')
data_s = pd.read_csv("../data/featured/spring.csv", sep=';')

Xdata_ws = data_ws.drop('teamA_win', axis=1)
ydata_ws = data_ws.teamA_win

Xdata_s = data_s.drop('teamA_win', axis=1)
ydata_s = data_s.teamA_win

In [3]:
Xtrain_ws, Xval_ws, ytrain_ws, yval_ws = train_test_split(Xdata_ws, ydata_ws, test_size=0.4, random_state=random_seed)
Xtrain_s, Xval_s, ytrain_s, yval_s = train_test_split(Xdata_s, ydata_s, test_size=0.4, random_state=random_seed)

test_data = pd.read_csv("../data/featured/summer.csv", sep=';')
Xtest = test_data.drop('teamA_win', axis=1)
ytest = test_data.teamA_win

In [4]:
clfRF = RandomForestClassifier(n_estimators = 20, max_depth = 8, random_state = 42)
clfRF.fit(Xtrain_ws, ytrain_ws)

print('Winter & Spring Accuracy score (train): {0:.6f}'.format(metrics.accuracy_score(ytrain_ws, clfRF.predict(Xtrain_ws))))
print('Winter & Spring Accuracy score (val): {0:.6f}'.format(metrics.accuracy_score(yval_ws, clfRF.predict(Xval_ws))))

clfRF = RandomForestClassifier(n_estimators = 20, max_depth = 8, random_state = 42)
clfRF.fit(Xtrain_s, ytrain_s)

print('\nSpring Accuracy score (train): {0:.6f}'.format(metrics.accuracy_score(ytrain_s, clfRF.predict(Xtrain_s))))
print('Spring Accuracy score (val): {0:.6f}'.format(metrics.accuracy_score(yval_s, clfRF.predict(Xval_s))))

Winter & Spring Accuracy score (train): 0.835549
Winter & Spring Accuracy score (val): 0.730325

Spring Accuracy score (train): 0.889730
Spring Accuracy score (val): 0.703404


In [5]:
param_grid = {
    'n_estimators': range(3,50),
    'max_depth': range(3, 30)
}

param_comb = ParameterGrid(param_grid)

val_metric = []
for params in param_comb:
    clf = RandomForestClassifier(**params, random_state = 42).fit(Xtrain_ws, ytrain_ws)
    val_metric.append(metrics.accuracy_score(yval_ws, clf.predict(Xval_ws)))
    
best_params = param_comb[np.argmax(val_metric)]
print(f"We found the best params {best_params} with validation acuraccy {max(val_metric):.4f}.")

clfRFb = RandomForestClassifier(**best_params, random_state = 42).fit(Xtrain_ws, ytrain_ws)
print('Winter & Spring accuracy score (test): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))

We found the best params {'n_estimators': 26, 'max_depth': 19} with validation acuraccy 0.7754.
Winter & Spring accuracy score (test): 0.659148


In [6]:
param_grid = {
    'n_estimators': range(3,50),
    'max_depth': range(3, 30)
}

param_comb = ParameterGrid(param_grid)

val_metric = []
for params in param_comb:
    clf = RandomForestClassifier(**params, random_state = 42).fit(Xtrain_s, ytrain_s)
    val_metric.append(metrics.accuracy_score(yval_s, clf.predict(Xval_s)))
    
best_params = param_comb[np.argmax(val_metric)]
print(f"We found the best params {best_params} with validation acuraccy {max(val_metric):.4f}.")

clfRFb = RandomForestClassifier(**best_params, random_state = 42).fit(Xtrain_s, ytrain_s)
print('Spring accuracy score (test): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))

We found the best params {'n_estimators': 34, 'max_depth': 18} with validation acuraccy 0.7536.
Spring accuracy score (test): 0.674185


In [5]:
matches = pd.read_csv("../data/cleaned/matches_winter.csv", sep=';')
teams = pd.read_csv("../data/cleaned/teams_winter_daily.csv", sep=';')

In [None]:
import pandas as pd

def get_closest_past_stats(team_name, match_date, stats_df):
    team_data = stats_df[(stats_df["Team"] == team_name) & (stats_df["date"] < match_date)]
    if team_data.empty:
        return pd.Series(dtype=float)
    return team_data.sort_values("date", ascending=False).iloc[0]

def prefix_columns(series, prefix, ignore_cols):
    renamed = {}
    for col in series.index:
        if col in ignore_cols:
            renamed[col] = col
        else:
            renamed[col] = f"{prefix}{col}"
    return series.rename(index=renamed)

def merge_match_with_team_stats(matches_df, teams_df):
    matches_df = matches_df.copy()
    teams_df = teams_df.copy()

    matches_df["date"] = pd.to_datetime(matches_df["date"])
    teams_df["date"] = pd.to_datetime(teams_df["date"])

    merged_rows = []

    ignore_cols = {"Team", "tournament", "date"}

    for _, row in matches_df.iterrows():
        teamA = row["teamA"]
        teamB = row["teamB"]
        match_date = row["date"]

        stats_A = get_closest_past_stats(teamA, match_date, teams_df)
        stats_B = get_closest_past_stats(teamB, match_date, teams_df)

        stats_A = prefix_columns(stats_A, "A_", ignore_cols)
        stats_B = prefix_columns(stats_B, "B_", ignore_cols)

        combined_row = pd.concat([row, stats_A, stats_B])
        merged_rows.append(combined_row)

    return pd.DataFrame(merged_rows)


In [30]:
def matchesAndTeamsMerge(matches, teams):
    data = matches.merge(teams, how='left', left_on='teamA', right_on='Name')
    data = data.rename(columns=lambda x: x if x in ['teamA','teamB','teamA_win','match_id','game_in_series','date','split'] else f"{x}_A")
    data = data.drop(columns=['Name_A'], errors='ignore')

    data = data.merge(teams, how='left', left_on='teamB', right_on='Name')
    data = data.rename(columns=lambda x: x if x in ['teamA','teamB','teamA_win','match_id','game_in_series','date','split'] or x.endswith('_A') else f"{x}_B")
    data = data.drop(columns=['Name_B'], errors='ignore')
    return data

In [31]:
df = merge_match_with_team_stats(matches, teams)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [23]:
df.shape

(469, 71)

In [24]:
teams.shape

(957, 32)

In [25]:
matches.shape

(469, 7)

In [26]:
df.columns

Index(['tournament', 'match_id', 'game_in_series', 'teamA', 'teamB',
       'teamA_win', 'date', 'A_tournament', 'A_date', 'A_Team', 'A_GP', 'A_W',
       'A_L', 'A_AGT', 'A_K', 'A_D', 'A_KD', 'A_CKPM', 'A_GPR', 'A_GSPD',
       'A_EGR', 'A_MLR', 'A_GD15', 'A_FB%', 'A_FT%', 'A_F3T%', 'A_PPG',
       'A_HLD%', 'A_GRB%', 'A_FD%', 'A_DRG%', 'A_ELD%', 'A_FBN%', 'A_BN%',
       'A_LNE%', 'A_JNG%', 'A_WPM', 'A_CWPM', 'A_WCPM', 'B_tournament',
       'B_date', 'B_Team', 'B_GP', 'B_W', 'B_L', 'B_AGT', 'B_K', 'B_D', 'B_KD',
       'B_CKPM', 'B_GPR', 'B_GSPD', 'B_EGR', 'B_MLR', 'B_GD15', 'B_FB%',
       'B_FT%', 'B_F3T%', 'B_PPG', 'B_HLD%', 'B_GRB%', 'B_FD%', 'B_DRG%',
       'B_ELD%', 'B_FBN%', 'B_BN%', 'B_LNE%', 'B_JNG%', 'B_WPM', 'B_CWPM',
       'B_WCPM'],
      dtype='object')