In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.impute import SimpleImputer

In [3]:
base_path = './data'
path_target = f'{base_path}/Y_train_1rknArQ.csv'

In [4]:
X_team = pd.read_csv('./train_team.csv')
target = pd.read_csv(path_target, sep=',')

In [5]:
encoded_target = target.drop(columns='ID').apply(lambda x: np.where(x.to_numpy() == 1)[0][0], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_team.drop(columns=['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME']), 
    encoded_target, 
    test_size=0.2, 
    random_state=42
    )

In [7]:
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train = imputer.transform(X_train)

In [8]:
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eta=0.1,
    n_estimators=100
)

param_grid = {
    'max_depth': [3, 6, 12],
    'reg_lambda': [0, 0.1, 0.5, 1, 2, 5],
    'reg_alpha': [0, 0.1, 0.5, 1, 2, 5]
}

grid_search = GridSearchCV(xgb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=10).fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 2/5; 1/108] START max_depth=3, reg_alpha=0, reg_lambda=0....................
[CV 1/5; 1/108] START max_depth=3, reg_alpha=0, reg_lambda=0....................
[CV 4/5; 1/108] START max_depth=3, reg_alpha=0, reg_lambda=0....................
[CV 5/5; 1/108] START max_depth=3, reg_alpha=0, reg_lambda=0....................
[CV 1/5; 2/108] START max_depth=3, reg_alpha=0, reg_lambda=0.1..................
[CV 3/5; 1/108] START max_depth=3, reg_alpha=0, reg_lambda=0....................
[CV 3/5; 2/108] START max_depth=3, reg_alpha=0, reg_lambda=0.1..................
[CV 2/5; 2/108] START max_depth=3, reg_alpha=0, reg_lambda=0.1..................
[CV 5/5; 1/108] END max_depth=3, reg_alpha=0, reg_lambda=0;, score=0.483 total time=   3.5s
[CV 4/5; 1/108] END max_depth=3, reg_alpha=0, reg_lambda=0;, score=0.471 total time=   3.6s
[CV 4/5; 2/108] START max_depth=3, reg_alpha=0, reg_lambda=0.1..................
[CV 3/5; 2/108] END max_



[CV 2/5; 29/108] END max_depth=3, reg_alpha=2, reg_lambda=2;, score=0.490 total time=   3.7s
[CV 3/5; 29/108] END max_depth=3, reg_alpha=2, reg_lambda=2;, score=0.490 total time=   3.4s
[CV 2/5; 30/108] START max_depth=3, reg_alpha=2, reg_lambda=5...................
[CV 3/5; 30/108] START max_depth=3, reg_alpha=2, reg_lambda=5...................
[CV 4/5; 29/108] END max_depth=3, reg_alpha=2, reg_lambda=2;, score=0.474 total time=   3.5s
[CV 2/5; 31/108] START max_depth=3, reg_alpha=5, reg_lambda=0...................
[CV 1/5; 30/108] END max_depth=3, reg_alpha=2, reg_lambda=5;, score=0.491 total time=   3.2s
[CV 3/5; 31/108] START max_depth=3, reg_alpha=5, reg_lambda=0...................
[CV 1/5; 31/108] START max_depth=3, reg_alpha=5, reg_lambda=0...................
[CV 5/5; 30/108] START max_depth=3, reg_alpha=2, reg_lambda=5...................
[CV 4/5; 30/108] START max_depth=3, reg_alpha=2, reg_lambda=5...................
[CV 5/5; 29/108] END max_depth=3, reg_alpha=2, reg_lambda=2;,

  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
print(grid_search.score(X_test, y_test))
print(grid_search.best_params_)

0.49573344169036976
{'max_depth': 3, 'reg_alpha': 5, 'reg_lambda': 2}


In [10]:
xgb_ajusted = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=100,
    max_depth=3,
    reg_alpha=5,
    reg_lambda=2
)

param_grid = {
    'eta': np.arange(0.01, 1, 0.05)
}

grid_search = GridSearchCV(xgb_ajusted, param_grid=param_grid, cv=5, n_jobs=-1, verbose=10).fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5; 1/20] START eta=0.01...................................................
[CV 2/5; 1/20] START eta=0.01...................................................
[CV 3/5; 1/20] START eta=0.01...................................................
[CV 4/5; 1/20] START eta=0.01...................................................
[CV 5/5; 1/20] START eta=0.01...................................................
[CV 1/5; 2/20] START eta=0.060000000000000005...................................
[CV 2/5; 2/20] START eta=0.060000000000000005...................................
[CV 3/5; 2/20] START eta=0.060000000000000005...................................
[CV 5/5; 1/20] END ....................eta=0.01;, score=0.484 total time=   3.7s
[CV 4/5; 2/20] START eta=0.060000000000000005...................................
[CV 2/5; 2/20] END ....eta=0.060000000000000005;, score=0.495 total time=   3.7s
[CV 5/5; 2/20] START eta=0.060000000000000005..

In [11]:
print(grid_search.score(X_test, y_test))
print(grid_search.best_params_)

0.49695245835026414
{'eta': np.float64(0.060000000000000005)}


In [None]:
final_xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=100,
    max_depth=3,
    reg_alpha=2,
    reg_lambda=0.5,
    eta=0.11
).fit(X_team.drop(columns=['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME']), encoded_target)

In [25]:
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'

test_team_home = pd.read_csv(path_team_home_test, sep=',')
test_team_away = pd.read_csv(path_team_away_test, sep=',')

test_team_home.columns = 'HOME_' + test_team_home.columns
test_team_away.columns = 'AWAY_' + test_team_away.columns

test_team =  pd.concat([test_team_home, test_team_away.iloc[:, 1:]],join='inner',axis=1)
test_team = test_team.rename(columns={'HOME_ID':'ID'})

In [27]:
predictions = final_xgb.predict(test_team.drop(columns=['ID']))

In [29]:
p = []
for pred in predictions:
    _p = np.zeros(3, dtype=np.int32)
    _p[pred] = 1
    p.append(_p)

In [30]:
pred_sub = pd.DataFrame(data=p, columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])

In [31]:
pred_sub = pd.concat([test_team['ID'], pred_sub],join='inner',axis=1)

In [32]:
pred_sub.to_csv('./submission_xgboost.csv', index=False)