In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Root folder of the data set; every path below is built relative to it.
base_path = './data'
# Training split: per-player statistics, one file for the away side and one for the home side.
path_player_away_train = f'{base_path}/Train_Data/train_away_player_statistics_df.csv'
path_player_home_train = f'{base_path}/Train_Data/train_home_player_statistics_df.csv'
# Training split: per-team statistics, one file for the away side and one for the home side.
path_team_away_train = f'{base_path}/Train_Data/train_away_team_statistics_df.csv'
path_team_home_train = f'{base_path}/Train_Data/train_home_team_statistics_df.csv'
# Training targets, plus an additional target file shipped under benchmark_and_extras.
path_target = f'{base_path}/Y_train_1rknArQ.csv'
path_target_sup = f'{base_path}/benchmark_and_extras/Y_train_supp.csv'

In [3]:
def _load_csv(csv_path):
    """Read one comma-separated file into a DataFrame."""
    return pd.read_csv(csv_path, sep=',')


# Team-level statistics (home side first, then away side).
train_team_home, train_team_away = map(
    _load_csv, (path_team_home_train, path_team_away_train)
)

# Player-level statistics (away side first, then home side).
train_player_away, train_player_home = map(
    _load_csv, (path_player_away_train, path_player_home_train)
)

# Main targets and the additional target file.
train_target, train_target_sup = map(_load_csv, (path_target, path_target_sup))

In [4]:
# Tag every column with the side it describes so the two tables can sit side by
# side after the merge without name clashes (this also renames ID to HOME_ID / AWAY_ID).
train_team_home.columns = [f'HOME_{label}' for label in train_team_home.columns]
train_team_away.columns = [f'AWAY_{label}' for label in train_team_away.columns]

In [5]:
# One row per match: home-side statistics joined to away-side statistics on the match ID.
merged_sides = train_team_home.merge(
    train_team_away,
    left_on='HOME_ID',
    right_on='AWAY_ID',
    how='inner',
)

# Single match identifier, taken from the home key and falling back to the away key.
match_ids = merged_sides['HOME_ID'].combine_first(merged_sides['AWAY_ID'])

# Replace the two side-specific keys with one leading 'ID' column.
train_team = merged_sides.drop(columns=['HOME_ID', 'AWAY_ID'])
train_team.insert(0, 'ID', match_ids)

train_team.head()

Unnamed: 0,ID,HOME_LEAGUE,HOME_TEAM_NAME,HOME_TEAM_SHOTS_TOTAL_season_sum,HOME_TEAM_SHOTS_INSIDEBOX_season_sum,HOME_TEAM_SHOTS_OFF_TARGET_season_sum,HOME_TEAM_SHOTS_ON_TARGET_season_sum,HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum,HOME_TEAM_PASSES_season_sum,HOME_TEAM_SUCCESSFUL_PASSES_season_sum,...,AWAY_TEAM_YELLOWCARDS_5_last_match_std,AWAY_TEAM_REDCARDS_5_last_match_std,AWAY_TEAM_OFFSIDES_5_last_match_std,AWAY_TEAM_ATTACKS_5_last_match_std,AWAY_TEAM_PENALTIES_5_last_match_std,AWAY_TEAM_SUBSTITUTIONS_5_last_match_std,AWAY_TEAM_BALL_SAFE_5_last_match_std,AWAY_TEAM_DANGEROUS_ATTACKS_5_last_match_std,AWAY_TEAM_INJURIES_5_last_match_std,AWAY_TEAM_GOALS_5_last_match_std
0,0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,...,5.0,5.0,4.0,0.0,6.0,8.0,4.0,3.0,2.0,3.0
1,1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,...,0.0,0.0,3.0,1.0,8.0,4.0,10.0,0.0,5.0,3.0
2,2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,...,6.0,10.0,4.0,4.0,0.0,8.0,3.0,0.0,9.0,6.0
3,3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,...,0.0,0.0,1.0,2.0,0.0,5.0,6.0,3.0,,2.0
4,4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,...,1.0,0.0,4.0,4.0,9.0,4.0,1.0,4.0,6.0,5.0


In [6]:
# Pair each label row with its feature row by match ID rather than by row position:
# train_team comes from an inner merge, so it is not guaranteed to contain the same
# rows as the raw target file. `.loc` raises KeyError if a feature row has no label.
# The 'ID' guard keeps the cell safe to re-run once the column has been consumed.
if 'ID' in train_target.columns:
    train_target = (
        train_target
        .set_index('ID')
        .loc[train_team['ID']]
        .reset_index(drop=True)
    )

In [7]:
# Collapse the one-hot target columns into a single integer class per row:
# the position of the first column whose value equals 1.
# Done on the whole array at once instead of calling a Python lambda per row.
_target_is_one = train_target.to_numpy() == 1

# A row without any 1 has no class; fail loudly instead of silently mapping it to class 0.
if not _target_is_one.any(axis=1).all():
    raise ValueError('every target row must contain at least one column equal to 1')

encoded_target = pd.Series(_target_is_one.argmax(axis=1), index=train_target.index)

In [8]:
# Identifier and text columns are excluded from the model inputs.
non_feature_columns = ['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME']
team_features = train_team.drop(columns=non_feature_columns)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    team_features, encoded_target, test_size=0.2, random_state=42
)

In [9]:
# Three-class XGBoost classifier used as the base estimator of the search.
xgb = XGBClassifier(objective='multi:softmax', num_class=3)

# 3 x 2 x 3 = 18 candidate settings, each evaluated with 5-fold cross-validation.
param_grid = dict(
    max_depth=[3, 6, 12],
    eta=[0.01, 0.1],
    reg_lambda=[0, 0.5, 1],
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)

In [11]:
# Evaluate the fitted grid search on the held-out 20% split created by train_test_split.
grid_search.score(X_test, y_test)

0.49207639171068673

In [12]:
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'

test_team_home = pd.read_csv(path_team_home_test, sep=',')
test_team_away = pd.read_csv(path_team_away_test, sep=',')

# Same side prefixes as the training tables (ID becomes HOME_ID / AWAY_ID).
test_team_home.columns = 'HOME_' + test_team_home.columns
test_team_away.columns = 'AWAY_' + test_team_away.columns

# Join the two sides on the match ID, exactly as done for the training tables,
# instead of gluing them together by row position and removing the away ID by
# column position. `validate` rejects duplicated IDs that would multiply rows.
test_team = (
    pd.merge(
        test_team_home,
        test_team_away,
        left_on='HOME_ID',
        right_on='AWAY_ID',
        how='inner',
        validate='one_to_one',
    )
    .drop(columns='AWAY_ID')
    .rename(columns={'HOME_ID': 'ID'})
)

# Every test match must survive the join, otherwise the submission would miss rows.
assert len(test_team) == len(test_team_home) == len(test_team_away), (
    'home and away test tables do not describe the same set of match IDs'
)

In [13]:
# Feed the model exactly the columns it was trained on, in the same order.
# Selecting by X_train.columns (rather than dropping only 'ID') ignores any extra
# non-feature column in the test table and raises KeyError if a feature is missing.
predictions = grid_search.predict(test_team[X_train.columns])

In [14]:
# One-hot encode the predicted class indices: row k of the 3x3 identity matrix is
# the indicator vector for class k, so indexing it with the predictions yields one
# int32 row per prediction. Kept as a list of rows.
class_indicators = np.eye(3, dtype=np.int32)
p = list(class_indicators[predictions])

In [15]:
# Wrap the one-hot rows in a frame: predicted class 0 / 1 / 2 lands in the
# HOME_WINS / DRAW / AWAY_WINS column respectively.
# NOTE(review): this assumes the training target columns were in that same order
# when the class indices were derived — confirm against the target file.
pred_sub = pd.DataFrame(data=p, columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])

In [16]:
# Put the test match ID in front of the prediction columns. concat aligns the two
# objects on their row index (inner join), so both must share the same index.
# Re-running this cell adds another ID column because pred_sub is overwritten.
pred_sub = pd.concat([test_team['ID'], pred_sub],join='inner',axis=1)

In [None]:
# Write the submission file to the working directory, without the row index.
pred_sub.to_csv('./submission_xgboost.csv', index=False)