In [75]:
%matplotlib inline

import pandas as pd
import numpy as np
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss

sns.set_style('whitegrid')
sns.set_context('poster')

pd.set_option('display.max_columns', None)

np.random.seed(2131)

import warnings
warnings.filterwarnings('ignore')

** Load data files **

In [2]:
# load files ( DATA )

teams               = pd.read_csv('../data/Teams.csv')
seasons             = pd.read_csv('../data/Seasons.csv')
reg_season_compact  = pd.read_csv('../data/RegularSeasonCompactResults.csv')
reg_season_detailed = pd.read_csv('../data/RegularSeasonDetailedResults.csv')
tourney_compact     = pd.read_csv('../data/TourneyCompactResults.csv')
tourney_detailed    = pd.read_csv('../data/TourneyDetailedResults.csv')
tourney_seeds       = pd.read_csv('../data/TourneySeeds.csv')
tourney_slots       = pd.read_csv('../data/TourneySlots.csv')

In [54]:
original = pd.DataFrame({'Season':   reg_season_compact.Season, 
                     'Daynum':   reg_season_compact.Daynum, 
                     'Team1':    reg_season_compact.Wteam, 
                     'Team2':    reg_season_compact.Lteam,
                     'win_diff': reg_season_compact.Wscore - reg_season_compact.Lscore,
                     'target':   [1] * len(reg_season_compact)
                    })

In [57]:
created = pd.DataFrame({'Season':    reg_season_compact.Season,
                        'Daynum':    reg_season_compact.Daynum,
                        'Team1':     reg_season_compact.Lteam,
                        'Team2':     reg_season_compact.Wteam,
                        'win_diff':  reg_season_compact.Lscore - reg_season_compact.Wscore,
                        'target':    [0] * len(reg_season_compact)
                       })

In [61]:
data = pd.concat((original, created))
data = data.sort_values(by=['Season', 'Daynum'])

** Split into training and test set **

In [62]:
mask  = data.Season < 2013
train = data.loc[mask]
test  = data.loc[~mask]

In [125]:
features = ['Team1', 'Team2', 'Season']

X = train.loc[:, features]
y = train.target

Xtest = test.loc[:, features]

In [126]:
train_test_split_mask = X.Season < 2010

Xtr = X.loc[train_test_split_mask, features[:-1]]
ytr = y.loc[train_test_split_mask, ]

Xtv = X.loc[~train_test_split_mask, features[:-1]]
ytv = y.loc[~train_test_split_mask, ]

** XGboost **

In [127]:
n_round = 1000
dtrain  = xgb.DMatrix(Xtr, label=ytr)
dvalid  = xgb.DMatrix(Xtv, label=ytv)

params = {
    'eta': 0.2,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1.0,
    'gamma': 1,
    'col_sample': 1.0,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'nthread' : 6,
    'silent' : 0
}

evallist  = [(dvalid,'eval'), (dtrain,'train')]
model  = xgb.train(params, dtrain, evals=evallist, num_boost_round=n_round)

[0]	eval-logloss:0.692582	train-logloss:0.692422
[1]	eval-logloss:0.691935	train-logloss:0.692162
[2]	eval-logloss:0.691371	train-logloss:0.691541
[3]	eval-logloss:0.690769	train-logloss:0.690921
[4]	eval-logloss:0.690419	train-logloss:0.690534
[5]	eval-logloss:0.690093	train-logloss:0.690194
[6]	eval-logloss:0.689594	train-logloss:0.689744
[7]	eval-logloss:0.689076	train-logloss:0.689374
[8]	eval-logloss:0.688910	train-logloss:0.689180
[9]	eval-logloss:0.688673	train-logloss:0.688871
[10]	eval-logloss:0.688492	train-logloss:0.688655
[11]	eval-logloss:0.688320	train-logloss:0.688448
[12]	eval-logloss:0.688144	train-logloss:0.688324
[13]	eval-logloss:0.688002	train-logloss:0.688138
[14]	eval-logloss:0.687639	train-logloss:0.687757
[15]	eval-logloss:0.687274	train-logloss:0.687469
[16]	eval-logloss:0.687050	train-logloss:0.687076
[17]	eval-logloss:0.686825	train-logloss:0.686725
[18]	eval-logloss:0.686777	train-logloss:0.686592
[19]	eval-logloss:0.686794	train-logloss:0.686498
[20]	eval-

KeyboardInterrupt: 

In [93]:
model = xgb.train(params, dtrain, num_boost_round=100)
yhat  = model.predict(dvalid)

In [95]:
print('Log loss on validation set: {}'.format(log_loss(ytv, yhat)))

Log loss on validation set: 9.059946933120955e-06


** Training on full dataset **

In [None]:
dfull = xgb.DMatrix(X.loc[:, features[:-1]], y)

model = xgb.train(params, dfull, num_boost_round=100)

** Submission **

In [89]:
sub = pd.read_csv('../data/sample_submission.csv')

In [116]:
first  = sub.id.map(lambda x: x.split('_')[1])
second = sub.id.map(lambda x: x.split('_')[2])

In [137]:
first.head(1)

0    1103
Name: id, dtype: object

In [138]:
second.head(1)

0    1107
Name: id, dtype: object

In [141]:
Xtest.loc[(Xtest.Team1 == 1103) & (Xtest.Team2 == 1107)]

Unnamed: 0,Team1,Team2,Season


In [132]:
Xtest[(Xtest.Team1 == 1103) & (Xtest.Season == 2013)]

Unnamed: 0,Team1,Team2,Season
123897,1103,1157,2013
124185,1103,1329,2013
124204,1103,1421,2013
124326,1103,1336,2013
124988,1103,1292,2013
125239,1103,1166,2013
125323,1103,1178,2013
125437,1103,1115,2013
125696,1103,1156,2013
125720,1103,1411,2013


In [120]:
sub.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.5
1,2013_1103_1112,0.5
2,2013_1103_1125,0.5
3,2013_1103_1129,0.5
4,2013_1103_1137,0.5


In [119]:
first.head()

0    1103
1    1103
2    1103
3    1103
4    1103
Name: id, dtype: object

In [118]:
second.head()

0    1107
1    1112
2    1125
3    1129
4    1137
Name: id, dtype: object