In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import lightgbm
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV

In [2]:
dir = ""

In [3]:
wl_odds_player_df = pd.read_csv(dir + "wl_odds_player_df.csv")

wl_odds_player_df = wl_odds_player_df.drop('Unnamed: 0', axis=1)

In [5]:
wl_odds_player_df.columns

Index(['league_id', 'season', 'date', 'stage', 'match_id', 'team_id',
       'm_rating', 'win', 'lose', 'rating_diff', 'home', 'cumulative_win',
       'cumulative_lose', 'match_num', 'match_bin', 'draw',
       'avg_overall_rating', 'avg_volleys_imp', 'avg_long_passing',
       'avg_finishing', 'avg_ball_control', 'avg_dribbling',
       'avg_short_passing', 'avg_reactions', 'avg_vision_imp', 'avg_penalties',
       'avg_agility_imp', 'result', 'cumulative_result'],
      dtype='object')

In [6]:
seasons = sorted(wl_odds_player_df["season"].unique())

# Training period 1 season

In [7]:
ll_01 = list()

for i in range(1, len(seasons)-1):
    
    train_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin(seasons[i:i+2][:-1])].dropna()
    test_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin([seasons[i:i+2][-1]])].dropna()

    train_dates = sorted(train_df["date"].unique())

    calib_dates = random.sample(train_dates, int(len(train_dates)*0.25))

    calib_df = train_df.loc[train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[~train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[train_df["home"]==1]
    calib_df = calib_df.loc[calib_df["home"]==1]

    test_df = test_df.loc[test_df["home"]==1]

    feat_col = [c for c in train_df.columns if "avg" in c]
    feat_col = feat_col + ["rating_diff"]

    skf = StratifiedKFold(n_splits=6, shuffle=True)

    lgb_param_grid = {'lgb__n_estimators': [100, 200, 300,],
                     'lgb__num_leaves':  [2, 4, 6, 10, 15, 20],
                     'lgb__subsample': [0.4, 0.6, 0.8, 1],
                     'lgb__colsample_bytree': [0.4, 0.6, 0.8, 1],
                     'lgb__is_unbalance': [False]}

    lgb_pipe = Pipeline([("lgb", lightgbm.LGBMClassifier())])

    lgb_cv = RandomizedSearchCV(lgb_pipe, 
                                param_distributions=lgb_param_grid, 
                                n_jobs=-1,
                                cv=skf, 
                                     n_iter=50,
                                     refit=True,
                                    )

    lgb_cv.fit(train_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                              "cumulative_lose", "cumulative_win",
                              "season", "date", "stage", "match_id", 'match_num'], axis=1),
                    train_df["result"])


    lgb_calib = CalibratedClassifierCV(lgb_cv, method='sigmoid', cv=5)

    lgb_calib.fit(calib_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                 "cumulative_lose", "cumulative_win",
                                 "season", "date", "stage", "match_id", 'match_num'], axis=1),
                  calib_df["result"])

    lgb_calib_pred = lgb_calib.predict_proba(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                                           "cumulative_lose", "cumulative_win",
                                                           "season", "date", "stage", "match_id", 'match_num'], axis=1))

    ll_01.append(log_loss(test_df["result"], lgb_calib_pred))







# Training period 2 season

In [8]:
ll_02 = list()

for i in range(1, len(seasons)-2):
    
    train_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin(seasons[i:i+3][:-1])].dropna()
    test_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin([seasons[i:i+3][-1]])].dropna()

    train_dates = sorted(train_df["date"].unique())

    calib_dates = random.sample(train_dates, int(len(train_dates)*0.25))

    calib_df = train_df.loc[train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[~train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[train_df["home"]==1]
    calib_df = calib_df.loc[calib_df["home"]==1]

    test_df = test_df.loc[test_df["home"]==1]

    feat_col = [c for c in train_df.columns if "avg" in c]
    feat_col = feat_col + ["rating_diff"]


    skf = StratifiedKFold(n_splits=6, shuffle=True)

    lgb_param_grid = {'lgb__n_estimators': [100, 200, 300,],
                     'lgb__num_leaves':  [2, 4, 6, 10, 15, 20],
                     'lgb__subsample': [0.4, 0.6, 0.8, 1],
                     'lgb__colsample_bytree': [0.4, 0.6, 0.8, 1],
                     'lgb__is_unbalance': [False]}

    lgb_pipe = Pipeline([("lgb", lightgbm.LGBMClassifier())])

    lgb_cv = RandomizedSearchCV(lgb_pipe, 
                                param_distributions=lgb_param_grid, 
                                n_jobs=-1,
                                cv=skf, 
                                     n_iter=50,
                                     refit=True,
                                    )

    lgb_cv.fit(train_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                              "cumulative_lose", "cumulative_win",
                              "season", "date", "stage", "match_id", 'match_num'], axis=1),
                    train_df["result"])


    lgb_calib = CalibratedClassifierCV(lgb_cv, method='sigmoid', cv=5)

    lgb_calib.fit(calib_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                 "cumulative_lose", "cumulative_win",
                                 "season", "date", "stage", "match_id", 'match_num'], axis=1),
                  calib_df["result"])

    lgb_calib_pred = lgb_calib.predict_proba(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                                           "cumulative_lose", "cumulative_win",
                                                           "season", "date", "stage", "match_id", 'match_num'], axis=1))

    ll_02.append(log_loss(test_df["result"], lgb_calib_pred))


# Training period 3 seasons

In [9]:
ll_03 = list()

for i in range(1, len(seasons)-3):
    
    train_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin(seasons[i:i+4][:-1])].dropna()
    test_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin([seasons[i:i+4][-1]])].dropna()

    train_dates = sorted(train_df["date"].unique())

    calib_dates = random.sample(train_dates, int(len(train_dates)*0.25))

    calib_df = train_df.loc[train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[~train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[train_df["home"]==1]
    calib_df = calib_df.loc[calib_df["home"]==1]

    test_df = test_df.loc[test_df["home"]==1]

    feat_col = [c for c in train_df.columns if "avg" in c]
    feat_col = feat_col + ["rating_diff"]


    skf = StratifiedKFold(n_splits=6, shuffle=True)

    lgb_param_grid = {'lgb__n_estimators': [100, 200, 300,],
                     'lgb__num_leaves':  [2, 4, 6, 10, 15, 20],
                     'lgb__subsample': [0.4, 0.6, 0.8, 1],
                     'lgb__colsample_bytree': [0.4, 0.6, 0.8, 1],
                     'lgb__is_unbalance': [False]}

    lgb_pipe = Pipeline([("lgb", lightgbm.LGBMClassifier())])

    lgb_cv = RandomizedSearchCV(lgb_pipe, 
                                param_distributions=lgb_param_grid, 
                                n_jobs=-1,
                                cv=skf, 
                                     n_iter=50,
                                     refit=True,
                                    )

    lgb_cv.fit(train_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                              "cumulative_lose", "cumulative_win",
                              "season", "date", "stage", "match_id", 'match_num'], axis=1),
                    train_df["result"])


    lgb_calib = CalibratedClassifierCV(lgb_cv, method='sigmoid', cv=5)

    lgb_calib.fit(calib_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                 "cumulative_lose", "cumulative_win",
                                 "season", "date", "stage", "match_id", 'match_num'], axis=1),
                  calib_df["result"])

    lgb_calib_pred = lgb_calib.predict_proba(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                                           "cumulative_lose", "cumulative_win",
                                                           "season", "date", "stage", "match_id", 'match_num'], axis=1))

    ll_03.append(log_loss(test_df["result"], lgb_calib_pred))



In [10]:
np.mean(ll_01), np.mean(ll_02), np.mean(ll_03)

(1.0058695291211024, 0.9984708416646916, 0.9954775308959847)

# Training period 3 seasons with current season

In [12]:
ll_04 = list()

for i in range(1, len(seasons)-3):
    
    train_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin(seasons[i:i+4][:-1])].dropna()
    test_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin([seasons[i:i+4][-1]])].dropna()
    
    test_dates = sorted(test_df["date"].unique())
    n = len(test_dates) // 2

    train_df = pd.concat([train_df, 
                          test_df.loc[test_df["date"].isin(test_dates[:n])]])

    test_df = test_df.loc[test_df["date"].isin(test_dates[n:])]

    train_dates = sorted(train_df["date"].unique())

    calib_dates = random.sample(train_dates, int(len(train_dates)*0.25))

    calib_df = train_df.loc[train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[~train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[train_df["home"]==1]
    calib_df = calib_df.loc[calib_df["home"]==1]

    test_df = test_df.loc[test_df["home"]==1]

    feat_col = [c for c in train_df.columns if "avg" in c]
    feat_col = feat_col + ["rating_diff"]

    skf = StratifiedKFold(n_splits=6, shuffle=True)

    lgb_param_grid = {'lgb__n_estimators': [100, 200, 300,],
                     'lgb__num_leaves':  [2, 4, 6, 10, 15, 20],
                     'lgb__subsample': [0.4, 0.6, 0.8, 1],
                     'lgb__colsample_bytree': [0.4, 0.6, 0.8, 1],
                     'lgb__is_unbalance': [False]}

    lgb_pipe = Pipeline([("lgb", lightgbm.LGBMClassifier())])

    lgb_cv = RandomizedSearchCV(lgb_pipe, 
                                param_distributions=lgb_param_grid, 
                                n_jobs=-1,
                                cv=skf, 
                                     n_iter=50,
                                     refit=True,
                                    )

    lgb_cv.fit(train_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                              "cumulative_lose", "cumulative_win",
                              "season", "date", "stage", "match_id", 'match_num'], axis=1),
                    train_df["result"])


    lgb_calib = CalibratedClassifierCV(lgb_cv, method='sigmoid', cv=5)

    lgb_calib.fit(calib_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                 "cumulative_lose", "cumulative_win",
                                 "season", "date", "stage", "match_id", 'match_num'], axis=1),
                  calib_df["result"])

    lgb_calib_pred = lgb_calib.predict_proba(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                                           "cumulative_lose", "cumulative_win",
                                                           "season", "date", "stage", "match_id", 'match_num'], axis=1))

    ll_04.append(log_loss(test_df["result"], lgb_calib_pred))


In [13]:
np.mean(ll_04)

1.0301665441044627

In [14]:
ll_05 = list()

for i in range(1, len(seasons)-3):
    
    train_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin(seasons[i:i+4][:-1])].dropna()
    test_df = wl_odds_player_df.loc[wl_odds_player_df["season"].isin([seasons[i:i+4][-1]])].dropna()
    
    test_dates = sorted(test_df["date"].unique())
    n = len(test_dates) // 2

    #train_df = pd.concat([train_df, 
    #                      test_df.loc[test_df["date"].isin(test_dates[:n])]])

    test_df = test_df.loc[test_df["date"].isin(test_dates[n:])]

    train_dates = sorted(train_df["date"].unique())

    calib_dates = random.sample(train_dates, int(len(train_dates)*0.25))

    calib_df = train_df.loc[train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[~train_df["date"].isin(calib_dates)]

    train_df = train_df.loc[train_df["home"]==1]
    calib_df = calib_df.loc[calib_df["home"]==1]

    test_df = test_df.loc[test_df["home"]==1]

    feat_col = [c for c in train_df.columns if "avg" in c]
    feat_col = feat_col + ["rating_diff"]

    skf = StratifiedKFold(n_splits=6, shuffle=True)

    lgb_param_grid = {'lgb__n_estimators': [100, 200, 300,],
                     'lgb__num_leaves':  [2, 4, 6, 10, 15, 20],
                     'lgb__subsample': [0.4, 0.6, 0.8, 1],
                     'lgb__colsample_bytree': [0.4, 0.6, 0.8, 1],
                     'lgb__is_unbalance': [False]}

    lgb_pipe = Pipeline([("lgb", lightgbm.LGBMClassifier())])

    lgb_cv = RandomizedSearchCV(lgb_pipe, 
                                param_distributions=lgb_param_grid, 
                                n_jobs=-1,
                                cv=skf, 
                                     n_iter=50,
                                     refit=True,
                                    )

    lgb_cv.fit(train_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                              "cumulative_lose", "cumulative_win",
                              "season", "date", "stage", "match_id", 'match_num'], axis=1),
                    train_df["result"])


    lgb_calib = CalibratedClassifierCV(lgb_cv, method='sigmoid', cv=5)

    lgb_calib.fit(calib_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                 "cumulative_lose", "cumulative_win",
                                 "season", "date", "stage", "match_id", 'match_num'], axis=1),
                  calib_df["result"])

    lgb_calib_pred = lgb_calib.predict_proba(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                                           "cumulative_lose", "cumulative_win",
                                                           "season", "date", "stage", "match_id", 'match_num'], axis=1))

    ll_05.append(log_loss(test_df["result"], lgb_calib_pred))




In [15]:
np.mean(ll_05)

0.978746809994982

In [18]:
y_true = test_df["result"].values
y_pred = lgb_calib.predict(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                         "cumulative_lose", "cumulative_win",
                                         "season", "date", "stage", "match_id", 'match_num'], axis=1))


print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.54      0.47      0.50       445
           0       0.00      0.00      0.00       350
           1       0.53      0.86      0.65       648

    accuracy                           0.53      1443
   macro avg       0.36      0.44      0.38      1443
weighted avg       0.40      0.53      0.45      1443



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
y_true = test_df["result"].values
y_pred = lgb_cv.predict(test_df.drop(["result", "win", "lose", "draw", "team_id", "m_rating",
                                      "cumulative_lose", "cumulative_win",
                                      "season", "date", "stage", "match_id", 'match_num'], axis=1))


print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.46      0.64      0.54       445
           0       0.50      0.01      0.02       350
           1       0.57      0.72      0.64       648

    accuracy                           0.52      1443
   macro avg       0.51      0.46      0.40      1443
weighted avg       0.52      0.52      0.46      1443

