In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split # train_test
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression,PassiveAggressiveClassifier,RidgeClassifier,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import LinearSVC, SVC,NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.metrics import log_loss

from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from IPython.display import Markdown, display

def printmd(string):
    # Print with Markdowns    
    display(Markdown(string))

import warnings
warnings.filterwarnings(action='ignore')

# 1. Data Description & Visualization

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')
sub_file = pd.read_csv("submission.csv")

In [3]:
train.head(2)

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,...,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,Outcome
0,2019,01/03/19,1979,Chinese Super League,Shandong Luneng,Guizhou Renhe,48.22,37.83,1.75,0.84,...,22.1,1.0,0.0,1.39,0.26,2.05,0.54,1.05,0.0,1
1,2019,01/03/19,1979,Chinese Super League,Shanghai Greenland,Shanghai SIPG,39.81,60.08,1.22,1.89,...,63.4,0.0,4.0,0.57,2.76,0.8,1.5,0.0,3.26,0


In [4]:
test.head(2)

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2021,14/12/21,2411,BPL,Arsenal,West Ham United,79.65,74.06,1.67,1.19,,,,,,,,,,
1,2021,14/12/21,2411,BPL,Brighton and Hove Albion,Wolverhampton,74.19,71.14,1.35,0.98,,,,,,,,,,


In [5]:
train.shape, test.shape, sub_file.shape

((7443, 21), (4008, 20), (4008, 1))

In [6]:
train.Outcome.value_counts()

1    5070
0    2373
Name: Outcome, dtype: int64

In [7]:
train.nunique()

season            3
date            424
league_id        39
league           39
Team 1          657
Team2           657
SPI1           2516
SPI2           2514
proj_score1     248
proj_score2     242
importance1     701
importance2     700
score1            8
score2            7
xg1             333
xg2             305
nsxg1           285
nsxg2           253
adj_score1      149
adj_score2      104
Outcome           2
dtype: int64

In [8]:
test.nunique()

season           1
date           115
league_id       24
league          24
Team 1         426
Team2          426
SPI1           412
SPI2           412
proj_score1    218
proj_score2    215
importance1      0
importance2      0
score1           0
score2           0
xg1              0
xg2              0
nsxg1            0
nsxg2            0
adj_score1       0
adj_score2       0
dtype: int64

In [9]:
train.columns

Index(['season', 'date', 'league_id', 'league', 'Team 1', 'Team2', 'SPI1',
       'SPI2', 'proj_score1', 'proj_score2', 'importance1', 'importance2',
       'score1', 'score2', 'xg1', 'xg2', 'nsxg1', 'nsxg2', 'adj_score1',
       'adj_score2', 'Outcome'],
      dtype='object')

In [10]:
train.rename(columns={'Team 1':'Team1'},inplace=True)
test.rename(columns={'Team 1':'Team1'},inplace=True)


## Features

In [11]:
features = list(set(train.columns)-set(['season','importance1', 'importance2',
       'score1', 'score2', 'xg1', 'xg2', 'nsxg1', 'nsxg2', 'adj_score1',
       'adj_score2', 'Outcome','date','league']))
target = 'Outcome'
len(features)

7

In [12]:
features

['Team1', 'SPI1', 'proj_score1', 'SPI2', 'proj_score2', 'Team2', 'league_id']

In [13]:
train[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7443 entries, 0 to 7442
Data columns (total 7 columns):
Team1          7443 non-null object
SPI1           7443 non-null float64
proj_score1    7443 non-null float64
SPI2           7443 non-null float64
proj_score2    7443 non-null float64
Team2          7443 non-null object
league_id      7443 non-null int64
dtypes: float64(4), int64(1), object(2)
memory usage: 407.2+ KB


In [14]:
from sklearn.preprocessing import LabelEncoder
for col in features:
    train.loc[:, col] = train[col].astype(str)
    
    lbl = LabelEncoder()
    lbl.fit(train[col])

    train.loc[:, col] = lbl.transform(train[col])
    


In [15]:
train.head()

Unnamed: 0,season,date,league_id,league,Team1,Team2,SPI1,SPI2,proj_score1,proj_score2,...,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,Outcome
0,2019,01/03/19,26,Chinese Super League,531,262,1587,1104,125,52,...,22.1,1.0,0.0,1.39,0.26,2.05,0.54,1.05,0.0,1
1,2019,01/03/19,26,Chinese Super League,532,533,1229,2029,72,157,...,63.4,0.0,4.0,0.57,2.76,0.8,1.5,0.0,3.26,0
2,2019,01/03/19,26,Chinese Super League,258,589,2187,1231,208,30,...,28.8,3.0,0.0,0.49,0.45,1.05,0.75,3.15,0.0,1
3,2019,01/03/19,26,Chinese Super League,649,73,779,1850,60,147,...,58.9,0.0,1.0,1.12,0.97,1.51,0.94,0.0,1.05,0
4,2019,01/03/19,26,Chinese Super League,138,259,1125,1262,107,92,...,21.3,2.0,2.0,2.77,3.17,1.05,2.08,2.1,2.1,1


In [16]:
train[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7443 entries, 0 to 7442
Data columns (total 7 columns):
Team1          7443 non-null int32
SPI1           7443 non-null int32
proj_score1    7443 non-null int32
SPI2           7443 non-null int32
proj_score2    7443 non-null int32
Team2          7443 non-null int32
league_id      7443 non-null int32
dtypes: int32(7)
memory usage: 203.6 KB


In [17]:
for col in features:
    test.loc[:, col] = test[col].astype(str)
    
    lbl = LabelEncoder()
    lbl.fit(test[col])

    test.loc[:, col] = lbl.transform(test[col])

In [18]:
test[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4008 entries, 0 to 4007
Data columns (total 7 columns):
Team1          4008 non-null int32
SPI1           4008 non-null int32
proj_score1    4008 non-null int32
SPI2           4008 non-null int32
proj_score2    4008 non-null int32
Team2          4008 non-null int32
league_id      4008 non-null int32
dtypes: int32(7)
memory usage: 109.7 KB


## Cross Validation

In [19]:
def metric(a, b):
    return log_loss(a, b)

def cross_valid(model, train, features, target, cv=3):
    results = cross_val_predict(model, train[features], train[target], method="predict_proba", cv=cv)
    return metric(train[target], results)

In [20]:
models = [
    LGBMClassifier(), 
    XGBClassifier(verbosity = 0),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    LogisticRegression(),   
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    MLPClassifier(),
    GaussianNB(),
    BaggingClassifier()    
         ]
for i in models:
    model = i
    error = cross_valid(model,train,features,target,cv=10)
    print(str(model).split("(")[0], error)

LGBMClassifier 0.03199033031773276
XGBClassifier 0.031056475965232595
GradientBoostingClassifier 0.044421901030699935
RandomForestClassifier 0.06634931313868325
AdaBoostClassifier 0.5633901302602827
LogisticRegression 0.038222845541017375
KNeighborsClassifier 0.45571312329322666
DecisionTreeClassifier 0.5336503137733084
ExtraTreeClassifier 2.4594318808682876
MLPClassifier 0.12819326031581643
GaussianNB 0.1353264368609654
BaggingClassifier 0.06397249266750554


In [21]:
trainX, validX, trainY, validY = train_test_split(train[features],
                                                  train[target],
                                                  test_size=0.2,
                                                  stratify=train[target],
                                                  random_state=13
                                                 )

## Prediction on test dataset

In [22]:
model = XGBClassifier(learning_rate=0.01,
                       n_estimators=5000,
                       max_depth=8,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42
                     )
model.fit(trainX[features], trainY)
y_pred_valid = model.predict_proba(validX[features])
print("Validation Score: ",metric(validY, y_pred_valid))


Validation Score:  0.0783717048659871


In [23]:
y_pred_test = model.predict_proba(test[features])[:, 1]
result = pd.DataFrame({"Outcome":y_pred_test})
#result.to_csv("submission_xgb.csv", index=False)
result.head()

Unnamed: 0,Outcome
0,1.004207
1,0.95032
2,0.13833
3,1.004141
4,0.077395


In [28]:
result.to_csv("submission_xgb.csv", index=False)
