In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
# NOTE(review): this silences every warning, including deprecation notices from
# pandas / scikit-learn that may explain behavior changes between versions.
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: a plain string comparison is lexicographic
# and would, for example, rank "0.9" above "0.20".
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None and tuple(map(int, _sklearn_version.groups())) >= (0, 20)

# Common imports
import joblib
import copy
import pandas as pd
import numpy as np
import os
from time import time
import heapq


# Project root = parent of the current working directory
# (os.path.abspath('') resolves to the directory the kernel was started in).
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
# Seeds NumPy's global RNG, which estimators created without an explicit
# random_state fall back to.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# prepare data 
from sklearn.model_selection import train_test_split, cross_val_predict, RepeatedStratifiedKFold
from sklearn.cluster import OPTICS, DBSCAN, Birch
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.preprocessing import StandardScaler

# model
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# evaluate
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Empty score tables (one per dataset variant); train_predict() appends one row per model.
_RESULT_COLUMNS = ['name', 'train_acc', 'train_f1', 'train_roc_auc', 'test_acc', 'test_f1', 'test_roc_auc']
lda_table, pca_table = (pd.DataFrame(columns=_RESULT_COLUMNS) for _ in range(2))


# Utils

In [None]:
def stratify(df, frac, random_state,
             seasons=('2019/20', '2020/21', '2021/22'),
             results=('win', 'lose', 'draw')):
    """Hold out a fraction of every (season, result) stratum as the test set.

    Parameters
    ----------
    df : DataFrame whose last two columns are the target and the season
        (the target is read from column -2, features from everything before it)
        and which has 'season' and 'result' columns.
    frac : fraction of each stratum moved to the test set.
    random_state : seed passed to every per-stratum ``sample`` call.
    seasons, results : strata to sample from; rows outside these values
        always stay in the training set.

    Returns
    -------
    (X_train, X_test, y_train, y_test)

    The training set is "every row that was not sampled", identified by row
    position. The previous ``concat(...).drop_duplicates(keep=False)`` trick
    compared row *values*, so rows that were genuinely duplicated in ``df``
    vanished from the training set as well.
    """
    # Work on a positionally indexed copy so sampled labels are row positions,
    # regardless of what index ``df`` carries.
    positional = df.reset_index(drop=True)
    test_positions = []
    for season in seasons:
        for result in results:
            stratum = positional[(positional['season'] == season) & (positional['result'] == result)]
            picked = stratum.sample(frac=frac, random_state=random_state)
            test_positions.extend(picked.index)

    test_positions = np.asarray(test_positions, dtype=int)
    is_test = np.zeros(len(df), dtype=bool)
    is_test[test_positions] = True

    df_test = df.iloc[test_positions]
    df_train = df.iloc[~is_test]
    return df_train.iloc[:, :-2], df_test.iloc[:, :-2], df_train.iloc[:, -2], df_test.iloc[:, -2]

def train_classifier(clf, X, y):
    """Fit ``clf`` on ``(X, y)`` in place and print how long the fit took."""
    started_at = time()
    clf.fit(X, y)
    elapsed = time() - started_at

    # Report the wall-clock fitting time
    print ("Trained model in {:.4f} seconds".format(elapsed))
    
def predict_labels(clf, features, target, cross_validate=True):
    """Score a classifier on ``(features, target)``.

    Parameters
    ----------
    clf : estimator exposing ``predict`` and ``predict_proba``.
    features, target : data to score on; ``target`` is squeezed to 1-D.
    cross_validate : when True (default, the historical behavior) predictions
        are out-of-fold predictions from 3-fold ``cross_val_predict``, which
        clones and refits ``clf`` on ``features`` and ignores any previous fit.
        When False the already fitted ``clf`` is used directly.

    Returns
    -------
    (weighted F1, accuracy, macro one-vs-one ROC AUC)
    """
    y_true = target.squeeze()
    if cross_validate:
        y_proba = cross_val_predict(clf, features, y_true, cv=3, method='predict_proba')
        y_label = cross_val_predict(clf, features, y_true, cv=3, method='predict')
    else:
        y_proba = clf.predict_proba(features)
        y_label = clf.predict(features)
    return (f1_score(y_true, y_label, average='weighted'),
            accuracy_score(y_true, y_label),
            roc_auc_score(y_true, y_proba, multi_class='ovo', average='macro'))


def train_predict(name, clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training set, score it, and log the scores.

    Training scores are 3-fold cross-validated on the training data. Test
    scores come from the classifier fitted on the training data; previously
    the test set was itself cross-validated, so the fitted model was never
    evaluated on held-out data.

    One row is appended to the global ``lda_table`` when ``name`` contains
    'lda', otherwise to ``pca_table``.
    """
    global lda_table, pca_table
    # Indicate the classifier and the training set size
    print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Print the results of prediction for both training and testing
    f1tr, acctr, roctr = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score and roc score for training set: {:.4f} , {:.4f} , {:.4f}.".format(f1tr , acctr, roctr))

    f1te, accte, rocte = predict_labels(clf, X_test, y_test, cross_validate=False)
    print("F1 score and accuracy score and roc score for test set: {:.4f} , {:.4f} , {:.4f}.".format(f1te , accte, rocte))

    row = [[name, acctr, f1tr, roctr, accte, f1te, rocte]]
    if 'lda' in name:
        lda_table = pd.concat([lda_table, pd.DataFrame(row, columns=lda_table.columns)], ignore_index=True)
    else:
        pca_table = pd.concat([pca_table, pd.DataFrame(row, columns=pca_table.columns)], ignore_index=True)

def tuning(clf,param_dict,X_train, y_train, n_iter=50,cv=5,scoring="roc_auc_ovo",random_state=42,verbose=0):
    """Bayesian hyper-parameter search for ``clf``.

    Search dimensions are the entries of ``param_dict`` keyed
    ``"<ClassName>_<param>"`` for parameters that ``clf`` actually has.
    Returns ``clf`` itself, untouched, when no entry matches; otherwise
    returns the fitted ``BayesSearchCV`` object.
    """
    estimator_name = str(clf.__class__.__name__)
    prefix = estimator_name + '_'
    search_spaces = {
        param: param_dict[prefix + param]
        for param in clf.get_params().keys()
        if (prefix + param) in param_dict.keys()
    }
    if not search_spaces:
        return clf

    search = BayesSearchCV(
        clf, search_spaces,
        n_iter=n_iter, cv=cv, scoring=scoring,
        random_state=random_state, n_jobs=-1, verbose=verbose,
    )
    search.fit(X_train, y_train.squeeze())

    print(estimator_name +" best score :", search.best_score_)
    print(estimator_name +" best params:", search.best_params_)
    return search

# Setup to save/load the model
# NOTE(review): NOTE_ROOT_DIR was referenced below but never defined in this
# notebook, so both helpers raised NameError. It is assumed here to be the
# notebook's own directory (the child of PRJ_ROOT_DIR) -- confirm.
NOTE_ROOT_DIR = os.path.abspath('')


def save_model(model, id_):
    """Pickle ``model`` to ``<NOTE_ROOT_DIR>/models/<id_>.pkl``, creating the folder if needed."""
    print("Saving model", id_)
    model_dir = os.path.join(NOTE_ROOT_DIR, "models")
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, id_ + ".pkl"))


def load_model(id_):
    """Load and return the model stored at ``<NOTE_ROOT_DIR>/models/<id_>.pkl``.

    joblib files are pickles: only load files you produced yourself.
    """
    print("Loading model", id_)
    return joblib.load(os.path.join(NOTE_ROOT_DIR, "models", id_ + ".pkl"))


## 1. Load data

### LDA Data

In [None]:
# Load the LDA variant of the integrated match table.
lda_matches_csv = os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "lda", "matches.csv")
df = pd.read_csv(lda_matches_csv)
# Dataset tag (prefix of result names) and the DBSCAN eps used by the clustering cell.
para = dict(name='lda', eps=8.5)
df

### PCA Data

In [None]:
# Load the PCA variant of the integrated match table.
# Running this cell rebinds `df` and `para`, replacing whatever the LDA cell loaded.
pca_matches_csv = os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "pca", "matches.csv")
df = pd.read_csv(pca_matches_csv)
# Dataset tag (prefix of result names) and the DBSCAN eps used by the clustering cell.
para = dict(name='pca', eps=30)
df

### 1.1. Manage Empty Positions' Statistics

In [None]:
# -100 marks a missing statistic; replace it, per column, with the smallest
# value actually observed in that column.
SENTINEL = -100
for col in df.columns:
    is_missing = df[col] == SENTINEL
    if not is_missing.any():
        continue
    observed = df.loc[~is_missing, col]
    if observed.empty:
        # Every value is the sentinel: there is nothing to impute from.
        continue
    # Assign through .loc on the frame itself; `df[col].replace(..., inplace=True)`
    # acts on a temporary Series and is not guaranteed to write back to `df`.
    df.loc[is_missing, col] = observed.min()
df


## 2. Scale

In [None]:
# Standardize every column from the third onward; the first two columns are
# left out of scaling, and 'home_result' / 'season' are re-attached unscaled
# as the last two columns ('result', 'season').
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics leak into the training features -- consider fitting on
# the training rows only.
feature_cols = df.columns[2:]
scaler = StandardScaler()
df_scale = pd.DataFrame(
    scaler.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,  # keep rows aligned with `df` for the column assignment below
)
df_scale[['result','season'] ] = df[['home_result', 'season']]
df_scale


## 2. Train set & test set

In [None]:
# df = df_scale
# X_train, X_test, y_train, y_test = stratify(df, 0.2,42)

In [None]:
# Random 80/20 split; everything except the last two columns is a feature,
# and the string outcome is encoded as an integer class label.
df = df_scale
label_codes = {'lose':0, 'draw': 1, 'win': 2}
feature_frame = df[df.columns[:-2]]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, df['result'], test_size=0.2, random_state=42
)
y_train, y_test = (labels.replace(label_codes) for labels in (y_train, y_test))

In [None]:
# Report how many rows ended up on each side of the split.
for label, frame in (('Train size :', X_train), ('Test size: ', X_test)):
    print(label, len(frame))

In [None]:
# Display the training features (rich DataFrame output of the cell).
X_train

## 3. Outlier removal

### 3.2. Isolation Forest

In [None]:
# Drop the training rows that an Isolation Forest flags as anomalies (label -1).
# random_state is fixed so the set of removed rows does not depend on how much
# of NumPy's global RNG earlier cells have consumed.
model = IsolationForest(n_estimators=50, max_samples='auto', contamination=0.1,
                        max_features=1.0, random_state=42)
is_inlier = model.fit_predict(X_train) != -1

# Build new frames instead of adding a helper column and dropping in place on
# the slice returned by train_test_split.
X_train = X_train.loc[is_inlier].reset_index(drop=True)
y_train = y_train.loc[is_inlier].reset_index(drop=True)

assert len(X_train) == len(y_train)

X_train

### 3.3. Clustering

In [None]:
# Remove the training rows DBSCAN labels as noise (cluster id -1), using the
# dataset-specific eps stored in `para`.
cluster_labels = DBSCAN(eps=para['eps'], min_samples=2).fit_predict(X_train)
in_cluster = cluster_labels != -1

X_train = X_train.loc[in_cluster].reset_index(drop=True)
y_train = y_train.loc[in_cluster].reset_index(drop=True)

X_train

### 3.4. Boxplot

In [None]:
# Box-plot (IQR fence) outlier screen.
# For every row, count the columns in which it lies on or beyond a fence
# (Q1 - 1.5*IQR or Q3 + 1.5*IQR), then drop the rows whose count is one of the
# two largest distinct counts.
q1 = X_train.quantile(0.25, interpolation='midpoint')
q3 = X_train.quantile(0.75, interpolation='midpoint')
iqr = q3 - q1

above_fence = X_train.ge(q3 + 1.5 * iqr, axis=1)
below_fence = X_train.le(q1 - 1.5 * iqr, axis=1)
# Upper and lower hits are counted separately, as before.
frq = (above_fence.sum(axis=1) + below_fence.sum(axis=1)).values

# A count of 0 means "never beyond a fence" and must not qualify as a top count;
# otherwise a frame with at most two distinct counts would lose every row.
worst_counts = heapq.nlargest(2, set(frq.tolist()) - {0})
is_outlier = np.isin(frq, worst_counts)

X_train = X_train.loc[~is_outlier].reset_index(drop=True)
y_train = y_train.loc[~is_outlier].reset_index(drop=True)
assert len(X_train) == len(y_train)
X_train

## 6. Model

### 6.2. Attribute selection

In [None]:
# Rank features by the importances of a Bayesian-tuned random forest and keep
# those above a threshold typed in by the user.
# NOTE(review): the interactive input() blocks "Run All" and makes the selected
# features non-reproducible; `idx` is also reassigned by the next cell.
model = RandomForestClassifier()  # swap in XGBClassifier() to use the XGB ranges below
param_dict = {
    "XGBClassifier_eta": (0.01, 0.2, "uniform"),
    "XGBClassifier_min_child_weight": (1, 20),
    "XGBClassifier_max_depth": (3, 10),

    "RandomForestClassifier_n_estimators": (5, 500),
    "RandomForestClassifier_criterion": ["gini", "entropy"],
    "RandomForestClassifier_max_depth": (1, 19),  # 19 overfits the data
    "RandomForestClassifier_min_samples_split": (2, 20),
    "RandomForestClassifier_max_features": [None, "sqrt", "log2"],
    "RandomForestClassifier_max_leaf_nodes": (2, 159),
    "RandomForestClassifier_min_impurity_decrease": (1e-6, 0.5, "uniform"),
    "RandomForestClassifier_max_samples": (0.5, 1.0, "uniform"),
}
model_t = tuning(model, param_dict, X_train, y_train)

# Refit the best configuration and read its importances
best_forest = model_t.best_estimator_
best_forest.fit(X_train, y_train)
importance = best_forest.feature_importances_

# One bar per feature position
plt.figure(figsize=(30, 40))
plt.bar(list(range(len(importance))), list(importance))
plt.show()

thes = float(input('Thres: '))
# Positions of the features whose importance exceeds the threshold
idx = [i for i, v in enumerate(importance) if v > thes]
for i in idx:
    print('Feature: %0d, Score: %.5f' % (i, importance[i]))
print(len(idx), idx)

In [None]:
# Score each feature as (correlation with the target) minus (its mean
# correlation with all features), and keep the positions of the 60 best.
# NOTE(review): correlations are signed, so strongly negative predictors rank
# last -- confirm this is intended.
corr_target = X_train.corrwith(y_train)
corr_other = X_train.corr()
corr_table = corr_target - corr_other.mean(axis=1)
attribute = corr_table.nlargest(60)
column_order = list(X_train.columns)
idx = [column_order.index(col_name) for col_name in attribute.index]

In [None]:
# Keep only the selected feature positions in both splits.
# Not idempotent: `idx` refers to positions in the frames as they were before this cell ran.
X_train = X_train.iloc[:, idx]
X_test = X_test.iloc[:, idx]
X_train

### 6.3. Models

#### 6.3.1. Logistic Regression

In [None]:
# Baseline: multinomial logistic regression, default hyper-parameters.
lr = LogisticRegression(multi_class="multinomial", random_state=42)
train_predict(name=para['name']+'_lr_no_tune', clf=lr,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

#### 6.3.4. Decision Tree

In [None]:
# Baseline: depth-limited decision tree.
dt = DecisionTreeClassifier(random_state=42, max_depth=10)
train_predict(name=para['name']+'_dt_no_tune', clf=dt,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

#### 6.3.5. Random Forest

In [None]:
# Baseline: random forest, default hyper-parameters.
rf = RandomForestClassifier(random_state=42)
train_predict(name=para['name']+'_rf_no_tune', clf=rf,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

#### 6.3.6. Ada Boost

In [None]:
# Baseline: AdaBoost. Seeded like the other stochastic baselines so its scores
# do not depend on the state of NumPy's global RNG.
ab = AdaBoostClassifier(learning_rate=0.7, n_estimators=100, random_state=42)
train_predict(para['name']+'_ab_no_tune',ab, X_train, y_train, X_test, y_test)

#### 6.3.7. Gradient Boosting

In [None]:
# Baseline: gradient boosting with a fixed learning rate.
gb = GradientBoostingClassifier(random_state=42, learning_rate=0.7)
train_predict(name=para['name']+'_gb_no_tune', clf=gb,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

#### 6.3.8. Support Vector Classification

In [None]:
# Baseline: SVC; probability=True enables predict_proba, which the ROC AUC score needs.
svc = SVC(random_state=42, probability=True)
train_predict(name=para['name']+'_svc_no_tune', clf=svc,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

#### 6.3.9. Simple Neural Network

In [None]:
# Baseline: multi-layer perceptron, default architecture.
nn = MLPClassifier(random_state=42)
train_predict(name=para['name']+'_nn_no_tune', clf=nn,
              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

### 6.4. Hyperparameter tuning

##### 6.4.0. Parameter

In [None]:
# Search spaces for tuning(). Keys are "<EstimatorClassName>_<parameter>";
# tuning() keeps only the entries whose prefix matches the estimator's class
# name and passes the values to BayesSearchCV as search dimensions.
# Estimators with no entry here (LogisticRegression, DecisionTreeClassifier,
# MLPClassifier) are returned by tuning() unchanged.
param_dict = {  
    "RandomForestClassifier_n_estimators": (5, 500), 
    "RandomForestClassifier_criterion": ["gini", "entropy"],
    "RandomForestClassifier_max_depth": (1, 19), # 19 overfits the data
    "RandomForestClassifier_min_samples_split": (2, 20),
    "RandomForestClassifier_max_features": [None, "sqrt", "log2"],
    "RandomForestClassifier_max_leaf_nodes": (2, 159),
    "RandomForestClassifier_min_impurity_decrease": (1e-6, 0.5, "uniform"),
    "RandomForestClassifier_max_samples": (0.5, 1.0, "uniform"),
        
    "GradientBoostingClassifier_n_estimators": (2, 100),
    "GradientBoostingClassifier_learning_rate": Real(low=0.001, high=3, prior="uniform"),
    "GradientBoostingClassifier_subsample": Real(low=0.05, high=1.0, prior="uniform"),
    "GradientBoostingClassifier_criterion": ["friedman_mse", "squared_error"],
    "GradientBoostingClassifier_min_samples_split": Real(low=1e-6, high=1.0, prior="uniform"),
    "GradientBoostingClassifier_max_depth": (1, 10),
    "GradientBoostingClassifier_min_impurity_decrease": Real(low=1e-6, high=0.5, prior="uniform"),
    "GradientBoostingClassifier_max_features": [None, "sqrt", "log2"],
    "GradientBoostingClassifier_max_leaf_nodes": (2, 100),
                  
    "AdaBoostClassifier_n_estimators": (2, 500),
    "AdaBoostClassifier_learning_rate": Real(low=0.001, high=3,  prior='uniform'),
                  
    "SVC_C": Real(low=1e-6, high=2, prior="uniform"),
    "SVC_kernel": ["linear", "poly", "rbf", "sigmoid"],
    "SVC_degree": (2, 30),
    "SVC_gamma": ["scale", "auto"]
    }

In [None]:
# Switch read by the tuning cells below: True takes the load_model() branch,
# False takes the tuning() + save_model() branch.
load = False

#### 6.4.1. Logistic Regression

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `lr_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    lr_t = load_model("lr_t")
else:
    tuned = tuning(lr,param_dict,X_train, y_train)
    # tuning() returns the estimator itself when it has no search space, and a
    # search object otherwise; keep the plain estimator in both cases.
    lr_t = getattr(tuned, "best_estimator_", tuned)
    save_model(lr_t,"lr_t")
train_predict(para['name']+'_lr_tune',lr_t, X_train, y_train, X_test, y_test)

#### 6.4.4. Decision Tree

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `dt_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    dt_t = load_model("dt_t")
else:
    tuned = tuning(dt,param_dict,X_train, y_train)
    # tuning() returns the estimator itself when it has no search space, and a
    # search object otherwise; keep the plain estimator in both cases.
    dt_t = getattr(tuned, "best_estimator_", tuned)
    save_model(dt_t,"dt_t")
train_predict(para['name']+'_dt_tune',dt_t, X_train, y_train, X_test, y_test)

#### 6.4.5. Random Forest

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `rf_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    rf_t = load_model("rf_t")
else:
    rf_t = tuning(rf,param_dict,X_train, y_train).best_estimator_
    save_model(rf_t,"rf_t")
train_predict(para['name']+'_rf_tune',rf_t, X_train, y_train, X_test, y_test)

#### 6.4.6. Ada Boost

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `ab_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    ab_t = load_model("ab_t")
else:
    ab_t = tuning(ab,param_dict,X_train, y_train).best_estimator_
    save_model(ab_t,"ab_t")
train_predict(para['name']+'_ab_tune',ab_t, X_train, y_train, X_test, y_test)

#### 6.4.7. Gradient Boosting

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `gb_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    gb_t = load_model("gb_t")
else:
    gb_t = tuning(gb,param_dict,X_train, y_train).best_estimator_
    save_model(gb_t,"gb_t")
train_predict(para['name']+'_gb_tune',gb_t, X_train, y_train, X_test, y_test)

#### 6.4.8. Support Vector Classification

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `svc_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    svc_t = load_model("svc_t")
else:
    svc_t = tuning(svc,param_dict,X_train, y_train).best_estimator_
    save_model(svc_t,"svc_t")
train_predict(para['name']+'_svc_tune',svc_t, X_train, y_train, X_test, y_test)

#### 6.4.9. Simple Neural Network

In [None]:
# load = False
# `if load =  True:` was a SyntaxError, and the loaded model was discarded,
# leaving `nn_t` undefined on the load path.
# NOTE(review): the model id does not include para['name'], so LDA and PCA runs
# share one file -- confirm this is intended.
if load:
    nn_t = load_model("nn_t")
else:
    tuned = tuning(nn,param_dict,X_train, y_train)
    # tuning() returns the estimator itself when it has no search space, and a
    # search object otherwise; keep the plain estimator in both cases.
    nn_t = getattr(tuned, "best_estimator_", tuned)
    save_model(nn_t,"nn_t")
train_predict(para['name']+'_nn_tune',nn_t, X_train, y_train, X_test, y_test)

## 7. Summary

### LDA

In [None]:
# Scores logged by train_predict() for runs whose name contains 'lda'.
lda_table

In [None]:
# Per-class precision/recall of the baseline random forest on the test split;
# the names follow the label encoding lose=0, draw=1, win=2.
rf_test_pred = rf.predict(X_test)
print(classification_report(y_test, rf_test_pred, target_names=["lose","draw","win"]))

In [None]:

# Error-only confusion matrix of the baseline random forest: rows are
# normalized to rates and the diagonal (correct predictions) is blanked.
# NOTE(review): train and test rows are pooled, so rows the forest was fitted
# on are included -- confirm this is intended.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
conf_mx = confusion_matrix(y, rf.predict(X))
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = np.true_divide(conf_mx, row_sums)

np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap="gray")

### PCA

In [None]:
# Scores logged by train_predict() for runs whose name does not contain 'lda'.
pca_table

In [None]:
# Per-class precision/recall of the baseline SVC on the test split.
# target_names must follow the sorted label order of the encoding used at the
# split (lose=0, draw=1, win=2); the previous ["draw", "lose", "win"] swapped
# the first two class names.
print(classification_report(y_test, svc.predict(X_test), target_names=["lose", "draw", "win"]))

In [None]:
# Error-only confusion matrix of the baseline SVC: rows are normalized to
# rates and the diagonal (correct predictions) is blanked.
# NOTE(review): train and test rows are pooled, so rows the SVC was fitted on
# are included -- confirm this is intended.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
conf_mx = confusion_matrix(y, svc.predict(X))
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = np.true_divide(conf_mx, row_sums)

np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap="gray")