In [33]:
#imports

import os
import sys
import pandas as pd
import numpy as np
import time
import seaborn as sns
from datetime import datetime as dt
from datetime import timedelta
import datetime
import re
import joblib
from urllib.error import HTTPError

import eli5
from eli5.sklearn import PermutationImportance
from matplotlib.colors import ListedColormap

from sklearn.experimental import enable_halving_search_cv # noqa
import warnings
from sklearn import tree
import statsmodels.formula.api as smf
from statsmodels.miscmodels.ordinal_model import OrderedModel
import sklearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, validation_curve, learning_curve, HalvingGridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_score, accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler, scale, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
from sklearn.feature_selection import RFE

from scipy.stats import poisson
from matplotlib import pyplot as plt

from bs4 import BeautifulSoup

import requests
import itertools as it

In [34]:
def result(score):
    score = re.split('[– -]',score)
    if any(c.isalpha() for c in score[1]):
        score[1] = str(datetime.datetime.strptime(score[1], '%b').month)
        score[0] = score[0].strip('0')
    gd = int(score[0]) - int(score[1])
    
    if gd < 0:
        return "Away"
    if gd > 0:
        return "Home"
    if gd == 0:
        return "Draw"
    return

In [35]:
def prep(df):
    features = df.drop(['results', 'date', 'home_shot', 'away_shot', 'home_sot', 'away_sot', 'home_red', 'away_red',
                        'home_yellow', 'away_yellow', 'home_goals', 'away_goals', 'ht_home_goals', 'ht_away_goals',
                        'home_corner', 'away_corner', 'ht_result', 'home_foul', 'away_foul', 'home', 'away'], axis = 1)
    y = df['results'].replace(['Home', 'Draw', 'Away'], range(3)).astype(int)

    return features, y

In [36]:
#optimisation

def optimise(clf, params, X, y, name, metric, rfe, league):
    start = time.perf_counter()
    if rfe:
        params = {'estimator__'+ k:v for k, v in params.items()}
        selector = RFE(clf, step = 5)
        grid = GridSearchCV(selector, params, scoring = metric, n_jobs = -1, cv = 2)
        
    else:
        grid = GridSearchCV(clf, params, scoring = metric, n_jobs = -1, cv = 2)
    grid.fit(X, y)
    end = time.perf_counter()
    t = (end - start) / 60
    joblib.dump(grid, 'Models/' + league + '_' + name + '.pk1')
    return grid.best_score_, grid, t

def grid(X_input, y, clf, scoring, rfe, league):
    best = []
    for i in clf:
        score, best_clf, t = optimise(i[0], i[1], X_input, y, i[2], scoring, rfe, league)
        print(i[2] + ' _____________ ' + str(score) + ' took ' + str(t))
        print(best_clf.best_estimator_)
        if score <= 0:
            score = score + 1
        best.append([best_clf, score])
    return best

In [37]:
#history prediction
def histData(league):
    hist = pd.read_csv(league + '.csv', encoding = "ANSI")
    hist = hist[['Home Team', 'Away Team', 'Score']]
    hist = hist.astype({'Score': 'string'})

    #Score
    hist['FTR'] = hist['Score'].apply(result).astype('string')

    hist.columns = ['Home', 'Away', 'Score', 'FTR']
    return hist

def replaceName(name):
    names = {
        'Hellas Verona': 'HELLAS',
        
        
        'Leeds United': 'LEEDS UTD',
        'Leicester City': 'LEICESTER CITY',
        'Manchester City': 'MAN CITY',
        'Manchester Utd': 'MAN UTD',
        'Newcastle Utd': 'NEWCASTLE',
        'Norwich City': 'NORWICH',
        
        'Paris S-G': 'PSG',
        'Lens': 'RC LENS',
        'Metz': 'FC METZ',
        'Reims': 'STADE DE REIMS',
        'Rennes': 'STADE RENNAIS',
        'Saint-Étienne': 'SAINT-Ã‰TIENNE',
        
        'Bayern Munich': 'BAYERN',
        'Eint Frankfurt': 'FRANKFURT',
        'Hertha BSC': 'HERTHA',
        'Köln': '1. FC KÃ–LN',
        'Mainz 05': 'MAINZ',
        'Arminia': 'BIELEFELD', 
        
        'Alavés': 'ALAVÃ‰S',
        'Athletic Club': 'ATHLETIC',
        'Atlético Madrid': 'ATLETICO MADRID',
        'Betis': 'REAL BETIS',
        'Celta Vigo': 'CELTA',
        'Cádiz': 'CÃ\x81DIZ CF',
        'Sevilla': 'SEVILLA FC'      
    }
    if name in names.keys():
        return names[name]
    else:
        return name.upper()

def histPredict(data):    
    predictions = []
    for i in data.index:
        pred = predict(replaceName(data.iloc[i]['home']), replaceName(data.iloc[i]['away']))
        predictions.append(pred)
        #print(type(pred), pred)
    score = accuracy_score(data['results'], predictions)
    return score

def predict(team1, team2):
    h2h = hist.query('Home == @team1 & Away == @team2').tail(1)
    if len(h2h) == 0:
        return 0
    result = h2h.FTR.iloc[0]
    if result == 'Home':
        return 0
    if result == 'Draw':
        return 1
    if result == 'Away':
        return 2

In [38]:
#poisson predection
def poissonPred(team1, team2, endDate, xg, df):
    df3 = df.query('date <= @endDate')
    
    games = len(df3)
    
    homeGames = df3.query('home == @team1')
    awayGames = df3.query('away == @team2')
    
    if xg == False: 
        homeGoals = df3['home_goals'].sum()
        awayGoals = df3['away_goals'].sum()

        team1Goals = homeGames['home_goals'].sum()
        team2Goals = awayGames['away_goals'].sum()
        
        team1GoalsA = homeGames['away_goals'].sum()
        team2GoalsA = awayGames['home_goals'].sum()
    
    else: 
        homeGoals = df3['home_xg'].sum()
        awayGoals = df3['away_xg'].sum()

        team1Goals = homeGames['home_xg'].sum()
        team2Goals = awayGames['away_xg'].sum()
        
        team1GoalsA = homeGames['away_xg'].sum()
        team2GoalsA = awayGames['home_xg'].sum()

    team1Games = len(homeGames)
    team2Games = len(awayGames)    
    
    avgHomeGoals = homeGoals / games
    avgAwayGoals = awayGoals / games

            
    team1AttStr = (team1Goals/team1Games)/avgHomeGoals
    team2AttStr = (team2Goals/team2Games)/avgAwayGoals
    team1DefStr = (team1GoalsA/team1Games)/avgAwayGoals
    team2DefStr = (team2GoalsA/team2Games)/avgHomeGoals
    team1ExpGoal = team1AttStr * team2DefStr * avgHomeGoals
    team2ExpGoal = team2AttStr * team1DefStr * avgAwayGoals
    
    team1Table = []
    score = 5
    for i in range(score):
        team1Table.append(poisson.pmf(i, team1ExpGoal))
    
    team2Table = []
    for i in range(score):
        team2Table.append(poisson.pmf(i, team2ExpGoal))
    
    probs = np.array([team1Table]).T.dot(np.array([team2Table]))
    home = 0
    draw = 0
    away = 0
    for i in range(score):
        for j in range(score):
            if i > j:
                home += probs[i][j]
            if i == j:
                draw += probs[i][j]
            if i < j:
                away += probs[i][j]

    if np.isnan(home):
        home = 0
    if np.isnan(away):
        away = 0
    if np.isnan(draw):
        draw = 0
    return [home, draw, away]

def poiAcc(data, xg):
    predictions = []
    for i in data.index:
        probs = poissonPred(data.iloc[i]['home'], data.iloc[i]['away'], data.iloc[-1]['date'], xg, data)
        pred = np.array(probs).argmax()
        predictions.append(pred)
    score = accuracy_score(data['results'], predictions)
    return score

In [39]:
def roundPred(num):
    if abs(num - 1) < 0.33:
        return 1
    if num >= 1:
        return 2
    return 0

In [40]:
def importance(model):
    resultGrid = joblib.load(j + model + '.pk1')
    perm = PermutationImportance(resultGrid.best_estimator_, random_state=1).fit(X_train, y_train)
    
    shap.initjs()
    explainer = shap.TreeExplainer(resultGrid.best_estimator_.fit(X_train, y_train))
    shap_values = explainer.shap_values(X_train)
    shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
    shap.summary_plot(shap_values, X_train, plot_type="bar")
    return eli5.show_weights(perm, feature_names = dummies(X).columns.tolist())

In [41]:
#plot
def plot_validation_curve(est, X, y, p_name, p_range, title):
    train_scores, test_scores = validation_curve(estimator=est,
                                             X=X, y=y,
                                             cv=5,
    param_name=p_name, param_range=p_range)
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    plt.plot(p_range, train_mean,
             color='blue', label='Training Accuracy')
    plt.plot(p_range, test_mean,
             color='green', label='Validation Accuracy')
    plt.xlabel(p_name)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.grid()
    plt.show()

def plot_decision_boundaries(X, y, model, title):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    colors = {0.0:'blue', 1.0:'darkcyan', 2.0:'green'}  
    print(Z.shape)
    print(Z.ravel().shape)
    print(X[:, 0].shape)
    print(X[:, 1].shape)
    
    colored_labels = np.array([colors[xi] for xi in y]) #Z.ravel()
    plt.contourf(xx, yy, Z, cmap='viridis')


    cb = plt.colorbar()
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(title)
    plt.scatter(X[:, 0], X[:, 1], c=colored_labels, cmap='viridis', s=10)

    plt.show()

In [42]:
#models
def classifiers(feat):
    
    random = [RandomForestClassifier(max_features = None), { 
            'n_estimators': range(1, 32, 10),
            'max_depth' : range(2, 5),
            'criterion' :['gini', 'entropy']}, 'random']
    
    decision = [DecisionTreeClassifier(), {
            'criterion' :['gini', 'entropy']}, 'decision']
    
    logis = [LogisticRegression(max_iter = 1000, penalty = 'l2'), {
            'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'C' : [0.01,0.1, 1]}, 'logistic']

    knn = [KNeighborsClassifier(), {
            'n_neighbors': range(15, 40, 4),
            'weights': ['uniform', 'distance']}, 'knn']

    svc_linear = [svm.SVC(kernel = 'linear', tol = 0.01), {
            }, 'svc_linear']
    
    svc_sigmoid = [svm.SVC(kernel = 'sigmoid'), {
            'C': np.logspace(-2, 1, 5),
            'gamma': np.logspace(-4, 1, 5)}, 'svc_sigmoid']
    
    svc = [svm.SVC(), {
            'C': np.logspace(-1, 2, 5),
            'gamma': np.logspace(-4, 0, 5)}, 'svc']

    xg = [xgb.XGBClassifier(eval_metric = 'merror', use_label_encoder=False), {
            'min_child_weight': [2, 4],
            'max_depth': [1, 2, 3],
            'n_estimators': range(50, 251, 50),}, 'xgboost']
    
    bayes = [GaussianNB(), {
            'var_smoothing': np.logspace(-2 ,1, 3)}, 'bayes']
    
    MLP = [MLPClassifier(max_iter = 10000, tol = 0.001), {
            'solver': ['sgd', 'adam'],
            'alpha': [1, 2, 3],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'activation': ["relu", "tanh"],
            'hidden_layer_sizes':[(80, 80, 80, 80)]}, 'mlp']
    
    ada = [AdaBoostClassifier(algorithm = 'SAMME.R'), {
            'n_estimators': range(50, 251, 50),
            'learning_rate': [0.1, 0.5, 1]}, 'ada']
    
    gbc = [GradientBoostingClassifier(tol = 0.001), {
            'n_estimators': [50, 100, 150, 200],
            'learning_rate': [0.1],
            'max_depth': [3, 4]}, 'gbc']
    
    if feat:
        clfs = [random, decision, logis, xg, ada, gbc]
        #clfs = []
    else:
        clfs = [knn, svc_sigmoid, svc, bayes, MLP]
        #clfs = []

    return clfs

def regressors(feat):
    
    linear = [LinearRegression(), {}, 'linear']

    knn_R = [KNeighborsRegressor(), {
            'n_neighbors': range(5, 30, 3),
            'weights': ['uniform', 'distance']}, 'knn_R']

    svc_linear_R = [svm.SVR(kernel = 'linear', tol=0.01), {
            }, 'svc_linear_R']
    
    svc_sigmoid_R = [svm.SVR(kernel = 'sigmoid'), {
            'C': np.logspace(-3, 1, 6),
            'gamma': ['scale', 'auto']}, 'svc_sigmoid_R']
    
    svc_R = [svm.SVR(), {
            'C': np.logspace(-2, 1, 5),
            'gamma': ['scale', 'auto']}, 'svc_R']

    xg_R = [xgb.XGBRegressor(eval_metric = 'merror', use_label_encoder=False), {
            'learning_rate': [0.03, 0.05],
            'max_depth': [1, 2],
            'min_child_weight': [3, 4],
            'n_estimators': range(50, 251, 50)}, 'xgboost_R']

    MLP_R = [MLPRegressor(max_iter = 10000, tol = 0.001), {
            'solver': ['lbfgs', 'adam'],
            'alpha': [2, 4, 6],
            'learning_rate': ['constant', 'adaptive'],
            'activation': ['tanh'],
            'hidden_layer_sizes':[(80, 80, 80, 80)]}, 'mlp_R']
    
    ada_R = [AdaBoostRegressor(loss='square'), {
            'n_estimators': range(50, 251, 50),
            'learning_rate': np.logspace(-1, 0.5, 5)}, 'ada_R']
    
    gbc_R = [GradientBoostingRegressor(tol = 0.001), {
            'n_estimators': [50, 100, 150, 200, 250],
            'learning_rate': [0.1, 0,2],
            'max_depth': [3, 4]}, 'gbc_R']
    
    if feat:
        regs = [linear, xg_R, ada_R, gbc_R]
        #regs = []
    else:
        regs = [knn_R, svc_R, svc_sigmoid_R, MLP_R]
        #regs = [svc_R, svc_sigmoid_R]
    return regs

In [43]:
def localProcessed(league):
        df = pd.read_csv('local_' + league + '_processed.csv')
        df = df.drop(['Unnamed: 0'], axis = 1)
        df = df.reset_index(drop = True)
        return df

In [44]:
#load data

leagues = {
    'prem':[],
    'ligue':[],
    'bundes':[],
    'serie':[],
    'laliga':[],
    'combined': []}

for i in leagues:
    df = localProcessed(i)
    df['results'] = df['results'].replace(['H', 'D', 'A'], range(3))
    df['ht_result'] = df['ht_result'].replace(['H', 'D', 'A'], range(3))
    leagues[i].append(df)

In [45]:
#combined history data
hist = pd.DataFrame()
for i in list(leagues.keys())[0:-1]:
    hist = pd.concat([hist, histData(i)])
hist.to_csv('local_hist.csv')
hist.columns = ['Home', 'Away', 'Score', 'FTR']

In [46]:
#ml model
#commented line have a runtime of a few hours
for i in leagues:
    
    start = time.perf_counter()
    
    df = leagues[i][0]
    #print(i)
    
    #df['avg_pos'] = df['avg_pos_home'] / df['avg_pos_away']
    #df['avg_pos_diff'] = df['avg_pos_home'] - df['avg_pos_away']
    #df['5_form'] = df['home_5_form'] - df['away_5_form']
    #df['3_form'] = df['home_3_form'] - df['away_3_form']
    #df['1_form'] = df['home_1_form'] - df['away_1_form']
    
    X, y = prep(df)
    
    
    pca = PCA(0.9)
    scaler = StandardScaler()
    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
    X_pca_scaled = pipe.fit_transform(X)
    leagues[i].append(pipe)
    
    f1 = make_scorer(f1_score, average = 'macro')
    
    clf = classifiers(False)
    
    clf_rfe = classifiers(True)
    
    reg = regressors(False)
    
    reg_rfe = regressors(True)
       
    clf_ret = grid(X_pca_scaled, y, clf, 'accuracy', False, i)
    
    clf_rfe_ret = grid(X, y, clf_rfe, 'accuracy', True, i)
    
    reg_ret = grid(X_pca_scaled, y, reg, 'neg_mean_absolute_error', False, i)
    
    reg_rfe_ret = grid(X, y, reg_rfe, 'neg_mean_absolute_error', True, i)
        
    end = time.perf_counter()
    
    #print((end - start)/60)

knn _____________ 0.5133547008547008 took 0.0748428649999975
KNeighborsClassifier(n_neighbors=23, weights='distance')
svc_sigmoid _____________ 0.5491452991452992 took 0.017363033333322165
SVC(C=1.7782794100389228, gamma=0.0017782794100389228, kernel='sigmoid')
svc _____________ 0.5496794871794872 took 0.03266537166667452
SVC(C=17.78279410038923, gamma=0.0001)
bayes _____________ 0.5256410256410257 took 0.0002886916666587543
GaussianNB(var_smoothing=0.31622776601683794)
mlp _____________ 0.530982905982906 took 2.5574334799999936
MLPClassifier(activation='tanh', alpha=1, hidden_layer_sizes=(80, 80, 80, 80),
              max_iter=10000, solver='sgd', tol=0.001)
random _____________ 0.5699786324786325 took 0.29943148833332695
RFE(estimator=RandomForestClassifier(max_depth=2, max_features=None,
                                     n_estimators=21),
    step=5)
decision _____________ 0.43696581196581197 took 0.02441454166666214
RFE(estimator=DecisionTreeClassifier(), step=5)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic _____________ 0.5592948717948718 took 0.688886191666658
RFE(estimator=LogisticRegression(C=0.01, max_iter=1000), step=5)
xgboost _____________ 0.5496794871794872 took 3.103102113333322
RFE(estimator=XGBClassifier(base_score=None, booster=None,
                            colsample_bylevel=None, colsample_bynode=None,
                            colsample_bytree=None, enable_categorical=False,
                            eval_metric='merror', gamma=None, gpu_id=None,
                            importance_type=None, interaction_constraints=None,
                            learning_rate=None, max_delta_step=None,
                            max_depth=1, min_child_weight=4, missing=nan,
                            monotone_constraints=None, n_estimators=50,
                            n_jobs=None, num_parallel_tree=None, predictor=None,
                            random_state=None, reg_alpha=None, reg_lambda=None,
                            scale_pos_weight=None, subsample=Non

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\moham\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\moham\anaconda3\lib\site-packages\sklearn\feature_selection\_rfe.py", line 222, in fit
    return self._fit(X, y, **fit_params)
  File "C:\Users\moham\anaconda3\lib\site-packages\sklearn\feature_selection\_rfe.py", line 283, in _fit
    estimator.fit(X[:, features], y, **fit_params)
  File "C:\Users\moham\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
    self._check_param

gbc_R _____________ -0.674432507925652 took 2.2587497083333345
RFE(estimator=GradientBoostingRegressor(n_estimators=50, tol=0.001), step=5)
knn _____________ 0.4878976276614072 took 0.006593416666661748
KNeighborsClassifier(n_neighbors=27, weights='distance')
svc_sigmoid _____________ 0.5132303985650443 took 0.01760565500000363
SVC(C=10.0, gamma=0.0017782794100389228, kernel='sigmoid')
svc _____________ 0.5160431804132591 took 0.0334585249999994
SVC(C=3.1622776601683795, gamma=0.001)
bayes _____________ 0.48676833470140557 took 0.0002697400000063984
GaussianNB(var_smoothing=0.31622776601683794)
mlp _____________ 0.5199820883876002 took 2.788942584999995
MLPClassifier(activation='tanh', alpha=1, hidden_layer_sizes=(80, 80, 80, 80),
              max_iter=10000, solver='sgd', tol=0.001)
random _____________ 0.5227986704364658 took 0.35150180666666225
RFE(estimator=RandomForestClassifier(max_depth=2, max_features=None,
                                     n_estimators=11),
    step=5)
dec



logistic _____________ 0.523926063296142 took 0.8436086549999952
RFE(estimator=LogisticRegression(C=0.01, max_iter=1000, solver='sag'), step=5)
xgboost _____________ 0.510415083249729 took 3.479177138333322
RFE(estimator=XGBClassifier(base_score=None, booster=None,
                            colsample_bylevel=None, colsample_bynode=None,
                            colsample_bytree=None, enable_categorical=False,
                            eval_metric='merror', gamma=None, gpu_id=None,
                            importance_type=None, interaction_constraints=None,
                            learning_rate=None, max_delta_step=None,
                            max_depth=1, min_child_weight=4, missing=nan,
                            monotone_constraints=None, n_estimators=50,
                            n_jobs=None, num_parallel_tree=None, predictor=None,
                            random_state=None, reg_alpha=None, reg_lambda=None,
                            scale_pos_weight=None, 

KeyboardInterrupt: 

In [None]:
rep = 5

def eval_score(models, pca, acc, X, y):
    scores = []
    for j in (models):
        grid = joblib.load('Models/' + i + '_' + j[2] + '.pk1').best_estimator_
        accuracy = []
        for n in range(rep):
            if pca:
                X_pca = pipe.transform(X)
                X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)
            else:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
            grid.fit(X_train, y_train)
            pred = grid.predict(X_test)
            if acc:
                score = accuracy_score(pred, y_test)
            else:
                score = 1 - sklearn.metrics.mean_absolute_error(pred, y_test)
            accuracy.append(score)
            
        mean = np.round(np.array(accuracy).mean(), decimals=2)
        var = np.round(np.array(accuracy).var(), decimals=5)
        print(j[2], mean, var)
        scores.append((mean, var))
    return np.array(scores)

In [None]:
results = pd.DataFrame()
   
for i in leagues:
    df = leagues[i][0]
    pipe = leagues[i][1]
    df['avg_pos'] = df['avg_pos_home'] / df['avg_pos_away']
    df['avg_pos_diff'] = df['avg_pos_home'] - df['avg_pos_away']
    df['5_form'] = df['home_5_form'] - df['away_5_form']
    df['3_form'] = df['home_3_form'] - df['away_3_form']
    df['1_form'] = df['home_1_form'] - df['away_1_form']
    X, y = prep(df)
    one = eval_score(classifiers(True), False, True, X, y)
    two = eval_score(classifiers(False), True, True, X, y)
    three = eval_score(regressors(True), False, False, X, y)
    four = eval_score(regressors(False), True, False, X, y)
     
    scores = np.vstack((one, two, three, four))
    #scores = np.array(scores)
    print(scores)
    #print(scores.reshape(2,1))
    scores = pd.DataFrame(scores)
    scores['Mean/Std'] = scores[[0,1]].apply(tuple, axis=1)
    scores = pd.DataFrame(scores['Mean/Std'])
    results = pd.concat([results, scores], axis = 1)
display(results)

In [None]:
clfNames = ['Random Forest', 'Decision Tree', 'Logistic Regression', 'XGBoost', 'AdaBoost', 'Gradient Descent', 'KNN', 
            'SVC(sigmoid kernel)', 'SVC(RBF kernel)', 'Naive Bayes', 'MLP'] 
regNames = ['Linear Regression', 'XGBoost', 'AdaBoost', 'Gradient Descent', 'KNN', 'SVM(RBF kernel)',
            'SVM(Sigmoid kernel)', 'MLP']

In [None]:
def eval_score_comb(models, pca, acc, X, y):
    scores = []
    for j in (models):
        grid = joblib.load('Models/' +'combined_' + j[2] + '.pk1').best_estimator_
        accuracy = []
        for n in range(rep):
            if pca:
                X_pca = pipe.transform(X)
                X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)
            else:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
            grid.fit(X_train, y_train)
            pred = grid.predict(X_test)
            if acc:
                score = accuracy_score(pred, y_test)
            else:
                score = 1 - sklearn.metrics.mean_absolute_error(pred, y_test)
            accuracy.append(score)
            
        mean = np.round(np.array(accuracy).mean(), decimals=2)
        var = np.round(np.array(accuracy).var(), decimals=5)
        print(j[2], mean, var)
        scores.append((mean, var))
    return np.array(scores)

In [None]:
results_comb = pd.DataFrame()

for i in leagues:
    df = leagues[i][0]
    pipe = leagues[i][1]
    df['avg_pos'] = df['avg_pos_home'] / df['avg_pos_away']
    df['avg_pos_diff'] = df['avg_pos_home'] - df['avg_pos_away']
    df['5_form'] = df['home_5_form'] - df['away_5_form']
    df['3_form'] = df['home_3_form'] - df['away_3_form']
    df['1_form'] = df['home_1_form'] - df['away_1_form']
    X, y = prep(df)
    one = eval_score_comb(classifiers(True), False, True, X, y)
    two = eval_score_comb(classifiers(False), True, True, X, y)
    three = eval_score_comb(regressors(True), False, False, X, y)
    four = eval_score_comb(regressors(False), True, False, X, y)
     
    scores = np.vstack((one, two, three, four))
    #scores = np.array(scores)
    print(scores)
    #print(scores.reshape(2,1))
    scores = pd.DataFrame(scores)
    scores['Mean/Std'] = scores[[0,1]].apply(tuple, axis=1)
    scores = pd.DataFrame(scores['Mean/Std'])
    results_comb = pd.concat([results_comb, scores], axis = 1)
display(results_comb)

In [None]:
leagueNames = ['Premier League', 'Ligue 1', 'Bundesliga', 'Serie A', 'La Liga', 'Combined leagues']
results_comb.columns = leagueNames
results_clf = results_comb.iloc[0: len(clfNames)]
results_reg = results_comb.iloc[len(clfNames):]
results_clf['name'] = clfNames
results_reg['name'] = regNames

results_clf = results_clf.set_index('name', drop = True)
results_clf.index.name = None

results_reg = results_reg.set_index('name', drop = True)
results_reg.index.name = None

display(results_clf)

In [None]:
#other methods
data = leagues['combined'][0]

print(histPredict(data))

In [None]:
print(poiAcc(data, False)) 

In [None]:
df = data.copy()
pipe = leagues['combined'][1]
df['avg_pos'] = df['avg_pos_home'] / df['avg_pos_away']
df['avg_pos_diff'] = df['avg_pos_home'] - df['avg_pos_away']
df['5_form'] = df['home_5_form'] - df['away_5_form']
df['3_form'] = df['home_3_form'] - df['away_3_form']
df['1_form'] = df['home_1_form'] - df['away_1_form']
X, y = prep(df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
grid = joblib.load('Models/combined_linear.pk1')

In [None]:
def plot_learning_curve(n, est, xs, ys, title):
    train_sizes, train_scores, valid_scores = learning_curve(estimator = est, X = xs, y = ys, cv = n, scoring=scorer)

    train_mean = np.mean(train_scores, axis=1)
    valid_mean = np.mean(valid_scores, axis=1)

    plt.plot(train_sizes, train_mean, color='blue', label='Training score')
    plt.plot(train_sizes, valid_mean, color='green', label='Validation score')

    plt.xlabel('Dataset size')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
def plot_cf(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    plot_confusion_matrix(model, X_test, y_test)  

In [None]:
scorer = make_scorer(sklearn.metrics.mean_absolute_error)

In [None]:
plot_learning_curve(5, grid.best_estimator_, X_train, y_train, 'Learning Curve')

In [None]:
from sklearn.metrics import plot_confusion_matrix

plot_cf(grid.best_estimator_, X_train, y_train, X_test, y_test)

In [None]:
dtree = DecisionTreeClassifier(max_depth=3)
dtree.fit(X, y)

In [None]:
plt.figure()
tree.plot_tree(dtree)
plt.rcParams['figure.figsize'] = (30, 20)
plt.show()