In [1]:
import pickle
import random

import numpy as np

matches_df = pickle.load(open("database/matches_df.pck", "rb"))

In [2]:
winner_list = []
market_val_diff = []
goals_diff = []
for _, row in matches_df.iterrows():
    if row["Team Home ID"] == row["Winner Team ID"]:
        winner_list.append("Home")
    elif row["Team Away ID"] == row["Winner Team ID"]:
        winner_list.append("Away")
    elif row["Winner Team ID"] == 0:
        winner_list.append("Draw")
    market_val_diff.append(row["Market Value Home"] - row["Market Value Away"])
    goals_diff.append(row["Goals Home"] - row["Goals Away"])
matches_df["Result"] = winner_list
matches_df["Market Value Difference"] = market_val_diff
matches_df["Goals Difference"] = goals_diff

In [3]:
import asyncio
import nest_asyncio
from database import dbconn
import importlib

importlib.reload(dbconn)

nest_asyncio.apply()

loop = asyncio.get_event_loop()



In [4]:
import pandas as pd

dict_team_data = dict()

for team_id in set(matches_df["Team Home ID"]):
    team_data_matches_home, team_data_matches_away = loop.run_until_complete(dbconn.get_matches_by_team(team_id))
    team_data_matches = [[x.id, x.goalsHome, x.goalsAway, x.winnerTeamId] for x in team_data_matches_home] + [[x.id, x.goalsHome, x.goalsAway, x.winnerTeamId] for x in team_data_matches_away]
    team_data_matches = sorted(team_data_matches, key=lambda x: x[0])
    for i in range(len(team_data_matches)):
        if team_data_matches[i][3] == team_id:
            team_data_matches[i][3] = "Won"
        elif team_data_matches[i][3] == None:
            team_data_matches[i][3] = "Draw"
        else:
            team_data_matches[i][3] = "Lost"
    dict_team_data[team_id] = pd.DataFrame(team_data_matches)

In [5]:
matches_form = []
for _, row in matches_df.iterrows():
    home_form = dict_team_data[row["Team Home ID"]]
    home_form = home_form[home_form[0] < row["Match ID"]].iloc[-5:]
    home_form_5 = list(home_form[3])
    for i in range(5 - len(home_form_5)):
        home_form_5 = ["Draw"] + home_form_5
    away_form = dict_team_data[row["Team Away ID"]]
    away_form = away_form[away_form[0] < row["Match ID"]].iloc[-5:]
    away_form_5 = list(away_form[3])
    for i in range(5 - len(away_form_5)):
        away_form_5 = ["Draw"] + away_form_5
    matches_form.append(home_form_5 + away_form_5)

In [6]:
columns_added = ["Home_Pre" + str(x) for x in range(5, 0, -1)] + ["Away_Pre" + str(x) for x in range(5, 0, -1)]
matches_df = pd.concat([matches_df, pd.DataFrame(matches_form, columns=columns_added)], axis=1)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *

res_enc = LabelEncoder()
form_enc = LabelEncoder()
matches_df["Result"] = res_enc.fit_transform(matches_df["Result"])
form_enc.fit(matches_df["Home_Pre5"])
matches_df["Home_Pre5"] = form_enc.transform(matches_df["Home_Pre5"])
for column in columns_added[1:]:
    matches_df[column] = form_enc.transform(matches_df[column])
X_train, X_test, y_train, y_test = train_test_split(matches_df.drop(["Goals Home", "Goals Away", "Goals Difference", "Result", "Winner Team ID", "Match ID", "Team Home ID", "Team Away ID"], axis=1), matches_df["Result"], random_state=42, test_size=0.2)

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import *

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.48366013071895425

In [9]:
from sklearn.ensemble import *

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.47549019607843135

In [10]:
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.4934640522875817

In [11]:
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.5081699346405228

In [12]:
from sklearn.neighbors import *

clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.4444444444444444

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [62]:
import h2o
from h2o.automl import H2OAutoML

h2o.init(max_mem_size_GB=8)

train, test = train_test_split(matches_df.drop(["Goals Home", "Goals Away", "Winner Team ID", "Match ID", "Team Home ID", "Team Away ID", "Result"], axis=1), random_state=42, test_size=0.2)

train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

clf = H2OAutoML(max_models=20, max_runtime_secs=1200)
clf.train(training_frame=train, y="Goals Difference")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 45 mins
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_maxlautenbach_iulu9c
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.908 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
13:38:55.742: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
CPU times: user 1.2 s, sys: 273 ms, total: 1.47 s
Wall time: 3min 44s


key,value
Stacking strategy,cross_validation
Number of base models (used / total),4/5
# GBM base models (used / total),1/1
# GLM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# DRF base models (used / total),1/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,1.3911365,0.0581399,1.3337193,1.4417789,1.4465504,1.3255492,1.4080849
mean_residual_deviance,3.11757,0.2541147,2.8811703,3.349163,3.3433487,2.8155289,3.198638
mse,3.11757,0.2541147,2.8811703,3.349163,3.3433487,2.8155289,3.198638
null_deviance,1921.4034,139.87204,1893.7223,1987.6656,2128.3535,1790.6152,1806.6606
r2,0.2016788,0.0347442,0.2483742,0.1893066,0.2019524,0.2151744,0.1535863
residual_deviance,1526.4095,131.7259,1414.6547,1607.5983,1698.4211,1382.4247,1528.949
rmse,1.7644767,0.072389,1.697401,1.8300718,1.8284826,1.6779538,1.7884737
rmsle,,0.0,,,,,


In [43]:
y_pred = clf.predict(test)
y_pred = y_pred.as_data_frame().round()
y_pred

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,predict
0,2.0
1,-0.0
2,-2.0
3,-0.0
4,1.0
...,...
607,1.0
608,1.0
609,0.0
610,0.0


In [64]:
res_df = pd.concat([test.as_data_frame()["Goals Difference"], y_pred], axis=1)
res_df.join(matches_df[["Goals Home", "Goals Away"]])

Unnamed: 0,Goals Difference,predict,Goals Home,Goals Away
0,1,2.0,2,1
1,0,-0.0,2,1
2,-1,-2.0,0,1
3,0,-0.0,2,2
4,1,1.0,1,1
...,...,...,...,...
607,-1,1.0,3,1
608,2,1.0,3,1
609,-1,0.0,3,2
610,0,0.0,2,1


In [59]:
count = 0
count2 = 0
for _, row in res_df.iterrows():
    if row["Goals Difference"] == row["predict"]:
        count += 1
        count2 += 3
    elif (row["Goals Difference"] < 0 and row["predict"] < 0) or (row["Goals Difference"] > 0 and row["predict"] > 0) or (row["Goals Difference"] == 0 and row["predict"] == 0):
        count += 1
        count2 += 1


In [60]:
count / len(res_df)

0.43137254901960786

In [61]:
(count2 / len(res_df)) * 306

276.0