In [151]:
import pickle
import random

import numpy as np

# Load the pre-built matches table. The file is opened in a context manager so
# the handle is closed deterministically instead of being left to the GC.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("database/matches_df.pck", "rb") as matches_file:
    matches_df = pickle.load(matches_file)

In [152]:
# Derive three per-match columns:
#   "Result"                  -> "Home" / "Away" / "Draw", from the winner id
#   "Market Value Difference" -> home market value minus away market value
#   "Goals Difference"        -> home goals minus away goals
# Column-wise comparisons replace the row-by-row loop; condition priority
# (home win, then away win, then draw) is the same as before.
home_won = matches_df["Team Home ID"] == matches_df["Winner Team ID"]
away_won = matches_df["Team Away ID"] == matches_df["Winner Team ID"]
drawn = matches_df["Winner Team ID"] == 0  # a winner id of 0 marks a draw in this table

# A row matching none of the three cases used to be skipped silently, which
# later surfaced as an opaque length-mismatch error. Fail early and clearly.
unlabelled = ~(home_won | away_won | drawn)
if unlabelled.any():
    raise ValueError(
        f"{int(unlabelled.sum())} match(es) have a 'Winner Team ID' that is neither "
        "the home team, the away team, nor 0 (draw)"
    )

winner_list = np.select(
    [home_won, away_won, drawn],
    ["Home", "Away", "Draw"],
    default="",  # unreachable after the check above; keeps the dtype a string
).tolist()
market_val_diff = (matches_df["Market Value Home"] - matches_df["Market Value Away"]).tolist()
goals_diff = (matches_df["Goals Home"] - matches_df["Goals Away"]).tolist()

matches_df["Result"] = winner_list
matches_df["Market Value Difference"] = market_val_diff
matches_df["Goals Difference"] = goals_diff

In [153]:
import asyncio
import nest_asyncio
from database import dbconn
import importlib

# Re-import the project's dbconn module in place (presumably so edits made to
# it during the session are picked up without a kernel restart - confirm).
importlib.reload(dbconn)

# Patch asyncio so that loop.run_until_complete() can be called even though
# an event loop is already running (as is normally the case inside Jupyter).
nest_asyncio.apply()

# Event loop used by later cells to drive dbconn's coroutines synchronously.
loop = asyncio.get_event_loop()



In [154]:
import pandas as pd

# Build, per team, a chronological (by match id) table of that team's matches
# with the outcome expressed from the team's own perspective.
# Resulting frame columns: 0 = match id, 1 = home goals, 2 = away goals,
# 3 = "Won" / "Draw" / "Lost".
dict_team_data = dict()

# Include teams from BOTH fixture columns: the form lookup later indexes this
# dict by away-team id as well, so a team that never appears as a home side
# would otherwise raise a KeyError.
team_ids = set(matches_df["Team Home ID"]) | set(matches_df["Team Away ID"])

for team_id in team_ids:
    team_data_matches_home, team_data_matches_away = loop.run_until_complete(dbconn.get_matches_by_team(team_id))
    team_data_matches = sorted(
        (
            [x.id, x.goalsHome, x.goalsAway, x.winnerTeamId]
            for x in [*team_data_matches_home, *team_data_matches_away]
        ),
        key=lambda x: x[0],
    )
    for match in team_data_matches:
        if match[3] == team_id:
            match[3] = "Won"
        elif match[3] is None:  # no winner recorded -> draw
            match[3] = "Draw"
        else:
            match[3] = "Lost"
    # Explicit column labels keep columns 0..3 present even when a team has no
    # matches at all, so downstream `frame[0]` / `frame[3]` lookups still work.
    dict_team_data[team_id] = pd.DataFrame(team_data_matches, columns=[0, 1, 2, 3])

In [155]:
def _recent_form(team_history, match_id, window=5):
    """Return the team's last `window` outcomes before `match_id`, oldest first.

    `team_history` is one of the per-team frames in `dict_team_data`
    (column 0 = match id, column 3 = outcome label). When fewer than
    `window` earlier matches exist, the front is padded with "Draw".
    """
    earlier = team_history[team_history[0] < match_id].iloc[-window:]
    outcomes = list(earlier[3])
    return ["Draw"] * (window - len(outcomes)) + outcomes


# One row per match: five home-team outcomes followed by five away-team outcomes.
matches_form = [
    _recent_form(dict_team_data[row["Team Home ID"]], row["Match ID"])
    + _recent_form(dict_team_data[row["Team Away ID"]], row["Match ID"])
    for _, row in matches_df.iterrows()
]

In [156]:
# Column names for the ten form features: Home_Pre5..Home_Pre1, then
# Away_Pre5..Away_Pre1 (same order as the values in each matches_form row).
columns_added = [f"{side}_Pre{n}" for side in ("Home", "Away") for n in range(5, 0, -1)]

# pd.concat(axis=1) aligns on the index. Reusing matches_df's own index keeps
# each form row attached to its match even when that index is not 0..n-1;
# a fresh RangeIndex would otherwise misalign rows and introduce NaNs.
form_df = pd.DataFrame(matches_form, columns=columns_added, index=matches_df.index)
matches_df = pd.concat([matches_df, form_df], axis=1)

In [157]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn every final scoreline into a "home:away" string and encode each
# distinct scoreline as an integer class label ("Goals Result").
end_goals = [
    str(home_goals) + ":" + str(away_goals)
    for home_goals, away_goals in zip(matches_df["Goals Home"], matches_df["Goals Away"])
]
result_enc = LabelEncoder()
end_goals = result_enc.fit_transform(end_goals)
matches_df["Goals Result"] = end_goals

In [158]:
# Encode the categorical columns as integers and split into train/test sets.
res_enc = LabelEncoder()
form_enc = LabelEncoder()
matches_df["Result"] = res_enc.fit_transform(matches_df["Result"])

# Fit the form encoder on the labels of ALL form columns. Fitting on
# "Home_Pre5" alone makes transform() fail for any label that happens to be
# absent from that one column. LabelEncoder sorts its classes, so the codes
# are unchanged whenever Home_Pre5 already contained every label.
form_enc.fit(matches_df[columns_added].to_numpy().ravel())
for column in columns_added:
    matches_df[column] = form_enc.transform(matches_df[column])

# Identifiers and anything derived from the final score are not features.
non_feature_columns = [
    "Goals Home",
    "Goals Away",
    "Goals Difference",
    "Result",
    "Winner Team ID",
    "Match ID",
    "Team Home ID",
    "Team Away ID",
    "Goals Result",
]
# NOTE(review): train_test_split shuffles rows by default, so the split is
# random rather than chronological - confirm that is intended for match data.
X_train, X_test, y_train, y_test = train_test_split(
    matches_df.drop(non_feature_columns, axis=1),
    matches_df["Goals Result"],
    random_state=42,
    test_size=0.2,
)

In [159]:
from sklearn.metrics import make_scorer

def score(y_true, y_pred, **kwargs):
    """Tipping-game points for predicted scorelines, scaled to 306 matches.

    Both inputs hold class labels produced by the module-level ``result_enc``
    encoder. They are decoded back to ``"home:away"`` strings and compared
    pairwise:

    * exact scoreline                      -> 5 points
    * same goal difference (incl. draws)   -> 3 points
    * same winning side                    -> 1 point

    Args:
        y_true: encoded actual scorelines.
        y_pred: encoded predicted scorelines, in the same order as ``y_true``.
        **kwargs: accepted for scorer compatibility and ignored.

    Returns:
        int: total points rescaled to a 306-match basis (presumably one full
        season - TODO confirm) and rounded; 0 when there are no samples.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to rate; avoids a ZeroDivisionError in the rescaling below.
        return 0

    def _decode(encoded):
        # "2:1" -> (2, 1). Goals are compared as integers; comparing the raw
        # strings is lexicographic and gets double-digit scores wrong
        # ("10" sorts before "2").
        return [tuple(int(g) for g in label.split(":")) for label in result_enc.inverse_transform(encoded)]

    score_value = 0
    for (true_home, true_away), (pred_home, pred_away) in zip(_decode(y_true), _decode(y_pred)):
        if true_home == pred_home and true_away == pred_away:
            score_value += 5
        elif (true_home - true_away) == (pred_home - pred_away):
            score_value += 3
        elif (true_home > true_away and pred_home > pred_away) or (true_home < true_away and pred_home < pred_away):
            score_value += 1
    return round(score_value / (n_samples / 306))

# Wrap score() as a scikit-learn scorer so it can be passed as `scoring=`;
# greater_is_better=True marks higher point totals as better.
kicktipp_scorer = make_scorer(score, greater_is_better=True)

In [160]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import *

# Gaussian naive Bayes: fit on the training split, predict the test split,
# then show (tipping-game points, plain accuracy).
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))

(282, 0.06372549019607843)

In [161]:
from sklearn.ensemble import *

# Random forest with a fixed seed: fit on the training split, predict the
# test split, then show (tipping-game points, plain accuracy).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))

(278, 0.07516339869281045)

In [162]:
# Gradient boosting with a fixed seed: fit on the training split, predict the
# test split, then show (tipping-game points, plain accuracy).
clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))

(273, 0.07026143790849673)

In [163]:
# AdaBoost with a fixed seed: fit on the training split, predict the test
# split, then show (tipping-game points, plain accuracy).
clf = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))

(282, 0.1111111111111111)

In [164]:
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

# Cross-validate whichever estimator is currently bound to `clf` (set by an
# earlier cell) on the training split, using 5 forward-chaining folds and the
# estimator's default scorer (no `scoring=` is passed).
# NOTE(review): X_train comes from train_test_split, which shuffles by
# default, so row order here is probably not chronological - confirm that
# TimeSeriesSplit is meaningful on this data.
tscv = TimeSeriesSplit(n_splits=5)
cross_val_score(clf, X_train, y_train, cv=tscv)

array([0.13480392, 0.09313725, 0.10784314, 0.12009804, 0.1127451 ])

In [165]:
from sklearn.neighbors import *

# k-nearest neighbours with k=100: fit on the training split, predict the
# test split, then show (tipping-game points, plain accuracy).
clf = KNeighborsClassifier(n_neighbors=100).fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))

(304, 0.09803921568627451)

In [166]:
# Cross-validated tipping-game points (kicktipp_scorer) for the current `clf`.
# NOTE(review): this cross-validates on the held-out X_test / y_test rather
# than the training split - confirm that is intended.
cross_val_score(clf, X_test, y_test, scoring=kicktipp_scorer)



array([331, 294, 324, 266, 306])

In [169]:
from sklearn.pipeline import Pipeline

# Single-step pipeline around a default KNeighborsClassifier. The step name
# "clf" is what the "clf__n_neighbors" grid-search key refers to.
pipe = Pipeline(steps=[("clf", KNeighborsClassifier())])

In [170]:
# Cross-validated tipping-game points (kicktipp_scorer) for the default-k
# KNN pipeline.
# NOTE(review): this cross-validates on the held-out X_test / y_test rather
# than the training split - confirm that is intended.
cross_val_score(pipe, X_test, y_test, scoring=kicktipp_scorer)



array([351, 401, 336, 283, 316])

In [175]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the number of neighbours (k = 1 .. 249) for the KNN
# pipeline, then evaluate the refitted best model on the test split.
# NOTE(review): no `scoring=` is passed, so the search ranks candidates with
# the estimator's default scorer rather than kicktipp_scorer - confirm.
neighbor_counts = np.arange(1, 250)
param_grid = {"clf__n_neighbors": neighbor_counts}

clf = GridSearchCV(pipe, param_grid, n_jobs=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
(score(y_test, y_pred), accuracy_score(y_test, y_pred))



(302, 0.09803921568627451)