In [2]:
import pandas as pd

In [15]:
# Load the input table used by every later cell.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that comes from a trusted source.
data = pd.read_pickle('input_data.pickle')

In [4]:
import numpy as np

# Model inputs: every column except the target label and the three odds columns.
features = data.drop(
    columns=["result", "odds_home", "odds_draw", "odds_away"],
)

# Plain arrays for the estimators: X holds the inputs, y the outcome labels.
X, y = np.array(features), np.array(data["result"])

# Show the first feature row as a quick sanity check.
print(X[0])

[15 3 16 4 4 10 81 60 50 57 24 -1]


In [25]:
# 1. NAIVE BAYES

from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the full feature matrix X and
# labels y built in the earlier cell (no hold-out split at this point).
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X, y)

In [31]:
# Class probabilities for one hand-written 12-value feature row.
print(list(naive_bayes_model.predict_proba([[5, 15, 20, 20, 5, 5, 50, 50, 50, 50, 10, 10]])[0]))

# The probabilities above are listed in the same order as these class labels.
print(naive_bayes_model.classes_)

[0.5007171199332611, 0.09894333522515908, 0.40033954484158124]
['away win' 'draw' 'home win']


In [75]:
def return_profit(odds, probabilites, final):
    """Return the net profit of a one-unit value bet on a single match.

    For each outcome the "edge" is ``odds - 1 / probability``, i.e. how much
    the offered odds exceed the fair odds implied by the model. The odds are
    therefore treated as decimal odds. The outcome with the largest edge is
    selected (ties go to home, then draw, then away) and a stake of 1 is
    placed on it only when that edge is strictly positive.

    Args:
        odds: Three offered odds ordered ``[home, draw, away]``.
        probabilites: Three model probabilities in the same order.
        final: Actual outcome: ``"home win"``, ``"draw"`` or ``"away win"``.

    Returns:
        ``0`` when no outcome has a positive edge (no bet placed), ``-1``
        when the bet loses (stake lost), and ``odds - 1`` when it wins. The
        winning case is net of the stake so that it is on the same scale as
        the ``-1`` returned for a loss.

    Raises:
        ValueError: If any of the odds or probabilities is NaN.
    """
    import math

    outcomes = ("home win", "draw", "away win")

    edges = []
    for position in range(3):
        offered = odds[position]
        probability = probabilites[position]
        if math.isnan(offered) or math.isnan(probability):
            raise ValueError("odds and probabilities must not be NaN")
        if probability > 0:
            edges.append(offered - 1 / probability)
        else:
            # An outcome the model rules out has infinite fair odds, so it can
            # never be a value bet; this also avoids dividing by zero.
            edges.append(float("-inf"))

    # list.index returns the first maximum, which keeps the original
    # home -> draw -> away tie-breaking order.
    best = edges.index(max(edges))

    if edges[best] <= 0:
        return 0
    if final == outcomes[best]:
        return odds[best] - 1
    return -1

In [78]:
def test_model(learn_data, test_data, model):
    """Fit ``model`` on ``learn_data`` and score it on ``test_data``.

    Both frames must contain the columns ``result``, ``odds_home``,
    ``odds_draw`` and ``odds_away``; every other column is used as a feature.

    Args:
        learn_data: DataFrame used to fit the model.
        test_data: DataFrame used for evaluation.
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
            When it has a ``classes_`` attribute, that attribute is used to
            map probability columns to outcomes; otherwise the columns are
            assumed to be ordered ``away win, draw, home win``.

    Returns:
        Tuple ``(brier_score, precision, recall, F1, confusion_matrix, profit)``:
        * ``brier_score`` - squared error between the predicted probabilities
          and the one-hot actual outcome, summed (not averaged) over all
          test rows and outcomes.
        * ``precision``, ``recall``, ``F1`` - computed for the "home win"
          class only; each is ``0.0`` when its denominator is zero.
        * ``confusion_matrix`` - 3x3 list indexed ``[actual][predicted]`` in
          the order home win, draw, away win.
        * ``profit`` - sum of ``return_profit`` over the test rows.

    Raises:
        ValueError: If a test row has an unknown ``result`` label or the
            model returns NaN probabilities.
    """
    outcomes = ["home win", "draw", "away win"]
    non_feature_columns = ["result", "odds_home", "odds_draw", "odds_away"]

    X = np.array(learn_data.drop(columns=non_feature_columns))
    y = np.array(learn_data["result"])
    model.fit(X, y)

    test_X = np.array(test_data.drop(columns=non_feature_columns))
    test_y = np.array(test_data["result"])
    test_odds = np.array(test_data[["odds_home", "odds_draw", "odds_away"]])

    brier_score = 0
    confusion_matrix = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
    profit = 0

    # One batched call instead of one call per row. Skipped for an empty test
    # set because estimators may reject zero-sample input.
    predicted_probabilities = model.predict_proba(test_X) if len(test_X) else []

    # Map each outcome to its probability column through the fitted class
    # labels rather than relying on their alphabetical order; an outcome that
    # never occurred in the training data simply gets probability 0.
    fitted_classes = list(getattr(model, "classes_", outcomes[::-1]))
    column_of = {label: column for column, label in enumerate(fitted_classes)}

    for row, row_probabilities in enumerate(predicted_probabilities):
        # Probabilities reordered to [home win, draw, away win].
        probabilites = [
            row_probabilities[column_of[outcome]] if outcome in column_of else 0.0
            for outcome in outcomes
        ]
        if any(value != value for value in probabilites):  # NaN != NaN
            raise ValueError("model returned NaN probabilities")

        final_result = test_y[row]
        if final_result not in outcomes:
            raise ValueError("unexpected result label: {!r}".format(final_result))

        c_matrix_act = outcomes.index(final_result)
        # First maximum wins, i.e. ties resolve home -> draw -> away.
        c_matrix_pred = probabilites.index(max(probabilites))

        # calculate brier score (one-hot target for the actual outcome)
        for i in range(3):
            target = 1 if i == c_matrix_act else 0
            brier_score += (probabilites[i] - target) ** 2

        # confusion matrix
        confusion_matrix[c_matrix_act][c_matrix_pred] += 1

        # calculate profit
        profit += return_profit(test_odds[row], probabilites, final_result)

    # calculate rest of the metrics ("home win" is treated as the positive class)
    true_home = confusion_matrix[0][0]
    predicted_home = sum(confusion_matrix[actual][0] for actual in range(3))
    actual_home = sum(confusion_matrix[0])

    precision = true_home / predicted_home if predicted_home else 0.0
    recall = true_home / actual_home if actual_home else 0.0

    if precision + recall:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0

    return brier_score, precision, recall, F1, confusion_matrix, profit



In [81]:
from pprint import pprint

# Hold out the first 1000 rows for evaluation and train on everything after them.
test_data, learn_data = data[:1000], data[1000:]

# Score a fresh Gaussian Naive Bayes model on that split.
(brier_score, precision, recall,
 F1, confusion_matrix, profit) = test_model(learn_data, test_data, GaussianNB())

# Rows are actual outcomes, columns are predicted outcomes.
pprint(confusion_matrix)

[[309, 31, 99], [112, 30, 72], [103, 29, 142]]


In [33]:
# 2. NEURAL NETWORKS

from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron with two hidden layers (5 and 2 units).
# random_state is fixed so the fit is repeatable.
mlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
# Fitted on the full X / y from the earlier cell (no hold-out split here).
mlp_model.fit(X, y)

In [34]:
# Class probabilities for the same hand-written feature row used for the other models.
print(list(mlp_model.predict_proba([[5, 15, 20, 20, 5, 5, 50, 50, 50, 50, 10, 10]])[0]))

# The probabilities above are listed in the same order as these class labels.
print(mlp_model.classes_)

[0.2863816894568854, 0.25537105689653405, 0.45824725364658053]
['away win' 'draw' 'home win']


In [37]:
# 3. K-NN

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that looks at the 15 closest training rows.
knn_model = KNeighborsClassifier(n_neighbors=15)

# Fitted on the full X / y from the earlier cell (no hold-out split here).
knn_model.fit(X, y)

In [38]:
# Class probabilities for the same hand-written feature row used for the other models.
print(list(knn_model.predict_proba([[5, 15, 20, 20, 5, 5, 50, 50, 50, 50, 10, 10]])[0]))

# The probabilities above are listed in the same order as these class labels.
print(knn_model.classes_)

[0.13333333333333333, 0.4, 0.4666666666666667]
['away win' 'draw' 'home win']


In [54]:
# 4. BAYESIAN NETWORK

# NOTE(review): the star import hides where BayesianNetwork comes from and
# pulls many other names into the notebook namespace; consider importing
# only the names that are used.
from pomegranate import *

# Reduced table for the network: the odds, shots and corners columns are
# dropped, while "result" is kept so it becomes one of the modelled variables.
features2 = data.drop(columns=["odds_home", "odds_draw", "odds_away", 
                              "shots_given_home", "shots_given_away", 
                              "shots_conceded_home", "shots_conceded_away", 
                              "corners_difference_home", "corners_difference_away"])

X_ = np.array(features2)

# Build the network directly from the sample rows.
bayes_net_model = BayesianNetwork.from_samples(X_)

# Show the first row to confirm the column layout (the result label comes first).
X_[0]

array(['home win', 15, 3, 16, 4, 4, 10], dtype=object)

In [55]:
# None marks the value to be filled in by the network; it is in the first
# position, which is where the result label sits in X_.
print(bayes_net_model.predict([[None, 15, 3, 16, 4, 4, 10]])[0])

['home win' 15 3 16 4 4 10]


In [56]:
# Number of rows in the loaded dataset.
len(data)

3316

In [58]:
import pandas as pd
from math import ceil, floor


def k_fold_cross_validation(k, data_, evaluate_fold=None):
    """Split ``data_`` into ``k`` contiguous folds and visit every split.

    The rows are kept in their existing order (no shuffling). Fold ``j`` uses
    rows ``[j * n // k, (j + 1) * n // k)`` as test data and all remaining
    rows as learning data, so every row is in exactly one test fold and fold
    sizes differ by at most one row.

    Args:
        k: Number of folds; must be between 2 and ``len(data_)`` inclusive.
        data_: DataFrame to split. It is not modified.
        evaluate_fold: Optional callable ``evaluate_fold(learn_data, test_data)``
            invoked once per fold, e.g. a wrapper around a model evaluation.

    Returns:
        ``None`` when ``evaluate_fold`` is omitted (same as before this
        parameter existed); otherwise a list with the ``k`` values returned
        by ``evaluate_fold``, in fold order.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the row count.
    """
    if k < 2:
        raise ValueError("k must be at least 2, got {}".format(k))

    # drop=True keeps the old index out of the columns; otherwise it would be
    # added as an "index" column and could end up being used as a feature.
    rows = data_.reset_index(drop=True)
    n_rows = len(rows)
    if k > n_rows:
        raise ValueError(
            "k={} is larger than the number of rows ({})".format(k, n_rows)
        )

    fold_results = []
    for fold in range(k):
        # Integer bounds make consecutive folds share an edge, so no row is
        # skipped between one fold and the next.
        start = (fold * n_rows) // k
        stop = ((fold + 1) * n_rows) // k

        test_data = rows.iloc[start:stop]

        # The first and last fold have an empty piece on one side; leave it
        # out rather than concatenating an empty frame.
        pieces = [piece for piece in (rows.iloc[:start], rows.iloc[stop:]) if len(piece)]
        learn_data = pd.concat(pieces)

        # TODO: make models

        # TODO: test on test_data

        # TODO: sum all the indicators
        if evaluate_fold is not None:
            fold_results.append(evaluate_fold(learn_data, test_data))

    # TODO: return averages
    if evaluate_fold is None:
        return None
    return fold_results


# Run a 10-fold split over the whole dataset.
# NOTE(review): the function body contains no print call, so the ten numbers
# saved as this cell's output appear to come from an earlier version of the
# code - re-run the cell to refresh the output.
k_fold_cross_validation(10, data)

2985
2985
2986
2985
2985
2985
2985
2986
2985
2985
