In [1]:
import sys
sys.path.append(".")

import numpy as np
import pandas as pd
from probability_estimator import ProbabilityEstimator
from environment import Environment

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List
from sklearn.linear_model import LogisticRegression, Ridge
import sklearn
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.metrics
import sklearn.model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from lukas import DataExtraction # Start writing code here...

import xgboost as xgb

import torch
import torch.nn as nn

In [2]:
seed = 42
generator = np.random.RandomState(seed)

In [3]:
dataset = pd.read_csv('../data/training_data.csv', parse_dates=['Date', 'Open'])

In [4]:
dataset = dataset.drop(dataset[(dataset.H == False) & (dataset.A == False)].index)

In [231]:
class ModelWrapper:
    def __init__(self):
        self.team_stats = {}
        for i in range(30): self.team_stats[i] = {"wins": 0, "shots": 0, "won_buly": 0, "penalty": 0, "matches": 0, "odds": 0}
        self.model = sklearn.linear_model.LogisticRegression(max_iter=800, random_state=42)
        # self.bookmaker_h = sklearn.linear_model.LinearRegression()
        # self.bookmaker_a = sklearn.linear_model.LinearRegression()

        self.poly = sklearn.preprocessing.PolynomialFeatures(2)
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        
        self.data = np.empty(shape=(0, 18))
        self.home_wins = np.empty(shape=(0,1))
        self.oddsh = np.empty(shape=(0,1))
        self.oddsa = np.empty(shape=(0,1))
        self.reset_factor = 1.4

    def reset_stats(self, team_id):
        self.team_stats[team_id]["wins"] /= self.reset_factor
        self.team_stats[team_id]["shots"] /= self.reset_factor
        self.team_stats[team_id]["penalty"] /= self.reset_factor
        self.team_stats[team_id]["matches"] //= self.reset_factor
        self.team_stats[team_id]["odds"] //= self.reset_factor
        
    def add_stats(self, team_id, match, is_home):
        home = ["S_H", "FOW_H", "PIM_H", "OddsH"]
        away = ["S_A", "FOW_A", "PIM_A", "OddsA"]
        if is_home: keys = home
        else: keys = away
        self.team_stats[team_id]["shots"] += match[keys[0]]
        self.team_stats[team_id]["won_buly"] += match[keys[1]]
        self.team_stats[team_id]["penalty"] += match[keys[2]]
        self.team_stats[team_id]["odds"] += match[keys[3]]
        self.team_stats[team_id]["matches"] += 1

    def process_new_matches(self, inc):
        attributes = ("S", "PIM", "PPG", "FOW")
        teams = set()
        for i, r in enumerate(dataset.iterrows()):
            match = r[1]
            teams.add(match["HID"])
            teams.add(match["AID"])
        d = {}
        for team in teams:
            teamh = dataset.loc[(dataset["HID"] == team)]
            teama = dataset.loc[(dataset["AID"] == team)]
            tot = len(teamh) + len(teama)
            for a in attributes:
                if not team in d:
                    d[team] = {}
                val = (np.sum(teamh[a + "_H"]) + np.sum(teamh[a + "_A"])) / tot
                d[team][a] = val
            val = (np.sum(teamh["HSC"]) + np.sum(teamh["ASC"])) / tot
            d[team]["SC"] = val
        for i, r in enumerate(inc.iterrows()):
            if i % 500 == 0:
                if i > 500:
                    dataset2 = inc.tail(500)
                else:
                    dataset2 = inc
                attributes = ("S", "PIM", "PPG", "FOW")
                global d
                d = {}
                for team in teams:
                    teamh = dataset2.loc[(dataset2["HID"] == team)]
                    teama = dataset2.loc[(dataset2["AID"] == team)]
                    tot = len(teamh) + len(teama)
                    for a in attributes:
                        if not team in d:
                            d[team] = {}
                        val = (np.sum(teamh[a + "_H"]) + np.sum(teamh[a + "_A"])) / tot
                        d[team][a] = val
                    val = (np.sum(teamh["HSC"]) + np.sum(teamh["ASC"])) / tot
                    d[team]["SC"] = val
            match = r[1]
            th, ta = match["HID"], match["AID"] 

            # ignore games with draw
            if (match["H"] or match["A"]):
                # +1 to avoid div zero error
                mh, ma = self.team_stats[th]["matches"] + 1, self.team_stats[ta]["matches"] + 1
                new_row = np.array([
                    [self.team_stats[th]["wins"]/mh, 
                     self.team_stats[th]["shots"]/mh, 
                    self.team_stats[th]["won_buly"]/mh, 
                     self.team_stats[th]["odds"]/mh,
                    self.team_stats[ta]["wins"]/ma, 
                     self.team_stats[ta]["shots"]/ma, 
                    self.team_stats[ta]["won_buly"]/ma, 
                     self.team_stats[ta]["odds"]/ma,
                     d[th]["S"],
                     d[ta]["S"],
                     d[th]["PIM"],
                    d[ta]["PIM"],
                    d[th]["PPG"],
                    d[ta]["PPG"],
                    d[th]["FOW"],
                    d[ta]["FOW"],
                    d[th]["SC"],
                    d[ta]["SC"],
                    ]
                ])
                #print(self.team_stats[ta]["wins"]/ma)
                #print(match["H"])
                self.data = np.append(self.data, new_row, axis=0)
                
                if match["H"]: self.home_wins = np.append(self.home_wins, 0)
                else: self.home_wins = np.append(self.home_wins, 1)
                    
                        
            # lower the importance of old matches
            if self.team_stats[th]["matches"] % 30==0: self.reset_stats(th)
            if self.team_stats[ta]["matches"] % 30==0: self.reset_stats(ta)
                
            if match["H"]: self.team_stats[th]["wins"] += 1 
            self.add_stats(th, match, True)
            
            if match["A"]: self.team_stats[ta]["wins"] += 1 
            self.add_stats(ta, match, False)

                    
    def predict(self, th, ta):
        mh, ma = self.team_stats[th]["matches"], self.team_stats[ta]["matches"]
        inp = self.poly.transform([[
                self.team_stats[th]["wins"]/mh, self.team_stats[th]["shots"]/mh, 
                self.team_stats[th]["won_buly"]/mh, self.team_stats[th]["odds"]/mh,
                self.team_stats[th]["wins"]/ma, self.team_stats[th]["shots"]/ma, 
                self.team_stats[th]["won_buly"]/ma, self.team_stats[th]["odds"]/ma,
        ]])
        
        inp = self.scaler.transform(inp)
        return self.model.predict_proba(inp)

    def fit(self):
        inp = self.poly.fit_transform(self.data)
        inp = self.scaler.fit_transform(inp)
        self.model.fit(inp, self.home_wins)

    def place_bets(self, opps, summary, inc):
        self.process_new_matches(inc)
        self.fit( 0)
        
        N = len(opps)
        min_bet = summary.iloc[0].to_dict()['Min_bet']
        bets = np.zeros((N, 2))
        
        for idx, row in enumerate(opps.iterrows()):
            match = row[1]
            match_date = match["Date"]
            
            team_a, team_b = match["HID"], match["AID"]
            prediciton = self.predict(team_a, team_b)[0]

            if prediciton[0] >= 0.8: bets[idx, 0] = min_bet*(1+prediciton[0])
            elif prediciton[1] >= 0.8: bets[idx, 1] = min_bet*(1+prediciton[1])
        return pd.DataFrame(data=bets, columns=['BetH', 'BetA'], index=opps.index)

    def evaluate(self):
        model = [
                    
            ("scaler", sklearn.preprocessing.RobustScaler()),
                 ("poly", sklearn.preprocessing.PolynomialFeatures(2)),
                  ("lr", sklearn.linear_model.LogisticRegression(max_iter=1000)),
#                 ("xgb",  xgb.XGBClassifier(objective="binary:logistic", 
#                                            colsample_bytree=0.5,
#                                            gamma=0.2,
#                                            learnin_rate=0.01,
#                                            max_depth=2,
#                                            reg_lambda=0.1,
#                                            scale_pos_weight=0.5,
#                                            subsample=0.8
#                                           )),

                ]
        pipeline = sklearn.pipeline.Pipeline(model)
        print(self.data.shape)
        results = sklearn.model_selection.cross_validate(pipeline, self.data, self.home_wins, cv=5)
        return np.average(results['test_score'])
    
    def manual_evaluate(self):
        train_data, test_data, train_target, test_target= sklearn.model_selection.train_test_split(self.data, 
                                                            self.home_wins, test_size=0.2, random_state=42)
        model = [("scaler", sklearn.preprocessing.RobustScaler()),
                 ("poly", sklearn.preprocessing.PolynomialFeatures(2)),
                  ("lr", sklearn.linear_model.LogisticRegression(max_iter=1500)),
#                   ("xgb",  xgb.XGBClassifier(objective="binary:logistic", 
#                                            colsample_bytree=0.5,
#                                            gamma=0.2,
#                                            learnin_rate=0.01,
#                                            max_depth=2,
#                                            reg_lambda=0.1,
#                                            scale_pos_weight=0.5,
#                                            subsample=0.8
#                                           )
#                   )
                ]
        pipeline = sklearn.pipeline.Pipeline(model)
        pipeline.fit(train_data, train_target)
        print(self.data)
        return pipeline.score(test_data, test_target)


SyntaxError: name 'd' is used prior to global declaration (3799772255.py, line 65)

In [232]:
{'xgb__colsample_bytree': 0.5,
 'xgb__gamma': 0.2,
 'xgb__learning_rate': 0.01,
 'xgb__max_depth': 2,
 'xgb__reg_lambda': 0.1,
 'xgb__scale_pos_weight': 0.5,
 'xgb__subsample': 0.8}

{'xgb__colsample_bytree': 0.5,
 'xgb__gamma': 0.2,
 'xgb__learning_rate': 0.01,
 'xgb__max_depth': 2,
 'xgb__reg_lambda': 0.1,
 'xgb__scale_pos_weight': 0.5,
 'xgb__subsample': 0.8}

In [228]:
model = ModelWrapper()
model.process_new_matches(dataset)

In [229]:
model.evaluate()

(5336, 18)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.5755295922271482

In [186]:
model.manual_evaluate()

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [4.93939696e-01 2.73644582e+01 5.15391304e+02 ... 2.79891293e+01
  4.38153846e+02 1.67461538e+00]
 [2.81168188e-01 3.04949306e+01 4.04178571e+02 ... 2.90430306e+01
  5.07043478e+02 2.49478261e+00]
 [6.00783843e-01 3.06590743e+01 3.99714286e+02 ... 3.55383778e+01
  4.87760000e+02 1.49040000e+00]]


0.5833333333333334

In [156]:
model.home_wins[:50]

array([1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.])

In [65]:
model.data

array([[0.00000000e+00, 2.10000000e+01, 3.90000000e+01, ...,
        2.80000000e+01, 3.50000000e+01, 0.00000000e+00],
       [1.00000000e+00, 3.30000000e+01, 3.40000000e+01, ...,
        2.90000000e+01, 2.70000000e+01, 0.00000000e+00],
       [1.00000000e+00, 3.00000000e+01, 3.50000000e+01, ...,
        2.10000000e+01, 3.60000000e+01, 0.00000000e+00],
       ...,
       [2.16784841e+00, 8.98245516e+01, 1.18850000e+04, ...,
        1.00014269e+02, 1.14180000e+04, 5.60000000e+00],
       [1.87297434e+00, 1.35645308e+02, 1.13430000e+04, ...,
        1.13784331e+02, 1.16870000e+04, 7.25000000e+00],
       [3.14279059e+00, 1.09410967e+02, 1.12220000e+04, ...,
        1.19192252e+02, 1.22200000e+04, 3.73000000e+00]])

In [81]:
np.sum(model.home_wins)

2334.0

In [84]:
model.data[:40]

array([[  0.        ,  21.        ,  39.        ,   0.        ,
          1.        ,  28.        ,  35.        ,   0.        ],
       [  1.        ,  33.        ,  34.        ,   0.        ,
          0.        ,  29.        ,  27.        ,   0.        ],
       [  1.        ,  30.        ,  35.        ,   0.        ,
          0.        ,  21.        ,  36.        ,   0.        ],
       [  0.        ,  24.        ,  37.        ,   0.        ,
          1.        ,  41.71428571,  57.        ,   0.        ],
       [  1.        ,  20.        ,  24.        ,   0.        ,
          0.        ,  33.        ,  40.        ,   0.        ],
       [  0.        ,  26.        ,   0.        ,   0.        ,
          1.        ,  33.        ,   0.        ,   0.        ],
       [  1.        ,  34.        ,  34.        ,   0.        ,
          0.        ,  30.        ,  34.        ,   0.        ],
       [  1.        ,  29.        ,  42.        ,   0.        ,
          0.71428571,  54.       

In [164]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.linear = nn.Linear(45, 1)
    def forward(self, x):
        out = self.linear(x)
        return out

mlp = MLP().to(device)
optimizer = torch.optim.Adam(mlp.parameters())
criterion = nn.CrossEntropyLoss()

print(mlp)

MLP(
  (linear): Linear(in_features=45, out_features=1, bias=True)
)


In [157]:
train_data, test_data, train_target, test_target= sklearn.model_selection.train_test_split(model.data, 
                                                            model.home_wins, test_size=0.2, random_state=42)

In [159]:
model = [("scaler", sklearn.preprocessing.RobustScaler()),
                 ("poly", sklearn.preprocessing.PolynomialFeatures(2)),
                ]
pipeline = sklearn.pipeline.Pipeline(model)


In [161]:
train_data = pipeline.fit_transform(train_data)
test_data = pipeline.transform(test_data)

In [163]:
train_data.shape

(4268, 45)