In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
matches = pd.read_csv("matches_bundesliga_v2.csv", index_col=0)

In [4]:
matches.shape

(3060, 30)

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
matches["team"].value_counts()

#Timovi koji su svih 5 sezona u ligi trebaju imati 170, 4 - 136, 3 - 102, 2 - 68, 1 - 34

Bayern Munich          170
Augsburg               170
RB Leipzig             170
Hertha BSC             170
Freiburg               170
Bayer Leverkusen       170
Eintracht Frankfurt    170
Wolfsburg              170
Mainz 05               170
Monchengladbach        170
Hoffenheim             170
Dortmund               170
Schalke 04             136
Stuttgart              136
Werder Bremen          136
Koln                   136
Union Berlin           136
Bochum                  68
Arminia                 68
Dusseldorf              68
Greuther Furth          34
Paderborn 07            34
Hannover 96             34
Nurnberg                34
Name: team, dtype: int64

In [7]:
matches.isna().sum()

#Provjera polja bez vrijednosti

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 2
xga                2
poss               0
attendance       744
captain            0
formation          0
referee            0
match report       0
notes           3058
sh                 0
sot                0
dist               3
fk                 2
pk                 0
pkatt              0
season             0
team               0
wwages            34
awages            34
age                0
dtype: int64

In [8]:
matches["round"].value_counts()

##Trebao bi biti 90 za svaki matchweek

Matchweek 1     90
Matchweek 26    90
Matchweek 20    90
Matchweek 21    90
Matchweek 22    90
Matchweek 23    90
Matchweek 24    90
Matchweek 25    90
Matchweek 27    90
Matchweek 2     90
Matchweek 28    90
Matchweek 29    90
Matchweek 30    90
Matchweek 31    90
Matchweek 32    90
Matchweek 33    90
Matchweek 19    90
Matchweek 18    90
Matchweek 17    90
Matchweek 16    90
Matchweek 15    90
Matchweek 14    90
Matchweek 13    90
Matchweek 12    90
Matchweek 11    90
Matchweek 10    90
Matchweek 9     90
Matchweek 8     90
Matchweek 7     90
Matchweek 6     90
Matchweek 5     90
Matchweek 4     90
Matchweek 3     90
Matchweek 34    90
Name: round, dtype: int64

In [9]:
del matches["comp"]
del matches["notes"]

In [10]:
matches.loc[matches.attendance.isna()==True, 'attendance'] = round(matches.attendance.mean(), 0)

In [11]:
matches.loc[matches.dist.isna()==True, 'dist'] = round(matches.dist.mean(), 0)

In [12]:
matches.loc[matches.wwages.isna()==True, 'wwages'] = round(matches.wwages.mean(), 0)

In [13]:
matches.loc[matches.awages.isna()==True, 'awages'] = round(matches.awages.mean(), 0)

In [14]:
matches["target"] = (matches["result"] == "W").astype("int")

#W = 1, D/L = 0

In [15]:
matches["date"] = pd.to_datetime(matches["date"])

#Konverzija date iz object u datetime

In [16]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

#Nova kolona venue_code koja je zapravo konverzija objekta venue u int vrijednost (Home = 1, Away = 0)

In [17]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

#Nova kolona opp_code koja svaki tim predstavlja int vrijednoscu (npr. Bayern Munich = 0, Nurnberg = 1...)

In [18]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

#Nova kolona hour koja pretvara vrijeme u int vrijednost (16:30 = 16, 15:45 = 15...)

In [19]:
matches["day_code"] = matches["date"].dt.dayofweek

#Nova kolona day_code koja pretvara dan u int vrijednost (Monday = 0, Tuesday = 1...)

In [20]:
matches["captain_code"] = matches["captain"].astype("category").cat.codes

#Nova kolona captain_code koja pretvara captain-a u int

In [None]:
matches["referee_code"] = matches["referee"].astype("category").cat.codes

#Nova kolona referee_code koja pretvara referee-a u int

In [35]:
# matches["round_code"] = matches["round"].astype("category").cat.codes

In [21]:
matches["formation_code"] = matches["formation"].astype("category").cat.codes

In [22]:
matches["age_int"] = (round(matches["age"], 0)).astype("int")

In [23]:
def categorize_wwages(wwages):
    if wwages > 1400000:
        return 1
    elif 500000 < wwages <= 1400000:
        return 2
    else:
        return 3

In [24]:
def categorize_awages(awages):
    if awages > 100000000:
        return 1
    elif 3500000 < awages <= 100000000:
        return 2
    else:
        return 3

In [25]:
matches['wwages_category'] = matches['wwages'].apply(categorize_wwages)

In [26]:
matches['awages_category'] = matches['awages'].apply(categorize_awages)

In [27]:
predictors = ["venue_code", "opp_code", "hour", "day_code", "captain_code", "referee_code", "formation_code", "age_int", "wwages_category", "awages_category"]

In [28]:
train = matches[matches["date"] < '2023-01-24']
test = matches[matches["date"] > '2023-01-24']

# data za training je sve prije 24.01.2023.
# data za testing je sve poslije 24.01.2023.

In [29]:
test

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,sh,sot,dist,fk,pk,pkatt,season,team,wwages,awages,age,target,venue_code,opp_code,hour,day_code,captain_code,referee_code,formation_code,age_int,wwages_category,awages_category
17,2023-01-28,18:30,Matchweek 18,Sat,Home,D,1,1,Eint Frankfurt,1.6,0.6,67.0,75000.0,Thomas Müller,4-1-4-1,Sven Jablonski,Match Report,14.0,5.0,17.9,0.0,0.0,0.0,2023,Bayern Munich,5047500.0,262470000.0,26.6,0,1,6,18,5,133,26,9,27,1,1
18,2023-02-05,17:30,Matchweek 19,Sun,Away,W,4,2,Wolfsburg,0.5,2.2,48.0,30000.0,Thomas Müller,4-2-3-1,Harm Osmers,Match Report,9.0,4.0,19.6,0.0,0.0,0.0,2023,Bayern Munich,5047500.0,262470000.0,26.6,1,0,23,17,6,133,13,11,27,1,1
19,2023-02-11,15:30,Matchweek 20,Sat,Home,W,3,0,Bochum,3.7,0.3,64.0,75000.0,Thomas Müller,3-1-4-2,Matthias Jöllenbeck,Match Report,22.0,11.0,16.7,0.0,1.0,1.0,2023,Bayern Munich,5047500.0,262470000.0,26.6,1,1,3,15,5,133,18,0,27,1,1
20,2023-02-18,15:30,Matchweek 21,Sat,Away,L,2,3,M'Gladbach,1.1,2.4,47.0,54042.0,Thomas Müller,3-1-4-2,Tobias Welz,Match Report,13.0,5.0,18.4,0.0,0.0,0.0,2023,Bayern Munich,5047500.0,262470000.0,26.6,0,0,14,15,5,133,31,0,27,1,1
21,2023-02-26,17:30,Matchweek 22,Sun,Home,W,3,0,Union Berlin,3.6,0.4,69.0,75000.0,Thomas Müller,4-2-3-1,Marco Fritz,Match Report,20.0,9.0,13.8,0.0,0.0,0.0,2023,Bayern Munich,5047500.0,262470000.0,26.6,1,1,21,17,6,133,15,11,27,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2023-04-30,15:30,Matchweek 30,Sun,Away,L,0,2,Bayern Munich,0.1,2.7,20.0,75000.0,Marvin Plattenhardt,4-4-2,Patrick Ittrich,Match Report,2.0,1.0,13.3,0.0,0.0,0.0,2023,Hertha BSC,642692.0,33420000.0,26.0,0,0,2,15,6,89,19,16,26,2,2
608,2023-05-06,15:30,Matchweek 31,Sat,Home,W,2,1,Stuttgart,1.2,1.5,30.0,63443.0,Marvin Plattenhardt,4-4-2,Deniz Aytekin,Match Report,7.0,3.0,15.3,0.0,0.0,0.0,2023,Hertha BSC,642692.0,33420000.0,26.0,1,1,20,15,5,89,7,16,26,2,2
609,2023-05-12,20:30,Matchweek 32,Fri,Away,L,2,5,Köln,1.1,4.3,37.0,50000.0,Marvin Plattenhardt,4-4-2,Sven Jablonski,Match Report,9.0,3.0,15.4,0.0,0.0,0.0,2023,Hertha BSC,642692.0,33420000.0,26.0,0,0,12,20,4,89,26,16,26,2,2
610,2023-05-20,15:30,Matchweek 33,Sat,Home,D,1,1,Bochum,1.2,1.1,50.0,70692.0,Kevin-Prince Boateng,4-3-3,Felix Brych,Match Report,14.0,3.0,18.4,0.0,0.0,0.0,2023,Hertha BSC,642692.0,33420000.0,26.0,0,1,3,15,5,61,8,14,26,2,2


In [45]:
classifiers = [      
    RandomForestClassifier(n_estimators=54, min_samples_split=17, random_state=42, max_depth=9),
    AdaBoostClassifier(n_estimators=70, learning_rate=1.826, random_state=42),
    XGBClassifier(n_estimators=62, learning_rate=0.081, random_state=42)
]

In [46]:
for clf in classifiers:
        clf.fit(train[predictors], train["target"])
        accuracy = accuracy_score(test["target"], clf.predict(test[predictors]))
        precision = precision_score(test["target"], clf.predict(test[predictors]))
        print(clf, accuracy, precision)

RandomForestClassifier(max_depth=9, min_samples_split=17, n_estimators=54,
                       random_state=42) 0.7151898734177216 0.8181818181818182
AdaBoostClassifier(learning_rate=1.826, n_estimators=70, random_state=42) 0.7183544303797469 0.6790123456790124
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.081, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=62, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_

In [36]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=clf.predict(test[predictors])))

In [75]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

# sta smo predvidjeli tacno, a sta ne?

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,209,33
1,90,50
