In [1]:
# import required libraries

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score, confusion_matrix, roc_auc_score, classification_report, log_loss

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
matches = pd.read_csv("matches_laliga_v2.csv", index_col=0)

In [4]:
matches.shape

(3800, 30)

In [5]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,season,team,wwages,awages,age
0,2022-08-13,21:00,La Liga,Matchweek 1,Sat,Home,D,0.0,0.0,Rayo Vallecano,...,5.0,17.0,1.0,0.0,0.0,2023,Barcelona,5163654.0,268510000.0,25.9
1,2022-08-21,22:00,La Liga,Matchweek 2,Sun,Away,W,4.0,1.0,Real Sociedad,...,7.0,14.6,0.0,0.0,0.0,2023,Barcelona,5163654.0,268510000.0,25.9
2,2022-08-28,19:30,La Liga,Matchweek 3,Sun,Home,W,4.0,0.0,Valladolid,...,9.0,14.4,1.0,0.0,0.0,2023,Barcelona,5163654.0,268510000.0,25.9
3,2022-09-03,21:00,La Liga,Matchweek 4,Sat,Away,W,3.0,0.0,Sevilla,...,5.0,16.0,2.0,0.0,0.0,2023,Barcelona,5163654.0,268510000.0,25.9
4,2022-09-10,18:30,La Liga,Matchweek 5,Sat,Away,W,4.0,0.0,Cádiz,...,8.0,14.9,0.0,0.0,0.0,2023,Barcelona,5163654.0,268510000.0,25.9


In [6]:
matches["team"].value_counts()

Barcelona          190
Celta Vigo         190
Atletico Madrid    190
Real Sociedad      190
Villarreal         190
Real Betis         190
Athletic Club      190
Valencia           190
Getafe             190
Real Madrid        190
Sevilla            190
Valladolid         152
Alaves             152
Levante            152
Osasuna            152
Espanyol           152
Elche              114
Eibar              114
Granada            114
Cadiz              114
Rayo Vallecano     114
Mallorca           114
Girona              76
Huesca              76
Leganes             76
Almeria             38
Name: team, dtype: int64

In [7]:
matches.isna().sum()

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance       974
captain            0
formation          0
referee            0
match report       0
notes           3800
sh                 0
sot                0
dist               2
fk                 0
pk                 0
pkatt              0
season             0
team               0
wwages            38
awages            38
age                0
dtype: int64

In [8]:
matches["round"].value_counts()

Matchweek 1     100
Matchweek 29    100
Matchweek 22    100
Matchweek 23    100
Matchweek 24    100
Matchweek 25    100
Matchweek 26    100
Matchweek 27    100
Matchweek 28    100
Matchweek 30    100
Matchweek 2     100
Matchweek 31    100
Matchweek 32    100
Matchweek 33    100
Matchweek 34    100
Matchweek 35    100
Matchweek 36    100
Matchweek 37    100
Matchweek 21    100
Matchweek 20    100
Matchweek 17    100
Matchweek 19    100
Matchweek 3     100
Matchweek 4     100
Matchweek 5     100
Matchweek 6     100
Matchweek 7     100
Matchweek 8     100
Matchweek 9     100
Matchweek 10    100
Matchweek 11    100
Matchweek 12    100
Matchweek 13    100
Matchweek 14    100
Matchweek 15    100
Matchweek 16    100
Matchweek 18    100
Matchweek 38    100
Name: round, dtype: int64

In [9]:
del matches["comp"]
del matches["notes"]

In [10]:
matches.loc[matches.attendance.isna()==True, 'attendance'] = round(matches.attendance.mean(), 0)

In [11]:
matches.loc[matches.dist.isna()==True, 'dist'] = round(matches.dist.mean(), 0)

In [12]:
matches.loc[matches.wwages.isna()==True, 'wwages'] = round(matches.wwages.mean(), 0)

In [13]:
matches.loc[matches.awages.isna()==True, 'awages'] = round(matches.awages.mean(), 0)

In [14]:
matches["target"] = (matches["result"] == "W").astype("int")

In [15]:
matches["date"] = pd.to_datetime(matches["date"])

In [16]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [17]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [18]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [19]:
matches["day_code"] = matches["date"].dt.dayofweek

In [20]:
matches["captain_code"] = matches["captain"].astype("category").cat.codes
matches["referee_code"] = matches["referee"].astype("category").cat.codes

In [21]:
# matches["comp_code"] = matches["comp"].astype("category").cat.codes

In [22]:
# matches["round_code"] = matches["round"].astype("category").cat.codes

In [23]:
matches["formation_code"] = matches["formation"].astype("category").cat.codes

In [24]:
matches["age_int"] = (round(matches["age"], 0)).astype("int")

In [25]:
def categorize_wwages(wwages):
    if wwages > 2000000:
        return 1
    elif 800000 < wwages <= 2000000:
        return 2
    else:
        return 3

In [192]:
def categorize_awages(awages):
    if awages > 150000000:
        return 1
    elif 4000000 < awages <= 150000000:
        return 2
    else:
        return 3

In [193]:
matches['wwages_category'] = matches['wwages'].apply(categorize_wwages)

In [194]:
matches['awages_category'] = matches['awages'].apply(categorize_awages)

In [195]:
predictors = ["venue_code", "opp_code", "hour", "day_code", "captain_code", "referee_code", "formation_code", "age_int", "wwages_category", "awages_category"]

In [196]:
X_train,X_test,y_train,y_test = train_test_split(matches[predictors], matches['target'], test_size=0.2, stratify = matches['target'], random_state=42)

# X_train,X_test,y_train,y_test = train_test_split(matches_rolling[predictors], matches_rolling['target'], test_size=0.2, stratify = matches_rolling['target'], random_state=42)

train = matches[matches["date"] < '2023-02-01']
test = matches[matches["date"] > '2023-02-01']

In [197]:
test

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,venue_code,opp_code,hour,day_code,captain_code,referee_code,formation_code,age_int,wwages_category,awages_category
19,2023-02-05,21:00,Matchweek 20,Sun,Home,W,3.0,0.0,Sevilla,2.7,...,1,22,21,6,157,20,16,26,1,1
20,2023-02-12,21:00,Matchweek 21,Sun,Away,W,1.0,0.0,Villarreal,1.7,...,0,25,21,6,110,3,16,26,1,1
21,2023-02-19,21:00,Matchweek 22,Sun,Home,W,2.0,0.0,Cádiz,2.2,...,1,7,21,6,155,25,16,26,1,1
22,2023-02-26,18:30,Matchweek 23,Sun,Away,L,0.0,1.0,Almería,1.4,...,0,1,18,6,157,24,16,26,1,1
23,2023-03-05,16:15,Matchweek 24,Sun,Home,W,1.0,0.0,Valencia,1.8,...,1,23,16,6,157,1,16,26,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2023-05-14,16:15,Matchweek 34,Sun,Home,W,1.0,0.0,Atlético Madrid,1.4,...,1,3,16,6,46,4,3,28,3,2
756,2023-05-20,18:30,Matchweek 35,Sat,Away,D,1.0,1.0,Getafe,1.1,...,0,11,18,5,46,15,20,28,3,2
757,2023-05-24,19:30,Matchweek 36,Wed,Home,D,1.0,1.0,Sevilla,1.5,...,1,22,19,2,46,13,18,28,3,2
758,2023-05-28,19:00,Matchweek 37,Sun,Away,W,1.0,0.0,Athletic Club,0.9,...,0,2,19,6,46,28,20,28,3,2


In [198]:
# dtree = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=42)

In [199]:
# start = 0.001
# end = 2.0
# step = 0.001

# result = []
# current_value = start

# while current_value <= end:
#     result.append(current_value)
#     current_value += step

In [204]:
classifiers = [
#     KNeighborsClassifier(2),
#     KNeighborsClassifier(3),
#     KNeighborsClassifier(4),
#     KNeighborsClassifier(5), ne valjaju ovi nikako
#     LogisticRegression(), svi su 61%, nema razlike
#     SVC(kernel="linear", C=0.025, probability=True),
#     SVC(kernel="linear", C=0.05, probability=True),
#     SVC(kernel="linear", C=0.075, probability=True),
#     SVC(gamma=2, C=1, probability=True),
#     SVC(gamma=1, C=1, probability=True),
#     SVC(gamma=2, C=2, probability=True),
#     SVC(gamma=1, C=2, probability=True),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
#     GaussianProcessClassifier(1.0 * RBF(1.1)),
#     GaussianProcessClassifier(1.0 * RBF(1.2)),
#     DecisionTreeClassifier(max_depth=4),
#     DecisionTreeClassifier(max_depth=5),
#     DecisionTreeClassifier(max_depth=6),
#     DecisionTreeClassifier(max_depth=8),
    ## n_estimators = 43 najbolji rez
    ## min_samples_split = 11 najbolji rez
    
#     AdaBoostClassifier(learning_rate=1.544, n_estimators=52, random_state=42) -> 0.6894736842105263
    
#     AdaBoostClassifier(learning_rate=1.53, n_estimators=45, random_state=42) -> 0.6881578947368421

#     AdaBoostClassifier(learning_rate=1.55, n_estimators=52, random_state=42) -> 0.6855263157894737
    
#     AdaBoostClassifier(learning_rate=1.5, n_estimators=46, random_state=42) -> 0.6789473684210526
    
#     RandomForestClassifier(n_estimators=53, min_samples_split=11, random_state=42, max_depth=9) -> 0.6710526315789473
    
#     min_samples_split=14, random_state=42, max_depth=13)
#     min_samples_split=14, random_state=42, max_depth=14)    
    
#     AdaBoostClassifier(learning_rate=1.3164, n_estimators=61, random_state=42) -> 0.7072538860103627
#     train = matches[matches["date"] < '2023-01-15'],    test = matches[matches["date"] > '2023-01-15']

#     AdaBoostClassifier(learning_rate=1.3624, n_estimators=45, random_state=42) -> 0.7046632124352331
#     train = matches[matches["date"] < '2023-01-15'],    test = matches[matches["date"] > '2023-01-15']
    
#     AdaBoostClassifier(learning_rate=1.364, n_estimators=45, random_state=42) -> 0.7020725388601037
#     train = matches[matches["date"] < '2023-01-15'],    test = matches[matches["date"] > '2023-01-15']
      
    AdaBoostClassifier(n_estimators=45, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=46, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=47, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=48, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=49, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=50, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=51, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=52, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=53, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=54, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=55, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=56, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=57, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=58, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=59, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=60, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=61, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=62, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=63, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=64, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=65, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=66, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=67, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=68, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=69, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=70, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=71, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=72, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=73, random_state=42, learning_rate=1.654),
    AdaBoostClassifier(n_estimators=74, random_state=42, learning_rate=1.654) 
    
#     RandomForestClassifier(max_depth=10, n_estimators=15, max_features=1),
#     RandomForestClassifier(max_depth=12, n_estimators=20, max_features=1),
#     MLPClassifier(alpha=1, max_iter=1000),
#     MLPClassifier(alpha=1, max_iter=1100),
#     MLPClassifier(alpha=1, max_iter=1200),
#     AdaBoostClassifier(),
#     GaussianNB(),
#     QuadraticDiscriminantAnalysis()
]

In [205]:
for clf in classifiers:
#         clf.fit(X_train, y_train)
        clf.fit(train[predictors], train["target"])
#         accuracy = clf.score(X_test, y_test)
        accuracy = accuracy_score(test["target"], clf.predict(test[predictors]))
#         mae = mean_absolute_error(test["target"], clf.predict(test[predictors]))
#         mse = mean_squared_error(test["target"], clf.predict(test[predictors]))
#         rmse = mean_squared_error(test["target"], clf.predict(test[predictors]), squared=False)

#         prediction_proba = clf.predict_proba(X_test)
#         logloss = log_loss(y_test,prediction_proba)
#         precision = score(y_test, prediction_proba)
#         conf_martrix = confusion_matrix(y_test, prediction_proba)
#         clas_report = classification_report(y_test, prediction_proba)
        print(clf.n_estimators, accuracy)
#         print("MAE:", mae)
#         print("MSE:", mse)
#         print("RMSE:", rmse)
#         print("")

45 0.680628272251309
46 0.6780104712041884
47 0.680628272251309
48 0.6780104712041884
49 0.6884816753926701
50 0.7041884816753927
51 0.693717277486911
52 0.693717277486911
53 0.6910994764397905
54 0.680628272251309
55 0.680628272251309
56 0.680628272251309
57 0.6701570680628273
58 0.680628272251309
59 0.6910994764397905
60 0.693717277486911
61 0.6884816753926701
62 0.680628272251309
63 0.6727748691099477
64 0.6884816753926701
65 0.6780104712041884
66 0.6858638743455497
67 0.6780104712041884
68 0.675392670157068
69 0.6832460732984293
70 0.675392670157068
71 0.680628272251309
72 0.6780104712041884
73 0.680628272251309
74 0.675392670157068


In [None]:
# 50 0.7041884816753927 1.654

In [152]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=clf.predict(test[predictors])))

In [153]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,206,29
1,92,57


In [154]:
# grouped_matches = matches.groupby("team")

In [1744]:
# def rolling_averages(group, cols, new_cols):
#     group = group.sort_values("date") # sortiraj po datumu
#     rolling_stats = group[cols].rolling(3, closed='left').mean() # uzima mean rezultata od prethodna 3 matchweek-a
#     group[new_cols] = rolling_stats
#     group = group.dropna(subset=new_cols) # uklanja nedostajuce vrijednosti
#     return group

In [1745]:
# cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
# new_cols = [f"{c}_rolling" for c in cols]

# rolling averages uzima prosjecne vrijednosti ovih kolona u protekla 3 matchweek-a i na taj nacin predvidja ishod sljedeceg meca

In [1684]:
# matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [1685]:
# matches_rolling.shape

In [1686]:
# matches_rolling = matches_rolling.droplevel('team')

In [1683]:
# matches_rolling

In [1682]:
# matches_rolling.index = range(matches_rolling.shape[0])

In [1681]:
# matches_rolling