In [1]:
# import required libraries

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score, confusion_matrix, roc_auc_score, classification_report, log_loss

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the match table; the first CSV column is used as the row index.
matches = pd.read_csv("matches_ligue1_v2.csv", index_col=0)

In [4]:
# (rows, columns) of the freshly loaded table.
matches.shape

(3598, 30)

In [5]:
# Preview the first rows to check that the columns were parsed as expected.
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,season,team,wwages,awages,age
0,2022-08-06,21:00,Ligue 1,Matchweek 1,Sat,Away,W,5.0,0.0,Clermont Foot,...,12.0,12.9,1.0,0.0,0.0,2023,Paris Saint Germain,6953654,361590000,26.9
1,2022-08-13,21:00,Ligue 1,Matchweek 2,Sat,Home,W,5.0,2.0,Montpellier,...,8.0,18.2,3.0,1.0,2.0,2023,Paris Saint Germain,6953654,361590000,26.9
2,2022-08-21,20:45,Ligue 1,Matchweek 3,Sun,Away,W,7.0,1.0,Lille,...,9.0,11.9,0.0,0.0,0.0,2023,Paris Saint Germain,6953654,361590000,26.9
3,2022-08-28,20:45,Ligue 1,Matchweek 4,Sun,Home,D,1.0,1.0,Monaco,...,4.0,18.7,0.0,1.0,1.0,2023,Paris Saint Germain,6953654,361590000,26.9
4,2022-08-31,21:00,Ligue 1,Matchweek 5,Wed,Away,W,3.0,0.0,Toulouse,...,12.0,14.8,2.0,0.0,0.0,2023,Paris Saint Germain,6953654,361590000,26.9


In [6]:
# Number of rows available per team.
matches["team"].value_counts()

Montpellier            180
Marseille              180
Rennes                 180
Lille                  180
Monaco                 180
Lyon                   180
Angers                 180
Nice                   180
Nantes                 180
Reims                  180
Paris Saint Germain    179
Strasbourg             179
Brest                  142
Saint Etienne          142
Bordeaux               142
Lens                   114
Lorient                114
Toulouse               104
Metz                   104
Nimes                  104
Dijon                  104
Troyes                  76
Clermont Foot           76
Amiens                  66
Auxerre                 38
Ajaccio                 38
Caen                    38
Guingamp                38
Name: team, dtype: int64

In [7]:
# Count of missing values in every column.
matches.isna().sum()

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance       780
captain            0
formation          0
referee            0
match report       0
notes           3598
sh                 0
sot                0
dist               1
fk                 0
pk                 0
pkatt              0
season             0
team               0
wwages             0
awages             0
age                0
dtype: int64

In [8]:
# Number of rows per matchweek label.
matches["round"].value_counts()

Matchweek 1     100
Matchweek 15    100
Matchweek 27    100
Matchweek 26    100
Matchweek 25    100
Matchweek 24    100
Matchweek 23    100
Matchweek 22    100
Matchweek 21    100
Matchweek 2     100
Matchweek 19    100
Matchweek 18    100
Matchweek 17    100
Matchweek 16    100
Matchweek 20    100
Matchweek 14    100
Matchweek 7     100
Matchweek 13    100
Matchweek 4     100
Matchweek 5     100
Matchweek 6     100
Matchweek 3     100
Matchweek 8     100
Matchweek 9     100
Matchweek 10    100
Matchweek 11    100
Matchweek 12    100
Matchweek 28     98
Matchweek 29     80
Matchweek 30     80
Matchweek 31     80
Matchweek 32     80
Matchweek 33     80
Matchweek 34     80
Matchweek 35     80
Matchweek 36     80
Matchweek 37     80
Matchweek 38     80
Name: round, dtype: int64

In [9]:
# Remove the two columns that are not used afterwards; per the missing-value
# summary above, "notes" is empty in every row.
matches = matches.drop(columns=["comp", "notes"])

In [10]:
# Replace missing attendance figures with the column mean, rounded to a whole number.
matches["attendance"] = matches["attendance"].fillna(round(matches["attendance"].mean(), 0))

In [11]:
# Replace the missing "dist" value(s) with the column mean, rounded to a whole number.
matches["dist"] = matches["dist"].fillna(round(matches["dist"].mean(), 0))

In [12]:
# Safeguard: fill any missing "wwages" with the rounded column mean. The
# missing-value summary above reports none, so this currently changes nothing.
matches["wwages"] = matches["wwages"].fillna(round(matches["wwages"].mean(), 0))

In [13]:
# Safeguard: fill any missing "awages" with the rounded column mean. The
# missing-value summary above reports none, so this currently changes nothing.
matches["awages"] = matches["awages"].fillna(round(matches["awages"].mean(), 0))

In [14]:
# Binary label: 1 when the row's result is "W", 0 for every other result.
matches["target"] = matches["result"].eq("W").astype("int")

In [15]:
# Convert the date column to datetimes; the day-of-week feature and the
# date-based train/test split later in the notebook rely on this.
matches["date"] = pd.to_datetime(matches["date"])

In [16]:
# Integer-encode the venue via pandas category codes (the hold-out preview
# further down shows Away -> 0 and Home -> 1).
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [17]:
# Integer-encode the opponent name via pandas category codes.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [18]:
# Kick-off hour: strip the ":" and everything after it from the time string
# (e.g. "21:00" -> "21"), then cast to int.
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [19]:
# Day of the week taken from the parsed date (Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [20]:
# Integer-encode captain and referee names via pandas category codes.
matches["captain_code"] = matches["captain"].astype("category").cat.codes
matches["referee_code"] = matches["referee"].astype("category").cat.codes

In [21]:
# matches["comp_code"] = matches["comp"].astype("category").cat.codes

In [22]:
# matches["round_code"] = matches["round"].astype("category").cat.codes

In [23]:
# Integer-encode the formation string via pandas category codes.
matches["formation_code"] = matches["formation"].astype("category").cat.codes

In [24]:
# Age column rounded to zero decimals and stored as an integer.
matches["age_int"] = matches["age"].round(0).astype("int")

In [25]:
def categorize_wwages(wwages):
    """Map a `wwages` value to a tier number.

    Returns 1 when the value is above 1,400,000, 2 when it is above 500,000
    (and at most 1,400,000), and 3 for everything else.
    """
    # Tiers are checked from the highest floor down; the first floor exceeded wins.
    for tier, floor in ((1, 1400000), (2, 500000)):
        if wwages > floor:
            return tier
    return 3

In [26]:
def categorize_awages(awages):
    """Map an `awages` value to a tier number.

    Returns 1 when the value is above 100,000,000, 2 when it is above
    20,000,000 (and at most 100,000,000), and 3 for everything else.
    """
    # Tiers are checked from the highest floor down; the first floor exceeded wins.
    for tier, floor in ((1, 100000000), (2, 20000000)):
        if awages > floor:
            return tier
    return 3

In [27]:
# Tier (1-3) of each row's "wwages" value, computed element by element.
matches['wwages_category'] = matches['wwages'].map(categorize_wwages)

In [28]:
# Tier (1-3) of each row's "awages" value, computed element by element.
matches['awages_category'] = matches['awages'].map(categorize_awages)

In [29]:
# Feature columns fed to every model, in the order they are passed to fit/predict.
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
    "captain_code",
    "referee_code",
    "formation_code",
    "age_int",
    "wwages_category",
    "awages_category",
]

In [30]:
# Random, stratified 80/20 split. Only the disabled experiments below use
# X_train/X_test/y_train/y_test; the active evaluation uses the date-based split.
X_train, X_test, y_train, y_test = train_test_split(
    matches[predictors],
    matches["target"],
    test_size=0.2,
    stratify=matches["target"],
    random_state=42,
)

# X_train,X_test,y_train,y_test = train_test_split(matches_rolling[predictors], matches_rolling['target'], test_size=0.2, stratify = matches_rolling['target'], random_state=42)

# Chronological split: fit on everything before the cutoff, evaluate on the rest.
# The test window includes the cutoff date itself; the previous strict "<" / ">"
# pair silently dropped any match played exactly on that day from both sets.
SPLIT_DATE = pd.Timestamp("2023-01-24")
train = matches[matches["date"] < SPLIT_DATE]
test = matches[matches["date"] >= SPLIT_DATE]

In [31]:
# Display the hold-out frame produced by the date-based split.
test

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,venue_code,opp_code,hour,day_code,captain_code,referee_code,formation_code,age_int,wwages_category,awages_category
19,2023-01-29,20:45,Matchweek 20,Sun,Home,D,1.0,1.0,Reims,1.6,...,1,22,20,6,102,31,12,27,1,1
20,2023-02-01,21:00,Matchweek 21,Wed,Away,W,3.0,1.0,Montpellier,4.4,...,0,17,21,2,102,18,9,27,1,1
21,2023-02-04,17:00,Matchweek 22,Sat,Home,W,2.0,1.0,Toulouse,1.9,...,1,26,17,5,102,5,8,27,1,1
22,2023-02-11,17:00,Matchweek 23,Sat,Away,L,1.0,3.0,Monaco,0.8,...,0,16,17,5,102,8,7,27,1,1
23,2023-02-19,13:00,Matchweek 24,Sun,Home,W,4.0,3.0,Lille,2.0,...,1,11,13,6,129,35,13,27,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2023-05-07,13:00,Matchweek 34,Sun,Home,L,1.0,2.0,Monaco,1.0,...,1,16,13,6,119,16,19,26,3,3
756,2023-05-14,20:45,Matchweek 35,Sun,Away,L,1.0,3.0,Marseille,0.3,...,0,14,20,6,119,7,19,26,3,3
757,2023-05-21,15:00,Matchweek 36,Sun,Away,D,2.0,2.0,Reims,1.2,...,0,22,15,6,57,10,19,26,3,3
758,2023-05-27,21:00,Matchweek 37,Sat,Home,W,2.0,1.0,Troyes,1.2,...,1,27,21,5,57,21,19,26,3,3


In [32]:
# dtree = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=42)

In [33]:
# start = 0.4
# end = 0.45
# step = 0.001

# result = []
# current_value = start

# while current_value <= end:
#     result.append(current_value)
#     current_value += step

In [136]:
# Experiment log: candidates tried earlier and currently disabled. A value
# after "->" is the accuracy that was recorded for that configuration.
#
#   KNeighborsClassifier(2), (3), (4), (5)        -- these do not work at all
#   LogisticRegression()                          -- all at 61%, no difference
#   SVC(kernel="linear", C=0.025, probability=True)
#   SVC(kernel="linear", C=0.05, probability=True)
#   SVC(kernel="linear", C=0.075, probability=True)
#   SVC(gamma=2, C=1, probability=True)
#   SVC(gamma=1, C=1, probability=True)
#   SVC(gamma=2, C=2, probability=True)
#   SVC(gamma=1, C=2, probability=True)
#   GaussianProcessClassifier(1.0 * RBF(1.0)), RBF(1.1), RBF(1.2)
#   DecisionTreeClassifier(max_depth=4), (5), (6), (8)
#
#   n_estimators = 43 gave the best result
#   min_samples_split = 11 gave the best result
#
#   AdaBoostClassifier(learning_rate=1.544, n_estimators=52, random_state=42) -> 0.6894736842105263
#   AdaBoostClassifier(learning_rate=1.53, n_estimators=45, random_state=42)  -> 0.6881578947368421
#   AdaBoostClassifier(learning_rate=1.55, n_estimators=52, random_state=42)  -> 0.6855263157894737
#   AdaBoostClassifier(learning_rate=1.5, n_estimators=46, random_state=42)   -> 0.6789473684210526
#   RandomForestClassifier(n_estimators=53, min_samples_split=11, random_state=42, max_depth=9) -> 0.6710526315789473
#   (fragments also noted: min_samples_split=14, random_state=42, max_depth=13 and max_depth=14)
#
#   With train = matches[matches["date"] < '2023-01-15'] and
#   test = matches[matches["date"] > '2023-01-15']:
#   AdaBoostClassifier(learning_rate=1.3164, n_estimators=61, random_state=42) -> 0.7072538860103627
#   AdaBoostClassifier(learning_rate=1.3624, n_estimators=45, random_state=42) -> 0.7046632124352331
#   AdaBoostClassifier(learning_rate=1.364, n_estimators=45, random_state=42)  -> 0.7020725388601037
#
#   RandomForestClassifier(max_depth=10, n_estimators=15, max_features=1)
#   RandomForestClassifier(max_depth=12, n_estimators=20, max_features=1)
#   MLPClassifier(alpha=1, max_iter=1000), (max_iter=1100), (max_iter=1200)
#   AdaBoostClassifier()
#   GaussianNB()
#   QuadraticDiscriminantAnalysis()

# Active sweep: AdaBoost with a fixed learning rate, varying only n_estimators.
# The order (75-84 first, then 55-74) is kept on purpose: later cells read the
# loop variable left over from iterating this list, so the last entry matters.
classifiers = [
    AdaBoostClassifier(n_estimators=n_estimators, random_state=42, learning_rate=1.913)
    for n_estimators in [*range(75, 85), *range(55, 75)]
]

In [137]:
# Fit each candidate on the chronological training window, score it on the
# hold-out window, and print one line per model: "<n_estimators> <accuracy>".
for clf in classifiers:
    clf.fit(train[predictors], train["target"])
    holdout_predictions = clf.predict(test[predictors])
    accuracy = accuracy_score(test["target"], holdout_predictions)
    print(clf.n_estimators, accuracy)

75 0.6763157894736842
76 0.6631578947368421
77 0.6605263157894737
78 0.6552631578947369
79 0.6394736842105263
80 0.6631578947368421
81 0.6526315789473685
82 0.6684210526315789
83 0.65
84 0.6736842105263158
55 0.6263157894736842
56 0.6394736842105263
57 0.6263157894736842
58 0.6289473684210526
59 0.6263157894736842
60 0.6289473684210526
61 0.65
62 0.6447368421052632
63 0.6578947368421053
64 0.6526315789473685
65 0.6657894736842105
66 0.6578947368421053
67 0.6789473684210526
68 0.6684210526315789
69 0.6763157894736842
70 0.6684210526315789
71 0.6736842105263158
72 0.6657894736842105
73 0.6763157894736842
74 0.6605263157894737


In [121]:
# Best of the sweep above: n_estimators=67, accuracy=0.6789473684210526, learning_rate=1.913

In [36]:
# Pair the hold-out labels with the model's predictions for inspection.
# NOTE(review): `clf` is whatever the evaluation loop left bound last, i.e. the
# final entry of `classifiers`, not necessarily the best-scoring model — confirm
# this is the model intended here.
combined = pd.DataFrame(dict(actual=test["target"], predicted=clf.predict(test[predictors])))

In [75]:
# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,209,33
1,90,50


In [76]:
# grouped_matches = matches.groupby("team")

In [1744]:
# def rolling_averages(group, cols, new_cols):
#     group = group.sort_values("date") # sort by date
#     rolling_stats = group[cols].rolling(3, closed='left').mean() # mean of the results from the previous 3 matchweeks
#     group[new_cols] = rolling_stats
#     group = group.dropna(subset=new_cols) # drop rows with missing values
#     return group

In [1745]:
# cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
# new_cols = [f"{c}_rolling" for c in cols]

# Rolling averages take the mean of these columns over the previous 3 matchweeks; those means are then used to predict the outcome of the next match.

In [1684]:
# matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [1685]:
# matches_rolling.shape

In [1686]:
# matches_rolling = matches_rolling.droplevel('team')

In [1683]:
# matches_rolling

In [1682]:
# matches_rolling.index = range(matches_rolling.shape[0])

In [1681]:
# matches_rolling