In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB


data = pd.read_csv('matches_serie_A.csv')
#removing unnecessary columns
clean_data=data.drop(['Notes', 'Match Report','Comp','Captain','Referee','Round'], axis=1)
clean_data.isnull().sum()

Unnamed: 0         0
Date               0
Time               0
Day                0
Venue              0
Result             0
GF                 0
GA                 0
Opponent           0
xG                 2
xGA                2
Poss               0
Attendance       684
Formation          0
Opp Formation      0
Sh                 0
SoT                0
Dist               4
FK                 2
PK                 0
PKatt              0
Season             0
Team               0
dtype: int64

In [2]:
# Fill Attendance
home_avg_attendance = clean_data[clean_data['Venue'] == 'Home'].groupby('Team')['Attendance'].mean()

# Filling missing attendances using home team avg attendacne
def fill_attendance(row):
    if pd.isna(row['Attendance']):
        if row['Venue'] == 'Home':
            return home_avg_attendance.get(row['Team'], clean_data['Attendance'].mean())
        else:
            return home_avg_attendance.get(row['Opponent'], clean_data['Attendance'].mean())
    return row['Attendance']

clean_data['Attendance'] = clean_data.apply(fill_attendance, axis=1)

# Fill missing numeric stats with medians
for col in ['xG', 'xGA', 'Dist', 'FK']:
    clean_data[col] = clean_data[col].fillna(clean_data[col].median()) 

# Encode Result from string to ints (our prediction feature)
clean_data['Result'] = clean_data['Result'].map({'W': 2, 'D': 1, 'L': 0})

# Date processing
clean_data['Date'] = pd.to_datetime(clean_data['Date'], format='mixed', dayfirst=True)
clean_data = clean_data.sort_values(['Team', 'Date']).reset_index(drop=True)
clean_data['Month'] = clean_data['Date'].dt.month

#season stage encoding
def season_stage(month): 
    if month in [8, 9, 10, 11]: return 0
    elif month in [12, 1, 2]: return 1
    else: return 2
clean_data['SeasonStage'] = clean_data['Month'].apply(season_stage)

# Venue encoding from strings to int
clean_data['Venue'] = clean_data['Venue'].map({'Home': 1, 'Away': 0})

# Time encoding into int
clean_data['Hour'] = pd.to_datetime(clean_data['Time'], format="%H:%M").dt.hour
clean_data['TimeBinary'] = ((clean_data['Hour'] >= 18) | (clean_data['Hour'] < 6)).astype(int)
clean_data.drop(['Time', 'Hour'], axis=1, inplace=True)

# Day encoding from string to int
day_mapping = {"Mon": 0, "Tue": 1, "Wed": 2, "Thu": 3, "Fri": 4, "Sat": 5, "Sun": 6}
clean_data['Day'] = clean_data['Day'].map(day_mapping)

# Formation encoding for a uniform formation
def clean_and_split_formation(formation):
    if pd.isna(formation): return [0, 0, 0, 0, 0]
    formation = re.sub(r"[^0-9\-]", "", str(formation))
    parts = [int(x) for x in formation.split('-') if x.isdigit()]
    while len(parts) < 5:
        parts.append(0)
    return parts[:5]

for col in ['Formation', 'Opp Formation']:
    split_data = clean_data[col].apply(clean_and_split_formation)
    clean_data[[f"{col}_{i}" for i in range(1,6)]] = pd.DataFrame(split_data.tolist(), index=clean_data.index)
clean_data.drop(['Formation', 'Opp Formation'], axis=1, inplace=True)


In [3]:
# Feature engineering 
stats_to_avg = ['xG', 'xGA', 'SoT', 'Sh', 'Poss', 'Dist', 'GF', 'GA'] # Average the relevant stats

# Home/Away averages
def calc_avg(df, stat, venue): # Group results
    return (
        df[df['Venue'] == venue]
        .groupby('Team')[stat]
        .transform(lambda x: x.shift(1).expanding().mean())
    )

for stat in stats_to_avg:
    clean_data[f'Home_{stat}_avg'] = calc_avg(clean_data, stat, 1)
    clean_data[f'Away_{stat}_avg'] = calc_avg(clean_data, stat, 0)

# Recent form 
clean_data['WinRate_last5'] = ( # Rolling average win statistics for past 5 matches
    clean_data.groupby('Team')['Result']
    .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean() / 2)
).fillna(0)

# Computes average statistics for past 5 matches in the 4 most important goals related stats
for stat in ['GF', 'GA', 'xG', 'xGA']: 
    clean_data[f'{stat}_last5'] = (
        clean_data.groupby('Team')[stat]
        .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())
    )

# Win, draw , lose streaks
def get_streak(series, value):
    streaks = []
    current = 0
    for res in series.shift(1):
        if res == value:
            current += 1
        else:
            current = 0
        streaks.append(current)
    return streaks

clean_data['Win_Streak'] = clean_data.groupby('Team')['Result'].transform(lambda x: get_streak(x, 2)) # New feature - current win streak
clean_data['Draw_Streak'] = clean_data.groupby('Team')['Result'].transform(lambda x: get_streak(x, 1)) # New feature - current draw streak
clean_data['Loss_Streak'] = clean_data.groupby('Team')['Result'].transform(lambda x: get_streak(x, 0)) # New feature - current lose streak

# Rest & fatigue days 
clean_data['RestDays'] = clean_data.groupby('Team')['Date'].diff().dt.days.fillna(7).clip(0, 14) # New feature - days passed from last match up to 14 days

games_last_10 = [] # New feature - games played in last 10 days for each team
for team, group in clean_data.groupby('Team'):
    g = group.copy()
    g = g.set_index('Date').sort_index()
    g['Games_last10d'] = g['Team'].rolling('10D').count().shift(1)
    games_last_10.append(g['Games_last10d'].reset_index(drop=True))
clean_data['Games_last10d'] = pd.concat(games_last_10, ignore_index=True).fillna(0)

# Elo rating 
def compute_elo(df, k=150, base=1500): # Assigns Elo rating for each team based on pre match data  , base elo is 1500
    ratings, pre = {}, []
    for _, row in df.iterrows():
        t, o = row['Team'], row['Opponent']
        ratings.setdefault(t, base)
        ratings.setdefault(o, base)
        pre.append(ratings[t])
        exp = 1 / (1 + 10 ** ((ratings[o] - ratings[t]) / 400)) # standard elo formula (from chess)
        ratings[t] += k * ((row['Result'] / 2) - exp)
    return pre

clean_data['Elo'] = compute_elo(clean_data, k=150)
last_elo = clean_data.groupby('Team')['Elo'].shift(1).fillna(1500)
opp_elo = clean_data.merge(
    clean_data[['Team', 'Date', 'Elo']],
    left_on=['Opponent', 'Date'],
    right_on=['Team', 'Date'],
    how='left', suffixes=('', '_opp')
)['Elo_opp'].fillna(1500)

clean_data['Elo_Diff'] = last_elo - opp_elo # Elo difference between teams
clean_data['OppElo_last5'] = clean_data.groupby('Team')['Elo_Diff'].transform(
    lambda x: x.shift(1).rolling(5, min_periods=1).mean()
).fillna(0)

# Opponent form & fatigue
def opp_feature(col, fill):
    return clean_data.merge(
        clean_data[['Team', 'Date', col]],
        left_on=['Opponent', 'Date'],
        right_on=['Team', 'Date'],
        how='left', suffixes=('', '_opp')
    )[col].fillna(fill)

clean_data['Opp_WinRate_last5'] = opp_feature('WinRate_last5', 0) #New feature - opponent win rate in last 5 matches
clean_data['Opp_RestDays'] = opp_feature('RestDays', 7) #new feature - opponent rest days
clean_data['Opp_Games_last10d'] = opp_feature('Games_last10d', 0) #New feature - opponent games played in last 10 days

# Last 30 days rolling avg 
def last_month(df, stat):
    out = []
    for team, grp in df.groupby('Team'):
        g = grp.set_index('Date')[stat].sort_index()
        out.append(g.rolling('30D', min_periods=1).mean().shift(1).reset_index(drop=True))
    return pd.concat(out).reset_index(drop=True)

for stat in stats_to_avg:
    clean_data[f'{stat}_last30'] = last_month(clean_data, stat)

# Head to head average
def head_to_head(df, stat):
    out = []
    for (t, o), grp in df.groupby(['Team', 'Opponent']):
        out.append(grp[stat].shift(1).expanding().mean().reset_index(drop=True))
    return pd.concat(out).reset_index(drop=True)

for stat in stats_to_avg:
    clean_data[f'head_two_head_{stat}_avg'] = head_to_head(clean_data, stat)

# Home vs away diff
for stat in ['xG', 'GA']:
    clean_data[f'Home_{stat}_last5'] = clean_data.groupby(['Team', 'Venue'])[stat].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).mean()
    )
    diff = clean_data.groupby('Team')[f'Home_{stat}_last5'].transform('mean') - \
           clean_data.groupby('Team')[stat].transform('mean')
    clean_data[f'{stat}_HomeAwayDiff'] = diff.fillna(0)


clean_data['xG_Momentum'] = clean_data['xG_last5'] - clean_data['xG_last5'].shift(5).fillna(0) #new feature - xG momentum
clean_data['Poss_Momentum'] = clean_data['Poss_last30'] - clean_data['Poss_last30'].shift(5).fillna(0) #new feature - Possession momentum
clean_data['SoT_Momentum'] = clean_data['SoT_last30'] - clean_data['SoT_last30'].shift(5).fillna(0) #new feature - Shots on target momentum
clean_data['GoalDiff_last5'] = clean_data['GF_last5'] - clean_data['GA_last5'] #new feature - goal difference in last 5 matches
clean_data['Goal_Scoring_Rate_5'] = clean_data['GF_last5'] / 5 #new feature - goal scoring rate in last 5 matches
clean_data['Goal_Scoring_Rate_30'] = clean_data['GF_last30'] / 30 #new feature - goal scoring rate in last 30 matches

clean_data['TotalGoals'] = clean_data['GF'] + clean_data['GA'] #new feature - total goals scored and conceded
clean_data['Goals_last5'] = clean_data['GF_last5'] + clean_data['GA_last5'] #New feature - total goals scored and conceded in last 5 matches
clean_data['Attacking_Trend'] = clean_data['xG_last5'] - clean_data['xG_last5'].shift(5).fillna(0) #new feature - attacking trend in last 5 matches
clean_data['Defensive_Weakness'] = clean_data['GA_last5'] - clean_data['GA_last5'].shift(5).fillna(0) #new feature - defensive weakness in last 5 matches
clean_data['Opp_DefWeakness'] = opp_feature('GA_last5', clean_data['GA_last5'].mean()) #new feature - opponent defensive weakness in last 5 matches

fill_featrues = [
    '_avg', '_last5', '_last30', 'Streak',
    'Elo', 'Rest', 'Diff', 'Opp', 'Trend', 'Goals'
]
#filling null values
for c in clean_data.columns:
    if clean_data[c].dtype in ['float64', 'int64'] and any(k in c for k in fill_featrues):
        clean_data[c] = clean_data[c].fillna(clean_data[c].mean())

# Drop raw stats and Date, not needed anymore
clean_data.drop(columns=stats_to_avg + ['Date'], inplace=True, errors='ignore')


In [4]:
# feature selection
clean_data = pd.get_dummies(clean_data, columns=['Team','Opponent'], drop_first=True).dropna().reset_index(drop=True)
#droping the target column
X = clean_data.drop('Result', axis=1)
y = clean_data['Result']

# Feature importance just to pick top 20 features, using random forest
rf_fs = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf_fs.fit(X, y)
importances = rf_fs.feature_importances_
idx = np.argsort(importances)[::-1][:20]
top_features = X.columns[idx].tolist()
print("Selected Top 20 Features:", top_features)


Selected Top 20 Features: ['TotalGoals', 'Elo_Diff', 'Attendance', 'Elo', 'Poss_last30', 'OppElo_last5', 'Home_xG_last5', 'xG_last30', 'Dist_last30', 'Poss_Momentum', 'xG_last5', 'SoT_Momentum', 'Sh_last30', 'xGA_last30', 'head_two_head_Poss_avg', 'xGA_last5', 'head_two_head_Sh_avg', 'head_two_head_xGA_avg', 'head_two_head_Dist_avg', 'xG_Momentum']


In [5]:

# Train test split using the top features
features_to_use = X[top_features]
X_train, X_test, y_train, y_test = train_test_split(
    features_to_use, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize numeric features
num_cols = X_train.select_dtypes(include=[float, int]).columns
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Models we chosed
models = {
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# Cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("\nCross-Validation (Training Set)")
for name, model in models.items():
    f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1_weighted', n_jobs=-1)
    acc_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    print(f"{name:20s} | F1: {f1_scores.mean():.3f} ± {f1_scores.std():.3f} | "
          f"Acc: {acc_scores.mean():.3f} ± {acc_scores.std():.3f}")

# Hyperparameter tuning
param_grids = {
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5]
    }

}

best_models = {}

for name in ['SVM', 'GradientBoosting']:
    grid = GridSearchCV(models[name], param_grids[name], cv=3, scoring='f1_weighted', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"{name} Best Params: {grid.best_params_}")

best_models['NaiveBayes'] = models['NaiveBayes'].fit(X_train, y_train)

# Final test set evaluation
print("\nTest Set Performance ")
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n--- {name} ---")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
    print("Precision:", round(precision_score(y_test, y_pred, average='weighted'), 3))
    print("Recall:", round(recall_score(y_test, y_pred, average='weighted'), 3))
    print("F1 Score:", round(f1_score(y_test, y_pred, average='weighted'), 3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))




Cross-Validation (Training Set)
SVM                  | F1: 0.506 ± 0.018 | Acc: 0.507 ± 0.018
NaiveBayes           | F1: 0.456 ± 0.016 | Acc: 0.493 ± 0.015
GradientBoosting     | F1: 0.609 ± 0.017 | Acc: 0.611 ± 0.016
Fitting 3 folds for each of 12 candidates, totalling 36 fits
SVM Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits
GradientBoosting Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Test Set Performance 

--- SVM ---
Accuracy: 0.51
Precision: 0.508
Recall: 0.51
F1 Score: 0.509
Confusion Matrix:
 [[152  57  65]
 [ 63  80  62]
 [ 67  56 153]]
Classification Report:
               precision    recall  f1-score   support

           0      0.539     0.555     0.547       274
           1      0.415     0.390     0.402       205
           2      0.546     0.554     0.550       276

    accuracy                          0.510       755
   macro avg      0.500     0.500     0.500       