In [5]:
!mkdir -p data

!curl -L -o data/serie-a-matches-dataset-2020-2025.zip\
  https://www.kaggle.com/api/v1/datasets/download/marcelbiezunski/serie-a-matches-dataset-2020-2025

!unzip -o data/serie-a-matches-dataset-2020-2025.zip -d data
!rm data/serie-a-matches-dataset-2020-2025.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  124k  100  124k    0     0   177k      0 --:--:-- --:--:-- --:--:--  467k
Archive:  data/serie-a-matches-dataset-2020-2025.zip
  inflating: data//matches_seriea.csv  


In [None]:
!pip3 install pandas matplotlib seaborn jupyterlab numpy scikit-learn scipy

## Data Exploration

In [1]:
# Load the raw match rows; the CSV's first column is used as the row index.
import pandas as pd
dataset_path = 'data/matches_seriea.csv'

matches = pd.read_csv(dataset_path, index_col=0)
# Show the dtype pandas inferred for every column.
matches.dtypes

date                 str
time                 str
comp                 str
round                str
day                  str
venue                str
result               str
gf               float64
ga               float64
opponent             str
xg               float64
xga              float64
poss             float64
attendance       float64
captain              str
formation            str
opp formation        str
referee              str
match report         str
notes                str
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team                 str
dtype: object

In [2]:
# Remove the column display limit so wide frames are shown in full
# (this is a global pandas option and affects every later cell).
pd.set_option('display.max_columns', None)
# Preview the last five rows.
matches.tail()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,opp formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
36,2021-05-03,20:45,Serie A,Matchweek 34,Mon,Away,L,0.0,1.0,Torino,0.5,1.6,49.0,,Bruno Alves,4-3-1-2,3-1-4-2,Gianluca Aureliano,Match Report,,10.0,1.0,18.1,0.0,0,0,2020,Parma
37,2021-05-09,15:00,Serie A,Matchweek 35,Sun,Home,L,2.0,5.0,Atalanta,0.9,3.3,44.0,,Bruno Alves,4-1-3-2,3-4-3,Antonio Giua,Match Report,,7.0,2.0,18.7,0.0,0,0,2020,Parma
38,2021-05-12,20:45,Serie A,Matchweek 36,Wed,Away,L,0.0,1.0,Lazio,0.7,1.5,49.0,,Riccardo Gagliolo,3-5-2,3-5-2,Federico Dionisi,Match Report,,12.0,4.0,27.1,1.0,0,0,2020,Parma
39,2021-05-16,18:00,Serie A,Matchweek 37,Sun,Home,L,1.0,3.0,Sassuolo,1.0,2.0,34.0,,Bruno Alves,3-5-2,4-2-3-1,Marco Piccinini,Match Report,,16.0,8.0,16.8,0.0,0,0,2020,Parma
40,2021-05-22,20:45,Serie A,Matchweek 38,Sat,Away,L,0.0,3.0,Sampdoria,0.8,0.9,55.0,,Hernani,3-5-2,5-3-2,Daniele Paterna,Match Report,,12.0,1.0,23.6,2.0,0,0,2020,Parma


### Features list

In [3]:
# List every column name as a plain Python list.
matches.columns.tolist()

['date',
 'time',
 'comp',
 'round',
 'day',
 'venue',
 'result',
 'gf',
 'ga',
 'opponent',
 'xg',
 'xga',
 'poss',
 'attendance',
 'captain',
 'formation',
 'opp formation',
 'referee',
 'match report',
 'notes',
 'sh',
 'sot',
 'dist',
 'fk',
 'pk',
 'pkatt',
 'season',
 'team']

### Checking for nulls

In [4]:
# Count missing values per column.
matches.isnull().sum()

date                0
time                0
comp                0
round               0
day                 0
venue               0
result              0
gf                  0
ga                  0
opponent            0
xg                  2
xga                 2
poss                0
attendance        688
captain             0
formation           0
opp formation       0
referee             0
match report        0
notes            3898
sh                  0
sot                 0
dist                4
fk                  2
pk                  0
pkatt               0
season              0
team                0
dtype: int64

### Concatenate home and away rows for the same games

The `team` and `opponent` columns use different names for **Inter**:  
* In `team` it is stored as: **'Internazionale'**  
* In `opponent` as: **'Inter'**

In [5]:
# Collect the distinct club names used in each column; a name that appears
# on only one side points to a spelling mismatch between the two columns.
unique_teams = set(matches['team'].unique())
unique_opponents = set(matches['opponent'].unique())

# Names exclusive to one column.
in_team_not_opponent = unique_teams.difference(unique_opponents)
in_opponent_not_team = unique_opponents.difference(unique_teams)

print("Teams in 'team' but not 'opponent':", in_team_not_opponent)
print("Teams in 'opponent' but not 'team':", in_opponent_not_team)

Teams in 'team' but not 'opponent': {'Internazionale'}
Teams in 'opponent' but not 'team': {'Inter'}


In [6]:
# Clean 'round' column: "Matchweek 12" -> 12.
# astype(str) keeps this cell re-runnable: after the first run the column is
# already integer, and the .str accessor would otherwise raise AttributeError.
matches['round'] = matches['round'].astype(str).str.replace('Matchweek ', '', regex=False)
# Anything that is still non-numeric after stripping the prefix becomes round 0.
matches['round'] = pd.to_numeric(matches['round'], errors='coerce').fillna(0).astype(int)

# Fix team name inconsistencies
# 'Inter' in opponent column should be 'Internazionale' to match 'team' column
match_mapping = {'Inter': 'Internazionale'}
matches['opponent'] = matches['opponent'].replace(match_mapping)

# Every game appears twice in `matches` (once per side); split by venue.
home_matches = matches[matches['venue'] == 'Home'].copy()
away_matches = matches[matches['venue'] == 'Away'].copy()

# Columns to bring over from the away team's row, prefixed with 'opp '
cols_to_merge = ['captain', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
rename_map = {col: f'opp {col}' for col in cols_to_merge}

# Prepare away_matches for merge: select join keys + columns to transfer
# Note: In away_matches, 'team' is the away team, 'opponent' is the home team
away_subset = away_matches[['date', 'team', 'opponent'] + cols_to_merge].rename(columns=rename_map)

# Merge on date with the team/opponent roles swapped:
# home_matches['team'] matches away_subset['opponent'] (Home Team)
# home_matches['opponent'] matches away_subset['team'] (Away Team)
# validate='one_to_one' raises MergeError if a fixture key is duplicated on
# either side, instead of silently multiplying rows.
matches_combined = home_matches.merge(
    away_subset,
    left_on=['date', 'team', 'opponent'],
    right_on=['date', 'opponent', 'team'],
    suffixes=('', '_away'),
    validate='one_to_one'
)

# The default inner join drops home rows without an away counterpart;
# report that instead of losing games silently.
unmatched_home = len(home_matches) - len(matches_combined)
if unmatched_home:
    print(f"Warning: {unmatched_home} home rows had no matching away row and were dropped")

# Drop redundant columns from the merge keys of the right dataframe
matches_combined = matches_combined.drop(columns=['team_away', 'opponent_away'])

# Reset index
matches_combined = matches_combined.reset_index(drop=True)

# Sanity check: total games and number of games per round in each season
print(f"Total rows: {len(matches_combined)}")
print("Games per round per season:")
print(matches_combined[['season', 'round']].value_counts().sort_index().head(20))

matches_combined.head()

Total rows: 1951
Games per round per season:
season  round
2020    1        10
        2        10
        3        10
        4        10
        5        10
        6        10
        7        10
        8        10
        9        10
        10       10
        11       10
        12       10
        13       10
        14       10
        15       10
        16       10
        17       10
        18       10
        19       10
        20       10
Name: count, dtype: int64


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,opp formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team,opp captain,opp sh,opp sot,opp dist,opp fk,opp pk,opp pkatt
0,2025-08-23,20:45,Serie A,1,Sat,Home,L,1.0,2.0,Cremonese,1.8,0.2,64.0,75011.0,Mike Maignan,3-5-2,3-5-2,Giuseppe Collu,Match Report,,24.0,5.0,15.5,0.0,0,0,2025,Milan,Matteo Bianchetti,4.0,3.0,13.6,0.0,0,0
1,2025-09-14,20:45,Serie A,3,Sun,Home,W,1.0,0.0,Bologna,1.0,0.2,40.0,69593.0,Mike Maignan,3-4-3,4-3-3,Matteo Marcenaro,Match Report,,14.0,3.0,16.0,0.0,0,0,2025,Milan,Lewis Ferguson,5.0,0.0,20.4,0.0,0,0
2,2025-09-28,20:45,Serie A,5,Sun,Home,W,2.0,1.0,Napoli,0.7,2.5,37.0,73754.0,Mike Maignan,3-5-2,4-1-4-1,Daniele Chiffi,Match Report,,6.0,3.0,14.1,0.0,0,0,2025,Milan,Giovanni Di Lorenzo,18.0,6.0,18.0,0.0,1,1
3,2025-08-30,20:45,Serie A,2,Sat,Home,W,1.0,0.0,Cagliari,1.5,0.5,68.0,50497.0,Giovanni Di Lorenzo,4-1-4-1,3-5-1-1,Kevin Bonacina,Match Report,,20.0,6.0,15.4,0.0,0,0,2025,Napoli,Alessandro Deiola,10.0,1.0,26.8,1.0,0,0
4,2025-09-22,20:45,Serie A,4,Mon,Home,W,3.0,2.0,Pisa,1.1,2.4,63.0,50312.0,Giovanni Di Lorenzo,4-1-4-1,3-5-2,Valerio Crezzini,Match Report,,18.0,6.0,16.5,0.0,0,0,2025,Napoli,Antonio Caracciolo,14.0,5.0,16.2,0.0,1,1


#### Sort matches from most recent to oldest

In [7]:
# Shapes of the per-team frame and of the paired (one row per game) frame.
print(matches.shape)
print(matches_combined.shape)

# Newest first. 'date' and 'time' are strings, so the sort is lexicographic;
# for the YYYY-MM-DD / HH:MM values seen in this dataset that coincides with
# chronological order. Exact duplicate rows are removed after sorting.
matches_sorted = matches_combined.sort_values(by=['date', 'time'], ascending=False).drop_duplicates()
print('10 Latest matches in serie A in dataset:')
matches_sorted[['date', 'time', 'team', 'opponent', 'gf', 'ga']].head(10)

(3902, 28)
(1951, 35)
10 Latest matches in serie A in dataset:


Unnamed: 0,date,time,team,opponent,gf,ga
46,2025-09-29,20:45,Genoa,Lazio,0.0,3.0
34,2025-09-29,18:30,Parma,Torino,2.0,1.0
2,2025-09-28,20:45,Milan,Napoli,2.0,1.0
49,2025-09-28,18:00,Lecce,Bologna,2.0,2.0
7,2025-09-28,15:00,Roma,Hellas Verona,2.0,0.0
43,2025-09-28,15:00,Pisa,Fiorentina,0.0,0.0
32,2025-09-28,12:30,Sassuolo,Udinese,3.0,1.0
23,2025-09-27,20:45,Cagliari,Internazionale,0.0,2.0
10,2025-09-27,18:00,Juventus,Atalanta,1.0,1.0
20,2025-09-27,15:00,Como,Cremonese,1.0,1.0


#### Convert float columns to int

In [8]:
# Count-like columns that should be stored as integers.
cols_to_int = [
    'gf', 'ga', 'attendance',
    'sh', 'sot', 'fk', 'pk', 'pkatt',
    'opp sh', 'opp sot', 'opp fk', 'opp pk', 'opp pkatt',
]

# Missing values are replaced by 0 before casting, so a 0 in these columns
# can mean either "zero" or "not recorded".
zero_filled = matches_sorted[cols_to_int].fillna(0)
matches_sorted[cols_to_int] = zero_filled.astype(int)

# Confirm the resulting dtypes.
matches_sorted.dtypes.loc[cols_to_int]

gf            int64
ga            int64
attendance    int64
sh            int64
sot           int64
fk            int64
pk            int64
pkatt         int64
opp sh        int64
opp sot       int64
opp fk        int64
opp pk        int64
opp pkatt     int64
dtype: object

## Save preprocessed data to a Parquet file

In [9]:
!pip3 install pyarrow

Collecting pyarrow
  Downloading pyarrow-23.0.0-cp311-cp311-macosx_12_0_x86_64.whl (35.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.8/35.8 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-23.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
# Persist the processed frame as Parquet; index=False leaves the row index out of the file.
matches_sorted.to_parquet('data/serie_a_matches_processed.parquet', index=False)

# Season tables