# Serie A - Part 2: Matchdays & Standings

Now that we have all the match data, let's:
1. Calculate matchday numbers
2. Compute standings at each matchday
3. Enable historical comparisons

**This builds on the database created in Part 1.**

## Step 1: Import and Load Data

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

# Display settings: lift the column limit and let pandas pick the output
# width, so wide tables are printed without truncated columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# The check mark was mis-encoded in the original message; restored here.
print("✓ Libraries loaded!")

In [None]:
# Open the SQLite database file that holds the match data.
DB_PATH = "data/serie_a.db"
engine = create_engine("sqlite:///" + DB_PATH)

# Read the whole matches table, then parse the date column into timestamps
# so it can be sorted and differenced later on.
matches = pd.read_sql("SELECT * FROM matches", engine)
matches['date'] = pd.to_datetime(matches['date'])

season_labels = sorted(matches['season_label'].unique())
print(f"Loaded {len(matches)} matches from database")
print(f"Seasons: {season_labels}")

## Step 2: Calculate Matchday Numbers

Since the data doesn't include matchday numbers, we need to calculate them based on dates.

**Logic:** Within each season, sort the matches by date and start a new matchday ("round") whenever more than 3 days separate two consecutive matches (a full Serie A matchday typically has 10 matches).

In [None]:
def calculate_matchday(season_df, max_gap_days=3):
    """
    Calculate matchday numbers for a single season.

    Matches are ordered by date; a new matchday starts at the first match
    and whenever the gap to the previous match exceeds ``max_gap_days``
    whole days.

    Args:
        season_df: DataFrame with a datetime ``date`` column holding the
            matches of one season. It is not modified.
        max_gap_days: Largest gap, in days, between consecutive matches
            that still counts as the same matchday. Defaults to 3.

    Returns:
        A DataFrame with a single integer ``matchday`` column (numbered
        from 1). Its index holds the original row labels of ``season_df``
        in date order, so the result can be aligned back onto the input
        by index.
    """
    ordered = season_df.sort_values('date')

    # Whole days elapsed since the previous match; NaN for the first row.
    gap_days = ordered['date'].diff().dt.days

    # The first row (NaN gap) always opens matchday 1.
    starts_matchday = gap_days.isna() | (gap_days > max_gap_days)

    # Each True bumps the running count, which is the matchday number.
    return starts_matchday.cumsum().to_frame('matchday')

# Apply to each season
print("Calculating matchdays...")
matchday_data = pd.concat(
    [calculate_matchday(season_df) for _, season_df in matches.groupby('season_label')]
)

# calculate_matchday returns rows grouped by season and sorted by date, which
# is generally NOT the row order of `matches`. Assign by index label (not by
# position via `.values`) so each match receives its own matchday number.
matches['matchday'] = matchday_data['matchday']

print("✓ Matchdays calculated!")

## Step 3: Verify Matchday Calculation

In [None]:
# Sanity check on one season: for every computed matchday of 2023-2024,
# show the first and last match date and how many matches it contains.
season_23 = matches[matches['season_label'] == '2023-2024']

matchday_summary = season_23.groupby('matchday').agg(**{
    'First Match': ('date', 'min'),
    'Last Match': ('date', 'max'),
    'Num Matches': ('home_team', 'count'),
})

print("2023-2024 Season - Matchday Summary:")
print(matchday_summary.head(15))

In [None]:
# List the individual results assigned to matchday 11 of 2023-2024
md11 = season_23.loc[season_23['matchday'] == 11]
result_cols = ['date', 'home_team', 'away_team', 'home_goals', 'away_goals']
print("\nMatchday 11 - 2023-2024:")
print(md11[result_cols].to_string(index=False))

## Step 4: Calculate Standings by Matchday

This is the core function - it calculates the cumulative standings after each matchday.

In [None]:
def _record_result(home_stats, away_stats, home_goals, away_goals):
    """Add one finished match to both teams' running totals (in place)."""
    home_stats['played'] += 1
    away_stats['played'] += 1
    home_stats['goals_for'] += home_goals
    home_stats['goals_against'] += away_goals
    away_stats['goals_for'] += away_goals
    away_stats['goals_against'] += home_goals

    if home_goals == away_goals:
        # Draw: one point each.
        for stats in (home_stats, away_stats):
            stats['drawn'] += 1
            stats['points'] += 1
        return

    # Decisive result: three points to the winner, none to the loser.
    if home_goals > away_goals:
        winner, loser = home_stats, away_stats
    else:
        winner, loser = away_stats, home_stats
    winner['won'] += 1
    winner['points'] += 3
    loser['lost'] += 1


def calculate_season_standings(season_df):
    """
    Calculate cumulative standings at each matchday for a season.

    Args:
        season_df: DataFrame of one season's matches with the columns
            ``matchday``, ``date``, ``home_team``, ``away_team``,
            ``home_goals`` and ``away_goals``. It is not modified.

    Returns:
        A DataFrame with one row per team per matchday and the columns
        ``team``, ``played``, ``won``, ``drawn``, ``lost``, ``goals_for``,
        ``goals_against``, ``points``, ``goal_diff``, ``position`` and
        ``matchday``. Teams are ranked by points, then goal difference,
        then goals scored; teams still level are ordered by name so the
        result is reproducible. Every team of the season appears in every
        snapshot, including matchdays on which it did not play. An input
        without any matchday yields an empty frame with the same columns.

    Notes:
        Matches whose score is missing (NaN) are treated as not yet played
        and are skipped instead of being counted as draws.
    """
    stat_columns = ['played', 'won', 'drawn', 'lost',
                    'goals_for', 'goals_against', 'points']
    output_columns = ['team'] + stat_columns + ['goal_diff', 'position', 'matchday']

    season_df = season_df.sort_values(['matchday', 'date'])

    # Every club that appears as home or away side at any point of the season.
    all_teams = set(season_df['home_team'].unique()) | set(season_df['away_team'].unique())

    # Running totals, updated matchday after matchday.
    team_stats = {team: dict.fromkeys(stat_columns, 0) for team in all_teams}

    standings_list = []

    # groupby yields the matchdays in ascending order.
    for matchday, matchday_games in season_df.groupby('matchday'):
        fixtures = zip(
            matchday_games['home_team'],
            matchday_games['away_team'],
            matchday_games['home_goals'],
            matchday_games['away_goals'],
        )
        for home, away, home_goals, away_goals in fixtures:
            # A missing score would otherwise fall through every comparison,
            # be booked as a draw and turn the goal totals into NaN.
            if pd.isna(home_goals) or pd.isna(away_goals):
                continue
            _record_result(
                team_stats[home], team_stats[away],
                int(home_goals), int(away_goals),
            )

        # Snapshot of the table after this matchday.
        snapshot = pd.DataFrame.from_dict(team_stats, orient='index')
        snapshot.index.name = 'team'
        snapshot = snapshot.reset_index()
        snapshot['goal_diff'] = snapshot['goals_for'] - snapshot['goals_against']

        # Team name is the final key: without it, fully tied teams would be
        # ordered by set iteration order, which can change between runs.
        snapshot = snapshot.sort_values(
            by=['points', 'goal_diff', 'goals_for', 'team'],
            ascending=[False, False, False, True],
            ignore_index=True,
        )

        snapshot['position'] = range(1, len(snapshot) + 1)
        snapshot['matchday'] = matchday

        standings_list.append(snapshot)

    if not standings_list:
        # Nothing to concatenate (no matches, or no usable matchday values).
        return pd.DataFrame(columns=output_columns)

    return pd.concat(standings_list, ignore_index=True)

print("✓ Function defined!")

## Step 5: Calculate Standings for All Seasons

**This will take 30-60 seconds** - we're processing 17 seasons!

In [None]:
from tqdm import tqdm

all_standings = []
failed_seasons = []

seasons = sorted(matches['season_label'].unique())
print(f"Processing {len(seasons)} seasons...\n")

for season in tqdm(seasons, desc="Calculating standings"):
    season_df = matches[matches['season_label'] == season]

    try:
        standings = calculate_season_standings(season_df)
    except Exception as e:
        # Best effort: one bad season must not abort the others, but keep
        # track of it so the gap is reported in the summary below.
        print(f"Error processing {season}: {e}")
        failed_seasons.append(season)
        continue

    standings['season'] = season
    all_standings.append(standings)

if not all_standings:
    # pd.concat would fail with a generic message; say what actually happened.
    raise ValueError("No season standings could be calculated; see the errors above.")

# Combine all standings
complete_standings = pd.concat(all_standings, ignore_index=True)

print(f"\n✅ Complete! Generated {len(complete_standings)} standing records")
print(f"   Seasons: {complete_standings['season'].nunique()}")
# This is the largest matchday number seen in any season, not a total count.
print(f"   Highest matchday: {complete_standings['matchday'].max()}")
if failed_seasons:
    print(f"   Skipped seasons (errors): {failed_seasons}")

## Step 6: Preview the Standings Data

In [None]:
# Inspect the standings table: its column names first, then a sample of rows
print("Columns in standings:")
print(list(complete_standings.columns))
print("\nFirst few records:")
# Last expression of the cell, so the notebook renders it as a table.
complete_standings.head(10)

## Step 7: Test Query - Matchday 11 Across Seasons

This is exactly what you wanted! Let's compare matchday 11 standings across different seasons.

In [None]:
# Keep only the snapshots taken after matchday 11
md11_all = complete_standings.loc[complete_standings['matchday'] == 11]

# The five most recent seasons that have a matchday-11 snapshot
recent_seasons = sorted(md11_all['season'].unique())[-5:]

display_cols = ['position', 'team', 'played', 'won', 'drawn', 'lost', 'goals_for', 'goals_against', 'goal_diff', 'points']
banner = '=' * 60

for season in recent_seasons:
    print(f"\n{banner}")
    print(f"  {season} - Matchday 11 Standings (Top 5)")
    print(f"{banner}")

    # Rows are stored in ranking order within each snapshot, so the first
    # five rows of a season's matchday are its top five.
    season_data = md11_all[md11_all['season'] == season].head(5)

    print(season_data[display_cols].to_string(index=False))

## Step 8: Save Standings to Database

In [None]:
# Save standings table (any existing "standings" table is replaced)
complete_standings.to_sql("standings", con=engine, if_exists="replace", index=False)
print("✅ Saved standings to database")

# Also rewrite the matches table so it includes the matchday column
matches.to_sql("matches", con=engine, if_exists="replace", index=False)
print("✅ Updated matches table with matchday numbers")

## Step 9: Quick Database Summary

In [None]:
# Show what's in our database
from sqlalchemy import inspect

inspector = inspect(engine)
tables = inspector.get_table_names()

print("📊 Database Summary:")
print(f"\nTables: {tables}")

for table in tables:
    # Quote the identifier so table names containing spaces, keywords or
    # quotes still produce valid SQL.
    quoted_table = '"' + table.replace('"', '""') + '"'
    count = pd.read_sql(f"SELECT COUNT(*) AS count FROM {quoted_table}", engine).iloc[0]['count']
    print(f"  - {table}: {count:,} rows")

## 🎯 Success!

Your database now has:
- ✅ **matches** table: All match results with matchday numbers
- ✅ **standings** table: Cumulative standings at each matchday for every season

**You can now query:**
- "Show me where Inter was at matchday 11 in 2020-2021"
- "Compare the top 4 at matchday 15 across the last 5 seasons"
- "What was the standing after matchday 20 in 2015-2016?"

**Next step:** Build the Streamlit dashboard! 🚀

---

## Bonus: Example Queries

Here are some example queries you can run:

In [None]:
# Example 1: follow one club's league position matchday by matchday
is_inter = complete_standings['team'] == 'Inter'
in_2023_24 = complete_standings['season'] == '2023-2024'
inter_2023 = complete_standings[is_inter & in_2023_24]

journey_cols = ['matchday', 'position', 'played', 'points', 'goal_diff']
print("Inter's journey through 2023-2024:")
print(inter_2023[journey_cols].head(15).to_string(index=False))

In [None]:
# Example 2: the first-placed team after matchday 11, season by season
at_md11 = complete_standings['matchday'] == 11
in_first_place = complete_standings['position'] == 1
leaders_md11 = complete_standings[at_md11 & in_first_place]

leader_cols = ['season', 'team', 'played', 'points', 'goal_diff']
print("League leaders at matchday 11:")
print(leaders_md11[leader_cols].to_string(index=False))