In [2]:
import pandas as pd
import numpy as np

In [3]:
# load in and clean up the data
df_2024 = pd.read_csv(r"C:\Users\Owner\dev\football-analytics\data-scraping\fbref\scrape-red-card-games\red_card_data_2024.csv")
df_2024["year"] = "2024"

df_2023 = pd.read_csv(r"C:\Users\Owner\dev\football-analytics\data-scraping\fbref\scrape-red-card-games\red_card_data_2023.csv")
df_2023["year"] = "2023"

df_2022 = pd.read_csv(r"C:\Users\Owner\dev\football-analytics\data-scraping\fbref\scrape-red-card-games\red_card_data_2022.csv")
df_2022["year"] = "2022"

df = pd.concat([df_2024, df_2023, df_2022])

df = df[df['Team'] != "0"]

np.sort(df["Team"].unique())

rename_dict = {
    "Wolverhampton Wanderers": "Wolves",
    "West Ham United": "West Ham",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham",
    "Nottingham Forest": "Nott'm Forest",
    "Nott'ham Forest": "Nott'm Forest",
    "Brighton & Hove Albion": "Brighton",
    "Newcastle United": "Newcastle Utd",
}

df["Team"] = df["Team"].replace(rename_dict)

np.sort(df["Team"].unique())

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
       'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham',
       'Ipswich Town', 'Leeds United', 'Leicester City', 'Liverpool',
       'Luton Town', 'Manchester City', 'Manchester Utd', 'Newcastle Utd',
       "Nott'm Forest", 'Sheffield United', 'Sheffield Utd',
       'Southampton', 'Tottenham', 'West Ham', 'Wolves'], dtype=object)

In [4]:
df.tail(50)

Unnamed: 0,Minute,Team,xG,PSxG,Outcome,Event Type,match_url,year
674,7,Everton,0.09,0.39,Saved,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
675,9,Crystal Palace,0.08,0.0,Off Target,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
676,23,Crystal Palace,0.02,0.02,Saved,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
677,26,Crystal Palace,0.04,0.0,Off Target,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
678,32,Everton,0.02,0.35,Saved,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
679,32,Everton,0.05,0.0,Blocked,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
680,35,Everton,0.02,0.0,Blocked,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
681,39,Everton,0.02,0.2,Saved,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
682,41,Crystal Palace,0.04,0.0,Blocked,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022
683,41,Everton,0.02,0.05,Saved,Shot,https://fbref.com/en/matches/25f6dcd1/Crystal-...,2022


In [5]:
match_df = df.groupby(["match_url", "Team"]).agg({
    "xG": "sum",
    "PSxG": "sum",
    "Outcome": lambda x: (x == "Goal").sum()
})

match_df.rename(columns={"Outcome": "Goals"}, inplace=True)

match_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,xG,PSxG,Goals
match_url,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://fbref.com/en/matches/0030e686/Tottenham-Hotspur-Leeds-United-November-12-2022-Premier-League,Leeds United,1.1,2.41,3
https://fbref.com/en/matches/0030e686/Tottenham-Hotspur-Leeds-United-November-12-2022-Premier-League,Tottenham,1.83,1.85,4
https://fbref.com/en/matches/007b352e/Bournemouth-Wolverhampton-Wanderers-October-21-2023-Premier-League,Bournemouth,0.62,1.17,1
https://fbref.com/en/matches/007b352e/Bournemouth-Wolverhampton-Wanderers-October-21-2023-Premier-League,Wolves,2.42,1.31,2
https://fbref.com/en/matches/01e63a1f/Bournemouth-Arsenal-October-19-2024-Premier-League,Arsenal,0.71,0.32,0
https://fbref.com/en/matches/01e63a1f/Bournemouth-Arsenal-October-19-2024-Premier-League,Bournemouth,1.83,1.64,2
https://fbref.com/en/matches/03ac4a9c/West-Ham-United-Crystal-Palace-January-18-2025-Premier-League,Crystal Palace,1.32,1.63,2
https://fbref.com/en/matches/03ac4a9c/West-Ham-United-Crystal-Palace-January-18-2025-Premier-League,West Ham,0.28,0.0,0
https://fbref.com/en/matches/03d28c48/Southampton-Leicester-City-October-19-2024-Premier-League,Leicester City,3.23,2.54,3
https://fbref.com/en/matches/03d28c48/Southampton-Leicester-City-October-19-2024-Premier-League,Southampton,2.16,2.62,2


In [6]:
results = []

for match in df['match_url'].unique():
    match_df = df[df['match_url'] == match].sort_values('Minute')
    teams = match_df['Team'].unique()
    year = match_df['year'].iloc[0]  # Get year for this match
    
    # Initialize player counts for both teams
    team_players = {team: 11 for team in teams}
    
    # Get red card events
    red_cards = match_df[match_df['Outcome'].str.contains('Red Card', na=False)].sort_values('Minute')
    
    # Create periods list for this match
    periods = []
    
    # Add initial period (11v11)
    if len(red_cards) > 0:
        first_red = red_cards.iloc[0]
        # Add shots from start until first red
        shots_period = match_df[
            (match_df['Event Type'].str.contains('Shot', na=False)) &
            (match_df['Minute'] < first_red['Minute'])
        ]
        for team in teams:
            team_shots = shots_period[shots_period['Team'] == team]
            opp_team = teams[1] if team == teams[0] else teams[0]
            opp_shots = shots_period[shots_period['Team'] == opp_team]
            
            periods.append({
                'match_url': match,
                'year': year,  # Add year
                'period': 'equal_players',
                'start_minute': 0,
                'end_minute': first_red['Minute'],
                'minutes_in_period': first_red['Minute'],
                'team': team,
                'shots': len(team_shots),
                'xG': team_shots['xG'].sum(),
                'PSxG': team_shots['PSxG'].sum(),
                'opp_xG': opp_shots['xG'].sum(),
                'opp_PSxG': opp_shots['PSxG'].sum()
            })
    
    # Process each period between red cards
    for idx, red in red_cards.iterrows():
        # Update player count for the team that got the red
        team_players[red['Team']] -= 1
        
        # Get next period end (next red card or end of match)
        if idx < len(red_cards) - 1:
            end_minute = red_cards.iloc[idx + 1]['Minute']
        else:
            end_minute = 90  # Assuming 90 minute matches
            
        # Get shots in this period
        shots_period = match_df[
            (match_df['Event Type'].str.contains('Shot', na=False)) &
            (match_df['Minute'] > red['Minute']) &
            (match_df['Minute'] < end_minute)
        ]
        
        # Determine the state for each team
        for team in teams:
            team_shots = shots_period[shots_period['Team'] == team]
            opp_team = teams[1] if team == teams[0] else teams[0]
            opp_shots = shots_period[shots_period['Team'] == opp_team]
            
            # Determine if team has advantage, disadvantage, or equal players
            if team_players[team] > team_players[opp_team]:
                period_state = 'advantage'
            elif team_players[team] < team_players[opp_team]:
                period_state = 'disadvantage'
            else:
                period_state = 'equal_players'
            
            periods.append({
                'match_url': match,
                'year': year,  # Add year
                'period': period_state,
                'start_minute': red['Minute'],
                'end_minute': end_minute,
                'minutes_in_period': end_minute - red['Minute'],
                'team': team,
                'shots': len(team_shots),
                'xG': team_shots['xG'].sum(),
                'PSxG': team_shots['PSxG'].sum(),
                'opp_xG': opp_shots['xG'].sum(),
                'opp_PSxG': opp_shots['PSxG'].sum(),
                'team_players': team_players[team],
                'opp_players': team_players[opp_team],
                'red_card_team': red['Team']
            })
    
    results.extend(periods)

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Calculate summary statistics - now including year in the grouping
summary = (results_df
    .groupby(['team', 'year', 'period'])
    .agg({
        'minutes_in_period': 'sum',
        'shots': 'sum',
        'xG': 'sum',
        'PSxG': 'sum',
        'opp_xG': 'sum',
        'opp_PSxG': 'sum'
    })
    .round(3)
)

# Add xG per minute
summary['xG_per_min'] = (summary['xG'] / summary['minutes_in_period']).round(3)
summary['opp_xG_per_min'] = (summary['opp_xG'] / summary['minutes_in_period']).round(3)

results_df[results_df["year"] == "2024"].tail(50)

Unnamed: 0,match_url,year,period,start_minute,end_minute,minutes_in_period,team,shots,xG,PSxG,opp_xG,opp_PSxG,team_players,opp_players,red_card_team
72,https://fbref.com/en/matches/dd7675a7/Everton-...,2024,equal_players,0,41,41,Everton,10,0.35,0.26,0.98,0.36,,,
73,https://fbref.com/en/matches/dd7675a7/Everton-...,2024,equal_players,0,41,41,Brentford,6,0.98,0.36,0.35,0.26,,,
74,https://fbref.com/en/matches/dd7675a7/Everton-...,2024,advantage,41,90,49,Everton,14,0.72,0.5,0.11,0.02,11.0,10.0,Brentford
75,https://fbref.com/en/matches/dd7675a7/Everton-...,2024,disadvantage,41,90,49,Brentford,3,0.11,0.02,0.72,0.5,10.0,11.0,Brentford
76,https://fbref.com/en/matches/9aaa6ed5/Tottenha...,2024,equal_players,0,83,83,Tottenham,7,0.75,1.05,1.58,1.66,,,
77,https://fbref.com/en/matches/9aaa6ed5/Tottenha...,2024,equal_players,0,83,83,Fulham,14,1.58,1.66,0.75,1.05,,,
78,https://fbref.com/en/matches/9aaa6ed5/Tottenha...,2024,advantage,83,90,7,Tottenham,1,0.02,0.0,0.0,0.0,11.0,10.0,Fulham
79,https://fbref.com/en/matches/9aaa6ed5/Tottenha...,2024,disadvantage,83,90,7,Fulham,0,0.0,0.0,0.02,0.0,10.0,11.0,Fulham
80,https://fbref.com/en/matches/e4480630/Southamp...,2024,equal_players,0,39,39,Southampton,4,1.04,1.29,2.26,3.44,,,
81,https://fbref.com/en/matches/e4480630/Southamp...,2024,equal_players,0,39,39,Chelsea,11,2.26,3.44,1.04,1.29,,,


In [7]:

summary



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,minutes_in_period,shots,xG,PSxG,opp_xG,opp_PSxG,xG_per_min,opp_xG_per_min
team,year,period,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Arsenal,2022,advantage,28,4,0.24,0.07,0.01,0.00,0.009,0.000
Arsenal,2022,equal_players,62,15,2.08,1.81,1.56,1.11,0.034,0.025
Arsenal,2023,advantage,9,0,0.00,0.00,0.11,0.31,0.000,0.012
Arsenal,2023,disadvantage,30,0,0.00,0.00,0.42,0.00,0.000,0.014
Arsenal,2023,equal_players,321,57,10.17,7.20,1.25,1.26,0.032,0.004
...,...,...,...,...,...,...,...,...,...,...
Wolves,2023,disadvantage,113,5,0.97,1.41,5.39,5.82,0.009,0.048
Wolves,2023,equal_players,465,49,6.64,5.96,9.47,8.93,0.014,0.020
Wolves,2024,advantage,90,10,0.40,0.81,0.58,0.75,0.004,0.006
Wolves,2024,disadvantage,0,0,0.00,0.00,0.00,0.00,,


In [8]:
disadvantage_summary = summary.loc[(slice(None), slice(None), 'disadvantage'), :]

disadvantage_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,minutes_in_period,shots,xG,PSxG,opp_xG,opp_PSxG,xG_per_min,opp_xG_per_min
team,year,period,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Arsenal,2023,disadvantage,30,0,0.0,0.0,0.42,0.0,0.0,0.014
Arsenal,2024,disadvantage,193,12,2.22,2.47,4.85,4.1,0.012,0.025
Aston Villa,2022,disadvantage,28,1,0.09,0.0,0.85,0.88,0.003,0.03
Aston Villa,2023,disadvantage,25,3,0.31,0.13,0.16,0.0,0.012,0.006
Aston Villa,2024,disadvantage,58,1,0.19,0.0,2.36,1.87,0.003,0.041
Bournemouth,2023,disadvantage,53,3,0.12,0.42,1.73,1.16,0.002,0.033
Brentford,2022,disadvantage,0,0,0.0,0.0,0.0,0.0,,
Brentford,2023,disadvantage,101,9,1.17,0.67,3.32,3.1,0.012,0.033
Brentford,2024,disadvantage,49,3,0.11,0.02,0.72,0.5,0.002,0.015
Brighton,2023,disadvantage,47,3,0.14,0.72,1.16,1.02,0.003,0.025


In [9]:
import plotly.graph_objects as go
from datetime import datetime


current_date = datetime.now().strftime("%d %b %Y")
plot_df = disadvantage_summary.reset_index()

plot_df = plot_df[plot_df["year"] == "2024"]

plot_df.sort_values('minutes_in_period', inplace=True)

# Create figure
fig = go.Figure()

# Add bars
fig.add_trace(
    go.Bar(
        x=plot_df['minutes_in_period'],
        y=plot_df['team'],
        orientation='h',
        marker_color='#EC325A',  
        text=plot_df['minutes_in_period'].astype(str) + ' min',
        textposition='inside',
    )
)

# Update layout for dark theme
fig.update_layout(
    title={
        'text': 'Minutes Played with a Player Disadvantage<br><span style="font-size: 14px;">Premier League 2024/25</span>',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20, 'weight': 'bold'}
    },
    plot_bgcolor='#1f2937',  # Dark background
    paper_bgcolor='#1f2937',
    font={'color': 'white'},
    height=800,  # Taller to fit all teams
    width=900,
    margin=dict(l=30, r=10, t=80, b=70),
    showlegend=False,
    xaxis=dict(
        showgrid=True,
        gridcolor='rgba(255, 255, 255, 0.1)',
        zeroline=False,
        title='Minutes'
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        ticksuffix=' '
    ),
    annotations=[
        dict(
            text=f"@porterhouse26 • {current_date}",  
            xref="paper",
            yref="paper",
            x=1,
            y=-0.1,
            showarrow=False,
            font=dict(size=12, color='gray'),
            xanchor='right'
        )
    ]
)

# Update bar configurations
fig.update_traces(
    marker_line_color='#1f2937',
    marker_line_width=1,
    opacity=0.8
)

# Save as HTML (interactive)
fig.write_html("disadvantage_minutes_2024.html")


In [10]:
import plotly.graph_objects as go
from datetime import datetime


current_date = datetime.now().strftime("%d %b %Y")
plot_df = disadvantage_summary.reset_index()

plot_df = plot_df[plot_df["year"] == "2024"]

plot_df.sort_values('minutes_in_period', inplace=True)

# Create figure
fig = go.Figure()

# Add bars
fig.add_trace(
    go.Bar(
        x=plot_df['minutes_in_period'],
        y=plot_df['team'],
        orientation='h',
        marker_color='#EC325A',  
        text=plot_df['minutes_in_period'].astype(str) + ' min',
        textposition='inside',
    )
)

# Update layout for dark theme
fig.update_layout(
    title={
        'text': 'Minutes Played with a Player Disadvantage<br><span style="font-size: 14px;">Premier League 2024/25</span>',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20, 'weight': 'bold'}
    },
    plot_bgcolor='#1f2937',  # Dark background
    paper_bgcolor='#1f2937',
    font={'color': 'white'},
    height=800,  # Taller to fit all teams
    width=900,
    margin=dict(l=30, r=10, t=80, b=70),
    showlegend=False,
    xaxis=dict(
        showgrid=True,
        gridcolor='rgba(255, 255, 255, 0.1)',
        zeroline=False,
        title='Minutes'
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        ticksuffix=' '
    ),
    annotations=[
        dict(
            text=f"@porterhouse26 • {current_date}",  
            xref="paper",
            yref="paper",
            x=1,
            y=-0.1,
            showarrow=False,
            font=dict(size=12, color='gray'),
            xanchor='right'
        )
    ]
)

# Update bar configurations
fig.update_traces(
    marker_line_color='#1f2937',
    marker_line_width=1,
    opacity=0.8
)

# Save as HTML (interactive)
fig.write_html("disadvantage_minutes_2024.html")


In [27]:
import plotly.graph_objects as go
from datetime import datetime

current_date = datetime.now().strftime("%d %b %Y")
plot_df = disadvantage_summary.reset_index()

# Calculate total minutes per team for sorting
total_minutes_df = plot_df.groupby('team')['minutes_in_period'].sum().sort_values(ascending=False)
team_order = total_minutes_df.index.tolist()

# Create figure
fig = go.Figure()

# Define colors for different years
colors = {
    '2024': '#e67e22',  # Orange
    '2023': '#2ecc71',  # Green
    '2022': '#3498db'   # Blue
}

# Add bars for each year
for year in sorted(plot_df['year'].unique()):
    year_df = plot_df[plot_df['year'] == year]
    
    # For teams not in this year, we need to add them with 0 minutes
    missing_teams = set(team_order) - set(year_df['team'])
    zeros_df = pd.DataFrame({
        'team': list(missing_teams),
        'year': year,
        'minutes_in_period': 0
    })
    
    year_df = pd.concat([year_df, zeros_df])
    year_df = year_df.set_index('team').reindex(team_order).reset_index()
    
    # Only show text label for the last bar segment that has minutes
    text = None
    if year == max(plot_df['year'].unique()):
        text = total_minutes_df[year_df['team']].round(1).astype(str)
    
    fig.add_trace(
        go.Bar(
            x=year_df['minutes_in_period'],
            y=year_df['team'],
            orientation='h',
            name=str(year),
            marker_color=colors.get(str(year), '#EC325A'),
            text=text,
            textposition='outside',
            showlegend=True
        )
    )

# Update layout for dark theme
fig.update_layout(
    barmode='stack',
    title={
        'text': 'Minutes Played with a Player Disadvantage by Year<br><span style="font-size: 14px;">Premier League</span>',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20, 'weight': 'bold'}
    },
    plot_bgcolor='#1f2937',
    paper_bgcolor='#1f2937',
    font={'color': 'white'},
    height=800,
    width=900,
    margin=dict(l=30, r=25, t=80, b=70),
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.005,
        xanchor="right",
        x=1
    ),
    xaxis=dict(
        showgrid=True,
        gridcolor='rgba(255, 255, 255, 0.1)',
        zeroline=False,
        title='Minutes'
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        ticksuffix=' ',
        autorange='reversed'  # This will maintain the order
    ),
    annotations=[
        dict(
            text=f"@porterhouse26 • {current_date}",
            xref="paper",
            yref="paper",
            x=1.01,
            y=-0.1,
            showarrow=False,
            font=dict(size=12, color='gray'),
            xanchor='right'
        )
    ]
)

# Update bar configurations
fig.update_traces(
    marker_line_color='#1f2937',
    marker_line_width=1,
    opacity=0.8
)

# Save as HTML (interactive)
fig.write_html("disadvantage_minutes_all_years.html")