In [1]:
import pandas as pd
import numpy as np

In [13]:
# This function is used to transform our dataframe from a wide format to a long format
def transform_premier_league_data(df):
    """
    Transform Premier League match data from wide format to team-based format with extended metrics.
    
    Parameters:
    df (pandas.DataFrame): Input dataframe with match data in wide format
    
    Returns:
    pandas.DataFrame: Transformed dataframe with team-centric statistics and ratios
    """
    
    team_data = []
    
    for _, row in df.iterrows():
        # Home team data
        home_stats = {
            'date': row['date'],  # Added date field
            'time': row['time'],  # Added time field
            'round': row['round'],
            'team': row['home_team'],
            'opponent': row['away_team'],
            'team_code': row['home_team_code'],
            'opponent_code': row['away_team_code'],
            'is_home': 1,
            # Basic stats
            'goals': row['home_goals'],
            'conceded_goals': row['away_goals'],
            'possession': row['home_poss'],
            'shots': row['home_shots'],
            'conceded_shots': row['away_shots'],
            'shots_on_goal': row['home_shots_on_goal'],
            'conceded_shots_on_goal': row['away_shots_on_goal'],
            'goalkeeper_saves': row['home_goalkeeper_saves'],
            'blocked_shots': row['home_blocked_shots'],
            'shots_off_goal': row['home_shots_off_goal'],
            'chances': row['home_chances'],
            # Ratios
            'shot_creation_ratio': row['home_shot_creation_ratio'],
            'target_ratio': row['home_target_ratio'],
            'conversion_rate': row['home_conversion_rate'],
            'target_to_goal_ratio': row['home_target_to_goal_ratio']
        }
        
        # Away team data
        away_stats = {
            'date': row['date'],  # Added date field
            'time': row['time'],  # Added time field
            'round': row['round'],
            'team': row['away_team'],
            'opponent': row['home_team'],
            'team_code': row['away_team_code'],
            'opponent_code': row['home_team_code'],
            'is_home': 0,
            # Basic stats
            'goals': row['away_goals'],
            'conceded_goals': row['home_goals'],
            'possession': row['away_poss'],
            'shots': row['away_shots'],
            'conceded_shots': row['home_shots'],
            'shots_on_goal': row['away_shots_on_goal'],
            'conceded_shots_on_goal': row['home_shots_on_goal'],
            'goalkeeper_saves': row['away_goalkeeper_saves'],
            'blocked_shots': row['away_blocked_shots'],
            'shots_off_goal': row['away_shots_off_goal'],
            'chances': row['away_chances'],
            # Ratios
            'shot_creation_ratio': row['away_shot_creation_ratio'],
            'target_ratio': row['away_target_ratio'],
            'conversion_rate': row['away_conversion_rate'],
            'target_to_goal_ratio': row['away_target_to_goal_ratio']
        }
        
        team_data.extend([home_stats, away_stats])
    
    # Create new dataframe
    team_df = pd.DataFrame(team_data)
    
    # Sort by team and round
    team_df = team_df.sort_values(['team', 'round']).reset_index(drop=True)
    
    # Calculate rolling averages for all relevant metrics
    metrics_to_average = [
        # Basic stats
        'goals', 'conceded_goals', 'possession', 'shots', 'conceded_shots',
        'shots_on_goal', 'conceded_shots_on_goal', 'goalkeeper_saves', 'blocked_shots',
        'shots_off_goal', 'chances',
        # Ratios
        'shot_creation_ratio', 'target_ratio', 'conversion_rate',
        'target_to_goal_ratio'
    ]
    
    # Add rolling averages with suffix _rolling_avg
    for metric in metrics_to_average:
        team_df[f'{metric}_rolling_avg'] = team_df.groupby('team')[metric].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        )
    
    # Reorder columns for better organization
    column_order = [
        # Identifiers
        'date', 'time','round', 'team', 'opponent', 'team_code', 'opponent_code', 'is_home',
        # Basic stats
        'goals', 'conceded_goals', 'possession', 'shots', 'conceded_shots',
        'shots_on_goal', 'conceded_shots_on_goal', 'goalkeeper_saves', 'blocked_shots',
        'shots_off_goal', 'chances',
        # Ratios
        'shot_creation_ratio', 'target_ratio', 'conversion_rate',
        'target_to_goal_ratio'
    ]
    
    # Add rolling averages to column order
    column_order.extend([f'{metric}_rolling_avg' for metric in metrics_to_average])
    
    # Reorder columns
    team_df = team_df[column_order]
    
    return team_df


In [14]:
def assign_consistent_team_codes(df):
    ''' 
        The function assumes that the dataframe has a home_team_code column which has been assigned using pandas like above
     '''
    # Create a dictionary to store team names and their codes
    team_codes = {}

    # Assign codes for home teams
    for index, row in df.iterrows():
        if row['home_team'] not in team_codes:
            team_codes[row['home_team']] = row['home_team_code']
    
    # Assign consistent codes for both home and away teams
    df['home_team_code'] = df['home_team'].map(team_codes)
    df['away_team_code'] = df['away_team'].map(team_codes)

    # Verify the results
    print("Team Codes:")
    for team, code in sorted(team_codes.items(), key=lambda x: x[1]):
        print(f"{team}: {code}")


    # Check for any NaN values in team codes (which would indicate a missing team)
    if df['home_team_code'].isnull().any() or df['away_team_code'].isnull().any():
        print("\nWarning: Some teams don't have assigned codes. Please check your data.")
    else:
        print("\nAll teams have been assigned consistent codes.")

    # Verify if the number of unique teams matches the number of unique codes
    num_teams = len(set(df['home_team']) | set(df['away_team']))
    num_codes = len(set(df['home_team_code']) | set(df['away_team_code']))
    
    # print(f"\nNumber of unique teams: {num_teams}")
    # print(f"Number of unique codes: {num_codes}")
    
    # if num_teams == num_codes:
        # print("Verification successful: Number of unique teams matches number of unique codes.")
    # else:
        # print("Verification failed: Number of unique teams does not match number of unique codes.")

    return df, team_codes

In [15]:
pl_data = pd.DataFrame({})

years = ['2023_2024', '2022_2023', '2021_2022', '2018_2019', '2017_2018', '2016_2017', '2015_2016', '2014_2015', '2013_2014']



for year in years:
    test_df = pd.read_json(f'../season_data/game_data_premier_league_{year}.json')
    pl_data = pd.concat([pl_data, test_df], ignore_index=True)

# print(myDf.shape)

pl_data['home_team_code'] = pl_data['home_team'].astype('category').cat.codes

assign_consistent_team_codes(pl_data)

print(pl_data.shape)

# myDf.head()

null_values = pl_data.isnull().sum()

# print(null_values)

# print(pl_data.columns)
# print(pl_data.shape)
print(pl_data.head())

Team Codes:
Arsenal: 0
Aston Villa: 1
Bournemouth: 2
Brentford: 3
Brighton: 4
Burnley: 5
Cardiff: 6
Chelsea: 7
Crystal Palace: 8
Everton: 9
Fulham: 10
Huddersfield: 11
Hull: 12
Leeds: 13
Leicester: 14
Liverpool: 15
Luton: 16
Manchester City: 17
Manchester Utd: 18
Middlesbrough: 19
Newcastle: 20
Norwich: 21
Nottingham: 22
QPR: 23
Sheffield Utd: 24
Southampton: 25
Stoke: 26
Sunderland: 27
Swansea: 28
Tottenham: 29
Watford: 30
West Brom: 31
West Ham: 32
Wolves: 33

All teams have been assigned consistent codes.
(3420, 54)
             comp     round  home_team       away_team  home_goals  \
0  PREMIER LEAGUE  ROUND 38    Arsenal         Everton           2   
1  PREMIER LEAGUE  ROUND 38  Brentford       Newcastle           2   
2  PREMIER LEAGUE  ROUND 38   Brighton  Manchester Utd           0   
3  PREMIER LEAGUE  ROUND 38    Burnley      Nottingham           1   
4  PREMIER LEAGUE  ROUND 38    Chelsea     Bournemouth           2   

   away_goals       date   time  home_expected_goals_(

In [16]:
# I need to first get the relevant attributes

relevant_columns =  ['round','date','time', 'home_team', 'away_team', 'home_goals', 'away_goals',
       'home_ball_possession', 'away_ball_possession', 'home_goal_attempts', 'away_goal_attempts', 'home_shots_on_goal', 'away_shots_on_goal',
        'home_corner_kicks', 'away_corner_kicks', 'home_goalkeeper_saves', 'away_goalkeeper_saves', 'home_attacks', 'away_attacks',
       'home_dangerous_attacks', 'away_dangerous_attacks', 'home_blocked_shots', 'away_blocked_shots', 'home_team_code', 'away_team_code']

# Next, I will only include the columns that are in the original dataframe

present_columns = [column for column in relevant_columns if column in pl_data.columns]

pl_data = pl_data[present_columns]

pl_data = pl_data.rename(columns={'home_goal_attempts': 'home_shots', 'away_goal_attempts': 'away_shots', 'home_ball_possession': 'home_poss', 'away_ball_possession': 'away_poss'})
print(pl_data.shape)

# This is where I fill out the nan values for each nan row where applicable
pl_data['home_blocked_shots'] = pl_data['home_blocked_shots'].fillna(0)
pl_data['away_blocked_shots'] = pl_data['away_blocked_shots'].fillna(0)
pl_data['home_attacks'] = pl_data['home_attacks'].fillna(0)
pl_data['away_attacks'] = pl_data['away_attacks'].fillna(0)
pl_data['home_dangerous_attacks'] = pl_data['home_dangerous_attacks'].fillna(0)
pl_data['away_dangerous_attacks'] = pl_data['away_dangerous_attacks'].fillna(0)

(3420, 25)


In [17]:
pl_data.isnull().sum()

# After filling out the appropriate entries with nan values, I will drop the remaining entries with nan.
print("Before dropping: ", pl_data.shape)
pl_data = pl_data.dropna()
print("After dropping: ", pl_data.shape)

pl_data.columns

Before dropping:  (3420, 25)
After dropping:  (3391, 25)


Index(['round', 'date', 'time', 'home_team', 'away_team', 'home_goals',
       'away_goals', 'home_poss', 'away_poss', 'home_shots', 'away_shots',
       'home_shots_on_goal', 'away_shots_on_goal', 'home_corner_kicks',
       'away_corner_kicks', 'home_goalkeeper_saves', 'away_goalkeeper_saves',
       'home_attacks', 'away_attacks', 'home_dangerous_attacks',
       'away_dangerous_attacks', 'home_blocked_shots', 'away_blocked_shots',
       'home_team_code', 'away_team_code'],
      dtype='object')

In [18]:
# Next, I want to derive some relevant fields
# shots_off_goal: home_shots - blocked_shots - home_shots_on_goal
pl_data['home_shots_off_goal'] = pl_data['home_shots'] - pl_data['home_shots_on_goal'] - pl_data['home_blocked_shots']
pl_data['away_shots_off_goal'] = pl_data['away_shots'] - pl_data['away_shots_on_goal'] - pl_data['away_blocked_shots']
# chances: corners + attacks
pl_data['home_chances'] = pl_data['home_corner_kicks'] + pl_data['home_attacks']
pl_data['away_chances'] = pl_data['away_corner_kicks'] + pl_data['away_attacks']

# After deriving relevant fields, I need to ensure each attribute is a numeric datatype
# Cleaning the data fields and inputs
# First, the round column should be an int. 
pl_data['round'] = pl_data['round'].apply(lambda x: int(x.split()[1]) if isinstance(x, str) else x)
# Next, the possession stats should be normalized values between 0 - 1
pl_data['home_poss'] = pl_data['home_poss'].apply(lambda x: float(x.strip('%'))/100 if isinstance(x, str) else x)
pl_data['away_poss'] = pl_data['away_poss'].apply(lambda x: float(x.strip('%'))/100 if isinstance(x, str) else x)

In [19]:
# Deriving special metrics:

# Replace 0 with 1 in all denominator columns to avoid division by zero
pl_data['home_chances'] = pl_data['home_chances'].replace(0, 1)
pl_data['away_chances'] = pl_data['away_chances'].replace(0, 1)
pl_data['home_shots'] = pl_data['home_shots'].replace(0, 1)
pl_data['away_shots'] = pl_data['away_shots'].replace(0, 1)
pl_data['home_shots_on_goal'] = pl_data['home_shots_on_goal'].replace(0, 1)
pl_data['away_shots_on_goal'] = pl_data['away_shots_on_goal'].replace(0, 1)

# Shot creation ratio. Tells how much shots a team has from their chances created
pl_data['home_shot_creation_ratio'] = pl_data['home_shots'] / pl_data['home_chances']
pl_data['away_shot_creation_ratio'] = pl_data['away_shots'] / pl_data['away_chances']

# Target rate. Gives a good measure of how accurate a team's attempts are
pl_data['home_target_ratio'] = pl_data['home_shots_on_goal'] / pl_data['home_shots']
pl_data['away_target_ratio'] = pl_data['away_shots_on_goal'] / pl_data['away_shots']

# Conversion rate. Gives a good measure of goals to shots taken
pl_data['home_conversion_rate'] = pl_data['home_goals'] / pl_data['home_shots']
pl_data['away_conversion_rate'] = pl_data['away_goals'] / pl_data['away_shots']

# Target to goal ratio. Gives a measure of goals scored to shots on target
pl_data['home_target_to_goal_ratio'] = pl_data['home_goals'] / pl_data['home_shots_on_goal']
pl_data['away_target_to_goal_ratio'] = pl_data['away_goals'] / pl_data['away_shots_on_goal']


In [20]:
# Next, ensure each field's data type is suitable
pl_data.info()
print(pl_data.shape)
pl_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 3391 entries, 0 to 3419
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   round                      3391 non-null   int64         
 1   date                       3391 non-null   datetime64[ns]
 2   time                       3391 non-null   object        
 3   home_team                  3391 non-null   object        
 4   away_team                  3391 non-null   object        
 5   home_goals                 3391 non-null   int64         
 6   away_goals                 3391 non-null   int64         
 7   home_poss                  3391 non-null   float64       
 8   away_poss                  3391 non-null   float64       
 9   home_shots                 3391 non-null   float64       
 10  away_shots                 3391 non-null   float64       
 11  home_shots_on_goal         3391 non-null   float64       
 12  away_shots_

round                        0
date                         0
time                         0
home_team                    0
away_team                    0
home_goals                   0
away_goals                   0
home_poss                    0
away_poss                    0
home_shots                   0
away_shots                   0
home_shots_on_goal           0
away_shots_on_goal           0
home_corner_kicks            0
away_corner_kicks            0
home_goalkeeper_saves        0
away_goalkeeper_saves        0
home_attacks                 0
away_attacks                 0
home_dangerous_attacks       0
away_dangerous_attacks       0
home_blocked_shots           0
away_blocked_shots           0
home_team_code               0
away_team_code               0
home_shots_off_goal          0
away_shots_off_goal          0
home_chances                 0
away_chances                 0
home_shot_creation_ratio     0
away_shot_creation_ratio     0
home_target_ratio            0
away_tar

In [21]:
pl_team_data = transform_premier_league_data(pl_data)
print(pl_team_data.isnull().sum())
print(pl_team_data.shape)
pl_team_data.to_json("../processed_data/pl_data_dt.json", orient="records")
pl_team_data.head()

date                                  0
time                                  0
round                                 0
team                                  0
opponent                              0
team_code                             0
opponent_code                         0
is_home                               0
goals                                 0
conceded_goals                        0
possession                            0
shots                                 0
conceded_shots                        0
shots_on_goal                         0
conceded_shots_on_goal                0
goalkeeper_saves                      0
blocked_shots                         0
shots_off_goal                        0
chances                               0
shot_creation_ratio                   0
target_ratio                          0
conversion_rate                       0
target_to_goal_ratio                  0
goals_rolling_avg                     0
conceded_goals_rolling_avg            0


Unnamed: 0,date,time,round,team,opponent,team_code,opponent_code,is_home,goals,conceded_goals,...,shots_on_goal_rolling_avg,conceded_shots_on_goal_rolling_avg,goalkeeper_saves_rolling_avg,blocked_shots_rolling_avg,shots_off_goal_rolling_avg,chances_rolling_avg,shot_creation_ratio_rolling_avg,target_ratio_rolling_avg,conversion_rate_rolling_avg,target_to_goal_ratio_rolling_avg
0,2023-12-08,07:30,1,Arsenal,Nottingham,0,22,1,2,1,...,7.0,2.0,1.0,0.0,8.0,138.0,0.108696,0.466667,0.133333,0.285714
1,2022-05-08,15:00,1,Arsenal,Crystal Palace,0,8,0,2,0,...,4.5,2.0,1.5,2.0,6.0,114.5,0.109293,0.333333,0.166667,0.642857
2,2021-08-13,15:00,1,Arsenal,Brentford,0,3,0,0,2,...,4.333333,2.333333,1.333333,2.666667,8.666667,123.666667,0.124505,0.282828,0.111111,0.428571
3,2018-12-08,11:00,1,Arsenal,Manchester City,0,17,1,0,2,...,4.0,3.75,2.5,2.5,7.5,117.75,0.115879,0.295455,0.083333,0.321429
4,2017-11-08,14:45,1,Arsenal,Leicester,0,14,1,4,3,...,5.2,3.6,2.0,3.8,7.6,96.0,0.692703,0.310438,0.096296,0.337143


In [45]:
# Creating team specific files to show historic performances

import numpy as np
import json
import os

def convert_to_json_serializable(obj):
    """Convert numpy types to native Python types"""
    if isinstance(obj, (np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    elif isinstance(obj, (np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, dict):
        return {key: convert_to_json_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [convert_to_json_serializable(item) for item in obj]
    return obj

# First transform the data as before
pl_team_data = transform_premier_league_data(pl_data)
print(pl_team_data.isnull().sum())
print(pl_team_data.shape)

# Create base directory for team data
base_dir = '../processed_data/premier_league'
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Group data by team
for team in pl_team_data['team'].unique():
    # Create team directory
    team_dir = f"{base_dir}/{team.lower().replace(' ', '_')}"
    if not os.path.exists(team_dir):
        os.makedirs(team_dir)
    
    # Get this team's data
    team_season_data = pl_team_data[pl_team_data['team'] == team].copy()
    
    # Process each season
    for year in ['2023_2024', '2022_2023', '2021_2022', '2018_2019', 
                 '2017_2018', '2016_2017', '2015_2016', '2014_2015', '2013_2014']:
        # Get data for this season
        season_data = {
            "team": team,
            "team_code": int(team_season_data['team_code'].iloc[0]),  # Convert to int
            "season": year,
            "rounds": {}
        }
        
        # Add round-by-round data
        season_rows = team_season_data[team_season_data['round'] <= 38]
        for _, row in season_rows.iterrows():
            round_data = {
                "opponent": row['opponent'],
                "opponent_code": int(row['opponent_code']),  # Convert to int
                "is_home": int(row['is_home']),
                # Basic stats
                "goals": int(row['goals']),
                "conceded_goals": int(row['conceded_goals']),
                "possession": float(row['possession']),
                "shots": float(row['shots']),
                "conceded_shots": float(row['conceded_shots']),
                "shots_on_goal": float(row['shots_on_goal']),
                "conceded_shots_on_goal": float(row['conceded_shots_on_goal']),
                "goalkeeper_saves": float(row['goalkeeper_saves']),
                "blocked_shots": float(row['blocked_shots']),
                "shots_off_goal": float(row['shots_off_goal']),
                "chances": float(row['chances']),
                # Ratios
                "shot_creation_ratio": float(row['shot_creation_ratio']),
                "target_ratio": float(row['target_ratio']),
                "conversion_rate": float(row['conversion_rate']),
                "target_to_goal_ratio": float(row['target_to_goal_ratio']),
                # Rolling averages
                "goals_rolling_avg": float(row['goals_rolling_avg']),
                "conceded_goals_rolling_avg": float(row['conceded_goals_rolling_avg']),
                "possession_rolling_avg": float(row['possession_rolling_avg']),
                "shots_rolling_avg": float(row['shots_rolling_avg']),
                "conceded_shots_rolling_avg": float(row['conceded_shots_rolling_avg']),
                "shots_on_goal_rolling_avg": float(row['shots_on_goal_rolling_avg']),
                "conceded_shots_on_goal_rolling_avg": float(row['conceded_shots_on_goal_rolling_avg']),
                "goalkeeper_saves_rolling_avg": float(row['goalkeeper_saves_rolling_avg']),
                "blocked_shots_rolling_avg": float(row['blocked_shots_rolling_avg']),
                "shots_off_goal_rolling_avg": float(row['shots_off_goal_rolling_avg']),
                "chances_rolling_avg": float(row['chances_rolling_avg']),
                "shot_creation_ratio_rolling_avg": float(row['shot_creation_ratio_rolling_avg']),
                "target_ratio_rolling_avg": float(row['target_ratio_rolling_avg']),
                "conversion_rate_rolling_avg": float(row['conversion_rate_rolling_avg']),
                "target_to_goal_ratio_rolling_avg": float(row['target_to_goal_ratio_rolling_avg'])
            }
            
            season_data["rounds"][str(int(row['round']))] = convert_to_json_serializable(round_data)
        
        # Save season data to file
        season_file = f"{team_dir}/{year}.json"
        with open(season_file, 'w') as f:
            json.dump(season_data, f, indent=4)

print("Finished saving team season files")
pl_team_data.head()

round                                 0
team                                  0
opponent                              0
team_code                             0
opponent_code                         0
is_home                               0
goals                                 0
conceded_goals                        0
possession                            0
shots                                 0
conceded_shots                        0
shots_on_goal                         0
conceded_shots_on_goal                0
goalkeeper_saves                      0
blocked_shots                         0
shots_off_goal                        0
chances                               0
shot_creation_ratio                   0
target_ratio                          0
conversion_rate                       0
target_to_goal_ratio                  0
goals_rolling_avg                     0
conceded_goals_rolling_avg            0
possession_rolling_avg                0
shots_rolling_avg                     0


Unnamed: 0,round,team,opponent,team_code,opponent_code,is_home,goals,conceded_goals,possession,shots,...,shots_on_goal_rolling_avg,conceded_shots_on_goal_rolling_avg,goalkeeper_saves_rolling_avg,blocked_shots_rolling_avg,shots_off_goal_rolling_avg,chances_rolling_avg,shot_creation_ratio_rolling_avg,target_ratio_rolling_avg,conversion_rate_rolling_avg,target_to_goal_ratio_rolling_avg
0,1,Arsenal,Brentford,0,2,0,0,2,0.65,22,...,4.0,3.0,1.0,4.0,14.0,142.0,0.15493,0.181818,0.0,0.0
1,2,Arsenal,Chelsea,0,5,1,0,2,0.35,6,...,3.5,4.0,2.0,2.5,8.0,120.5,0.107768,0.340909,0.0,0.0
2,3,Arsenal,Manchester City,0,11,0,0,5,0.19,1,...,2.666667,6.0,3.0,1.666667,5.666667,97.0,0.078512,0.560606,0.0,0.0
3,4,Arsenal,Norwich,0,14,1,1,0,0.52,30,...,3.75,4.75,2.5,3.75,7.5,102.25,0.122443,0.478788,0.008333,0.035714
4,5,Arsenal,Burnley,0,4,0,1,0,0.55,13,...,3.6,4.4,2.6,4.2,6.8,102.0,0.123697,0.429184,0.022051,0.095238


In [147]:
#Next, I only want specific data fields/columns 

# For context, these are all the columns in this dataset: 
'''['comp', 'round', 'home_team', 'away_team', 'home_goals', 'away_goals',
       'date', 'time', 'home_expected_goals_(xg)', 'away_expected_goals_(xg)',
       'home_ball_possession', 'away_ball_possession', 'home_goal_attempts',
       'away_goal_attempts', 'home_shots_on_goal', 'away_shots_on_goal',
       'home_shots_off_goal', 'away_shots_off_goal', 'home_free_kicks',
       'away_free_kicks', 'home_corner_kicks', 'away_corner_kicks',
       'home_offsides', 'away_offsides', 'home_throw-ins', 'away_throw-ins',
       'home_goalkeeper_saves', 'away_goalkeeper_saves', 'home_fouls',
       'away_fouls', 'home_yellow_cards', 'away_yellow_cards',
       'home_total_passes', 'away_total_passes', 'home_tackles',
       'away_tackles', 'home_attacks', 'away_attacks',
       'home_dangerous_attacks', 'away_dangerous_attacks',
       'home_clearances_completed', 'away_clearances_completed',
       'home_blocked_shots', 'away_blocked_shots', 'home_red_cards',
       'away_red_cards']'''

# These are the relevant columns. These columns will give a good measure of a team's attacking and defensive proficiency:

'''[round, home_team, away_team, home_goals, away_goals, home_poss, away_poss,
     home_shots, away_shots, home_shots_on_target, away_shots_on_target, home_shots_off_target, away_shots_off_target,
     home_corner_kicks, away_corner_kicks, home_goalkeeper_saves, away_goalkeeper_saves, home_attacks, away_attacks, 
     home_dangerous_attacks, away_dangerous_attacks, home_blocked_shots, away_blocked_shots
     ]'''

'[round, home_team, away_team, home_goals, away_goals, home_poss, away_poss,\n     home_shots, away_shots, home_shots_on_target, away_shots_on_target, home_shots_off_target, away_shots_off_target,\n     home_corner_kicks, away_corner_kicks, home_goalkeeper_saves, away_goalkeeper_saves, home_attacks, away_attacks, \n     home_dangerous_attacks, away_dangerous_attacks, home_blocked_shots, away_blocked_shots\n     ]'

In [None]:
# # Deriving special metrics:

# import numpy as np

# # Shot creation ratio
# pl_data['home_shot_creation_ratio'] = np.where(
#     pl_data['home_chances'] != 0, 
#     pl_data['home_shots'] / pl_data['home_chances'], 
#     0
# )
# pl_data['away_shot_creation_ratio'] = np.where(
#     pl_data['away_chances'] != 0, 
#     pl_data['away_shots'] / pl_data['away_chances'], 
#     0
# )

# # Target rate
# pl_data['home_target_ratio'] = np.where(
#     pl_data['home_shots'] != 0, 
#     pl_data['home_shots_on_goal'] / pl_data['home_shots'], 
#     0
# )
# pl_data['away_target_ratio'] = np.where(
#     pl_data['away_shots'] != 0, 
#     pl_data['away_shots_on_goal'] / pl_data['away_shots'], 
#     0
# )

# # Conversion rate
# pl_data['home_conversion_rate'] = np.where(
#     pl_data['home_shots'] != 0, 
#     pl_data['home_goals'] / pl_data['home_shots'], 
#     0
# )
# pl_data['away_conversion_rate'] = np.where(
#     pl_data['away_shots'] != 0, 
#     pl_data['away_goals'] / pl_data['away_shots'], 
#     0
# )

# # Target to goal ratio
# pl_data['home_target_to_goal_ratio'] = np.where(
#     pl_data['home_shots_on_goal'] != 0, 
#     pl_data['home_goals'] / pl_data['home_shots_on_goal'], 
#     0
# )
# pl_data['away_target_to_goal_ratio'] = np.where(
#     pl_data['away_shots_on_goal'] != 0, 
#     pl_data['away_goals'] / pl_data['away_shots_on_goal'], 
#     0
# )

# # Add flags for zero values in critical ratios if you choose not to drop them
# pl_data['has_zero_ratios'] = (
#     (pl_data['home_shot_creation_ratio'] == 0) |
#     (pl_data['away_shot_creation_ratio'] == 0) |
#     (pl_data['home_target_ratio'] == 0) |
#     (pl_data['away_target_ratio'] == 0) |
#     (pl_data['home_conversion_rate'] == 0) |
#     (pl_data['away_conversion_rate'] == 0)
# )

# # Filter out rows with zero ratios
# pl_data = pl_data[pl_data['has_zero_ratios'] == False].copy()


In [46]:
# Create wide data frames to capture each team's historic data.

def load_team_season_data(team, season):
    """Load a specific team's season data"""
    file_path = f"../processed_data/premier_league/{team.lower().replace(' ', '_')}/{season}.json"
    with open(file_path, 'r') as f:
        return json.load(f)

def create_matchup_data(season_df, season):
    """
    Create wide format matchup data using historical stats from team files.
    Each row represents a match with both teams' historical performance.
    """
    matchups = []
    
    for _, match in season_df.iterrows():
        round_num = int(match['round'])
        home_team = match['home_team']
        away_team = match['away_team']
        
        # Skip first round since there's no historical data
        if round_num > 1:
            # Load teams' season data
            home_season = load_team_season_data(home_team, season)
            away_season = load_team_season_data(away_team, season)
            
            # Get previous round's stats (historical data before this match)
            prev_round = str(round_num - 1)
            if prev_round in home_season['rounds'] and prev_round in away_season['rounds']:
                home_history = home_season['rounds'][prev_round]
                away_history = away_season['rounds'][prev_round]
                
                matchup_data = {
                    'season': season,
                    'round': round_num,
                    'home_team': home_team,
                    'away_team': away_team,
                    'home_team_code': match['home_team_code'],
                    'away_team_code': match['away_team_code'],
                    
                    # Match result (for training)
                    'actual_home_goals': match['home_goals'],
                    'actual_away_goals': match['away_goals'],
                    'actual_result': (1 if match['home_goals'] > match['away_goals'] 
                                    else 2 if match['home_goals'] < match['away_goals'] 
                                    else 0),
                    
                    # Home team's historical stats
                    'home_goals_avg': home_history['goals_rolling_avg'],
                    'home_conceded_avg': home_history['conceded_goals_rolling_avg'],
                    'home_possession_avg': home_history['possession_rolling_avg'],
                    'home_shots_avg': home_history['shots_rolling_avg'],
                    'home_shots_conceded_avg': home_history['conceded_shots_rolling_avg'],
                    'home_shots_on_target_avg': home_history['shots_on_goal_rolling_avg'],
                    'home_shots_on_target_conceded_avg': home_history['conceded_shots_on_goal_rolling_avg'],
                    'home_goalkeeper_saves_avg': home_history['goalkeeper_saves_rolling_avg'],
                    'home_shot_creation_ratio_avg': home_history['shot_creation_ratio_rolling_avg'],
                    'home_target_ratio_avg': home_history['target_ratio_rolling_avg'],
                    'home_conversion_rate_avg': home_history['conversion_rate_rolling_avg'],
                    'home_target_to_goal_ratio_avg': home_history['target_to_goal_ratio_rolling_avg'],
                    
                    # Away team's historical stats
                    'away_goals_avg': away_history['goals_rolling_avg'],
                    'away_conceded_avg': away_history['conceded_goals_rolling_avg'],
                    'away_possession_avg': away_history['possession_rolling_avg'],
                    'away_shots_avg': away_history['shots_rolling_avg'],
                    'away_shots_conceded_avg': away_history['conceded_shots_rolling_avg'],
                    'away_shots_on_target_avg': away_history['shots_on_goal_rolling_avg'],
                    'away_shots_on_target_conceded_avg': away_history['conceded_shots_on_goal_rolling_avg'],
                    'away_goalkeeper_saves_avg': away_history['goalkeeper_saves_rolling_avg'],
                    'away_shot_creation_ratio_avg': away_history['shot_creation_ratio_rolling_avg'],
                    'away_target_ratio_avg': away_history['target_ratio_rolling_avg'],
                    'away_conversion_rate_avg': away_history['conversion_rate_rolling_avg'],
                    'away_target_to_goal_ratio_avg': away_history['target_to_goal_ratio_rolling_avg']
                }
                
                # Add comparative metrics
                matchup_data.update({
                    'goal_difference_avg': matchup_data['home_goals_avg'] - matchup_data['away_goals_avg'],
                    'conceded_difference_avg': matchup_data['home_conceded_avg'] - matchup_data['away_conceded_avg'],
                    'possession_difference_avg': matchup_data['home_possession_avg'] - matchup_data['away_possession_avg'],
                    'shots_difference_avg': matchup_data['home_shots_avg'] - matchup_data['away_shots_avg'],
                    'shot_conversion_difference_avg': matchup_data['home_conversion_rate_avg'] - matchup_data['away_conversion_rate_avg']
                })
                
                matchups.append(matchup_data)
    
    return pd.DataFrame(matchups)

In [47]:
# Process a single season
season = '2021_2022'
season_df = pd.read_json(f'../season_data/game_data_premier_league_{season}.json')

# Create matchup data
matchup_df = create_matchup_data(pl_data, season)

# Save matchup data
matchup_df.to_csv(f'../processed_data/premier_league/matchups_{season}.csv', index=False)

print(f"Created matchup data with shape: {matchup_df.shape}")
print("\nFeatures available for modeling:")
feature_cols = [col for col in matchup_df.columns if '_avg' in col]
print(feature_cols)

Created matchup data with shape: (370, 38)

Features available for modeling:
['home_goals_avg', 'home_conceded_avg', 'home_possession_avg', 'home_shots_avg', 'home_shots_conceded_avg', 'home_shots_on_target_avg', 'home_shots_on_target_conceded_avg', 'home_goalkeeper_saves_avg', 'home_shot_creation_ratio_avg', 'home_target_ratio_avg', 'home_conversion_rate_avg', 'home_target_to_goal_ratio_avg', 'away_goals_avg', 'away_conceded_avg', 'away_possession_avg', 'away_shots_avg', 'away_shots_conceded_avg', 'away_shots_on_target_avg', 'away_shots_on_target_conceded_avg', 'away_goalkeeper_saves_avg', 'away_shot_creation_ratio_avg', 'away_target_ratio_avg', 'away_conversion_rate_avg', 'away_target_to_goal_ratio_avg', 'goal_difference_avg', 'conceded_difference_avg', 'possession_difference_avg', 'shots_difference_avg', 'shot_conversion_difference_avg']
