# Import Packages

In [23]:
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from collections import Counter
import os

# Load Match Data

In [4]:
# Load the CSV file into a DataFrame
matches_csv_path = "Data/matches_1.csv"
matches_df = pd.read_csv(matches_csv_path, low_memory=False)

# Display the DataFrame
matches_df.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,away_player_11_penalties,away_player_11_marking,away_player_11_standing_tackle,away_player_11_sliding_tackle,away_player_11_gk_diving,away_player_11_gk_handling,away_player_11_gk_kicking,away_player_11_gk_positioning,away_player_11_gk_reflexes,match_result
0,1729,1729,1729,2008/2009,1,2008-08-17,489042,10260,10261,1,...,70.0,26.0,30.0,20.0,12.0,22.0,48.0,22.0,22.0,Draw
1,1730,1729,1729,2008/2009,1,2008-08-16,489043,9825,8659,1,...,57.0,22.0,31.0,20.0,10.0,22.0,35.0,22.0,22.0,Home Win
2,1731,1729,1729,2008/2009,1,2008-08-16,489044,8472,8650,0,...,89.0,21.0,20.0,29.0,10.0,20.0,51.0,20.0,20.0,Away Win
3,1732,1729,1729,2008/2009,1,2008-08-16,489045,8654,8528,2,...,65.0,60.0,45.0,52.0,9.0,21.0,47.0,21.0,21.0,Home Win
4,1733,1729,1729,2008/2009,1,2008-08-17,489046,10252,8456,4,...,53.0,43.0,50.0,46.0,1.0,21.0,51.0,21.0,21.0,Home Win


# Feature Engineering

## Get the last match results for home and away teams

In [7]:
# Sort values by date
matches_df.sort_values('date', inplace=True)

# Initialize columns for last match results
matches_df['home_team_last_match_result'] = None
matches_df['away_team_last_match_result'] = None
current_season = None

# Process each match row by row
for idx, row in matches_df.iterrows():
    home_team = row['home_team_api_id']
    away_team = row['away_team_api_id']
    result = row['match_result']
    season = row['season']

    # Check if season has changed
    if season != current_season:
        # Reset tracking for a new season
        current_season = season
        team_results = {}  # Clear previous season's results

    # Initialize the results list if the team is not in the dictionary (new team in this season)
    if home_team not in team_results:
        team_results[home_team] = []
    if away_team not in team_results:
        team_results[away_team] = []

    # Set the last match result for home team and away team
    # If no prior games, I keep it as 0
    matches_df.at[idx, 'home_team_last_match_result'] = (
        team_results[home_team][-1] if team_results[home_team] else 0)
    matches_df.at[idx, 'away_team_last_match_result'] = (
        team_results[away_team][-1] if team_results[away_team] else 0)

    # Update the team results with the current match result for future matches in the season
    if result == 'Home Win':
        team_results[home_team].append("Won")
        team_results[away_team].append("Lost")
    elif result == 'Away Win':
        team_results[home_team].append("Lost")
        team_results[away_team].append("Won")
    else:  # Draw
        team_results[home_team].append("Draw")
        team_results[away_team].append("Draw")

# Print the updated DataFrame to check the results
matches_df[['date', 'home_team_api_id', 'away_team_api_id', 
            'home_team_last_match_result', 'away_team_last_match_result']]

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_team_last_match_result,away_team_last_match_result
3049,2008-08-09,9873,9853,0,0
3048,2008-08-09,9874,9855,0,0
3047,2008-08-09,9851,8592,0,0
3046,2008-08-09,8481,8639,0,0
3045,2008-08-09,9829,9847,0,0
...,...,...,...,...,...
2979,2016-05-15,8659,8650,Draw,Draw
11478,2016-05-15,8543,8535,Won,Draw
16576,2016-05-15,8370,8581,Lost,Won
16569,2016-05-15,9869,10205,Draw,Lost


## Get the last 3 games win/loss/draw

In [9]:
# Initialize columns to store recent performance metrics for home and away teams
matches_df['home_team_last_3_wins'] = 0
matches_df['home_team_last_3_losses'] = 0
matches_df['home_team_last_3_draws'] = 0
matches_df['away_team_last_3_wins'] = 0
matches_df['away_team_last_3_losses'] = 0
matches_df['away_team_last_3_draws'] = 0

# Process each match row by row
for idx, row in matches_df.iterrows():
    home_team = row['home_team_api_id']
    away_team = row['away_team_api_id']
    result = row['match_result']
    season = row['season']

    # Check if season has changed
    if season != current_season:
        # Reset tracking for a new season
        current_season = season
        team_results = {}  # Clear previous season's results

    # Initialize the results list if the team is not in the dictionary (new team in this season)
    if home_team not in team_results:
        team_results[home_team] = []
    if away_team not in team_results:
        team_results[away_team] = []

    
    # Get last 3 results for home and away teams
    if len(team_results[home_team]) >= 3:
        last_3_home_results = team_results[home_team][-3:]  # Last 3 games in the current season
        last_3_away_results = team_results[away_team][-3:]  # Last 3 games in the current season
    else:
        last_3_home_results = team_results[home_team]
        last_3_away_results = team_results[away_team]
    # Only calculate if there are at least 3 results, otherwise set to None
    if len(last_3_home_results) == 3:
        matches_df.at[idx, 'home_team_last_3_wins'] = last_3_home_results.count('Won') if last_3_home_results.count('Won') > 0 else 0
        matches_df.at[idx, 'home_team_last_3_losses'] = last_3_home_results.count('Lost') if last_3_home_results.count('Lost') > 0 else 0
        matches_df.at[idx, 'home_team_last_3_draws'] = last_3_home_results.count('Draw') if last_3_home_results.count('Draw') > 0 else 0
    elif len(last_3_home_results) < 3 and len(last_3_home_results) > 0:
        matches_df.at[idx, 'home_team_last_3_wins'] = last_3_home_results.count('Won') if last_3_home_results.count('Won') > 0 else 0
        matches_df.at[idx, 'home_team_last_3_losses'] = last_3_home_results.count('Lost') if last_3_home_results.count('Lost') > 0 else 0
        matches_df.at[idx, 'home_team_last_3_draws'] = last_3_home_results.count('Draw') if last_3_home_results.count('Draw') > 0 else 0
    else:
        matches_df.at[idx, 'home_team_last_3_wins'] = 0
        matches_df.at[idx, 'home_team_last_3_losses'] = 0
        matches_df.at[idx, 'home_team_last_3_draws'] = 0

    if len(last_3_away_results) == 3:
        matches_df.at[idx, 'away_team_last_3_wins'] = last_3_away_results.count('Won') if last_3_away_results.count('Won') > 0 else 0
        matches_df.at[idx, 'away_team_last_3_losses'] = last_3_away_results.count('Lost') if last_3_away_results.count('Lost') > 0 else 0
        matches_df.at[idx, 'away_team_last_3_draws'] = last_3_away_results.count('Draw') if last_3_away_results.count('Draw') > 0 else 0
    elif len(last_3_away_results) < 3 and len(last_3_away_results) > 0:
        matches_df.at[idx, 'away_team_last_3_wins'] = last_3_away_results.count('Won') if last_3_away_results.count('Won') > 0 else 0
        matches_df.at[idx, 'away_team_last_3_losses'] = last_3_away_results.count('Lost') if last_3_away_results.count('Lost') > 0 else 0
        matches_df.at[idx, 'away_team_last_3_draws'] = last_3_away_results.count('Draw') if last_3_away_results.count('Draw') > 0 else 0
    else:
        matches_df.at[idx, 'away_team_last_3_wins'] = 0
        matches_df.at[idx, 'away_team_last_3_losses'] = 0
        matches_df.at[idx, 'away_team_last_3_draws'] = 0

    # Update the team results with the current match result for future matches in the season
    if result == 'Home Win':
        team_results[home_team].append("Won")
        team_results[away_team].append("Lost")
    elif result == 'Away Win':
        team_results[home_team].append("Lost")
        team_results[away_team].append("Won")
    else:  # Draw
        team_results[home_team].append("Draw")
        team_results[away_team].append("Draw")

# Print the DataFrame to check results
matches_df[['date', 'home_team_api_id', 'away_team_api_id', 
                      'home_team_last_3_wins', 'home_team_last_3_losses', 'home_team_last_3_draws',
                      'away_team_last_3_wins', 'away_team_last_3_losses', 'away_team_last_3_draws']]

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_team_last_3_wins,home_team_last_3_losses,home_team_last_3_draws,away_team_last_3_wins,away_team_last_3_losses,away_team_last_3_draws
3049,2008-08-09,9873,9853,0,0,0,0,0,0
3048,2008-08-09,9874,9855,0,0,0,0,0,0
3047,2008-08-09,9851,8592,0,0,0,0,0,0
3046,2008-08-09,8481,8639,0,0,0,0,0,0
3045,2008-08-09,9829,9847,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2979,2016-05-15,8659,8650,0,1,2,1,1,1
11478,2016-05-15,8543,8535,2,1,0,0,1,2
16576,2016-05-15,8370,8581,0,3,0,1,1,1
16569,2016-05-15,9869,10205,1,1,1,1,1,1


## Ratios of averaged player stats for home vs. away

In [11]:
# Define the function to process each match
def calculate_ratios_for_match(row):
    idx, match = row
    home_stats = {attr: [] for attr in attributes}
    away_stats = {attr: [] for attr in attributes}

    # Collect home team stats for each attribute (player_1 is goal keeper)
    for i in range(2, 12):
        for attr in attributes:
            home_stats[attr].append(match[f'home_player_{i}_{attr}'])
            away_stats[attr].append(match[f'away_player_{i}_{attr}'])

    # Calculate mean values for each attribute for home and away teams
    home_means = {attr: pd.Series(home_stats[attr]).mean() for attr in attributes}
    away_means = {attr: pd.Series(away_stats[attr]).mean() for attr in attributes}

    # Calculate the ratio of home to away team stats for each attribute
    ratios = {}
    for attr in attributes:
        if away_means[attr] != 0:  # Avoid division by zero
            ratios[f'{attr}_ratio'] = home_means[attr] / away_means[attr]
        else:
            ratios[f'{attr}_ratio'] = None  # Set to None if division by zero

    return idx, ratios

# Player attributes to calculate ratios for
attributes = [
    'overall_rating', 'potential', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 
    'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 
    'acceleration', 'sprint_speed', 'agility','reactions', 'balance', 'shot_power', 'jumping', 
    'stamina', 'strength', 'long_shots', 'aggression','interceptions', 'positioning', 'vision', 
    'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 
    'gk_kicking', 'gk_positioning', 'gk_reflexes'
]

# Initialize columns to store ratio features
for attr in attributes:
    matches_df[f'{attr}_ratio'] = 0.0

# Use ThreadPoolExecutor to parallelize the processing
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(calculate_ratios_for_match, row): row[0] for row in matches_df.iterrows()}

    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Calculating ratios"):
        idx, ratios = future.result()
        for attr, value in ratios.items():
            matches_df.at[idx, attr] = value

# Display the ratio columns
matches_df[[f'{attr}_ratio' for attr in attributes]].head()

Calculating ratios: 100%|████████████████| 16637/16637 [00:23<00:00, 716.52it/s]


Unnamed: 0,overall_rating_ratio,potential_ratio,crossing_ratio,finishing_ratio,heading_accuracy_ratio,short_passing_ratio,volleys_ratio,dribbling_ratio,curve_ratio,free_kick_accuracy_ratio,...,vision_ratio,penalties_ratio,marking_ratio,standing_tackle_ratio,sliding_tackle_ratio,gk_diving_ratio,gk_handling_ratio,gk_kicking_ratio,gk_positioning_ratio,gk_reflexes_ratio
3049,0.93733,0.957606,0.903481,0.932727,1.015408,0.888112,0.901745,0.89426,0.937997,0.961818,...,0.963391,0.899123,0.842491,0.914634,0.934087,0.916667,1.174603,1.111546,1.193548,1.193548
3048,1.065649,1.103107,1.062278,1.023952,1.097844,0.989583,0.900972,1.06446,1.054229,0.820604,...,0.909233,1.149406,1.063025,0.996212,0.954229,1.011905,0.995098,1.10387,1.040404,1.025641
3047,0.898722,0.921543,0.992188,0.920769,0.976912,0.946529,0.957198,1.035484,1.04943,0.965336,...,0.959627,0.84433,0.892653,0.947929,1.056485,0.667742,1.092135,1.126337,1.130233,1.123699
3046,0.933694,0.938725,0.964516,0.871126,0.918605,0.844648,1.003175,0.910769,0.907064,0.809826,...,0.913881,0.946824,0.880645,0.887324,0.654267,0.932584,0.931818,0.911765,0.959091,0.922727
3045,0.955405,0.968593,0.826625,0.932806,0.953917,0.878116,0.802198,0.750374,0.761364,0.863063,...,0.803596,0.831412,0.86964,0.863866,0.86201,2.260274,1.151961,0.846026,1.166667,1.234694


## Ratio of summed goal keeper stats

In [13]:
# Goalkeeper attributes to sum
goalkeeper_attributes = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']

# Initialize columns for storing the ratio of goalkeeper stats
matches_df['goalkeeper_stats_ratio'] = 0.0

# Calculate the sum of each goalkeeper's stats and the ratio
def calculate_goalkeeper_stat_ratio(row):
    # Sum the goalkeeper stats for 'home_player_1' and 'away_player_1'
    home_gk_sum = sum(row[f'home_player_1_{attr}'] for attr in goalkeeper_attributes)
    away_gk_sum = sum(row[f'away_player_1_{attr}'] for attr in goalkeeper_attributes)

    # Calculate the ratio of home to away stats, avoiding division by zero
    if away_gk_sum != 0:
        return home_gk_sum / away_gk_sum
    else:
        return None  # Handle cases where away_gk_sum is zero

# Apply the function to calculate the ratio for each match
matches_df['goalkeeper_stats_ratio'] = matches_df.apply(calculate_goalkeeper_stat_ratio, axis=1)

# Display the resulting dataframe with goalkeeper ratio columns
print(matches_df[['goalkeeper_stats_ratio']])

       goalkeeper_stats_ratio
3049                 1.069519
3048                 1.041667
3047                 0.889688
3046                 1.154762
3045                 0.335821
...                       ...
2979                 1.082645
11478                1.237179
16576                0.994521
16569                1.026455
2973                 1.156334

[16637 rows x 1 columns]


## Ratios of team attributes for home vs away

In [15]:
# Integer attributes to calculate home vs. away ratios
int_attributes = [
    'buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing',
    'chanceCreationCrossing', 'chanceCreationShooting', 'defencePressure',
    'defenceAggression', 'defenceTeamWidth'
]

# Initialize columns to store ratio features
for attr in int_attributes:
    matches_df[f'{attr}_ratio'] = 0.0

# Calculate the ratio for each integer team attribute
for attr in int_attributes:
    home_col = f'home_{attr}'
    away_col = f'away_{attr}'

    # Calculate the ratio of home to away for the attribute
    matches_df[f'{attr}_ratio'] = matches_df.apply(
        lambda row: row[home_col] / row[away_col] if row[away_col] != 0 else None,
        axis=1
    )

# Display the ratio columns
matches_df[[f'{attr}_ratio' for attr in int_attributes]]

Unnamed: 0,buildUpPlaySpeed_ratio,buildUpPlayPassing_ratio,chanceCreationPassing_ratio,chanceCreationCrossing_ratio,chanceCreationShooting_ratio,defencePressure_ratio,defenceAggression_ratio,defenceTeamWidth_ratio
3049,,,,,,,,
3048,,,,,,,,
3047,,,,,,,,
3046,,,,,,,,
3045,,,,,,,,
...,...,...,...,...,...,...,...,...
2979,0.939394,1.133333,1.558824,1.470588,1.304348,0.803922,0.769231,0.803279
11478,2.000000,1.666667,1.000000,1.568627,1.358491,1.017241,1.118644,1.065574
16576,0.467742,0.676923,0.619718,1.047619,0.723077,1.500000,1.575758,1.500000
16569,1.145833,1.388889,1.722222,1.134615,0.690909,0.957447,1.125000,0.931818


## Ranking of the teams in the league

In [17]:
# Loop through each season and league
for season in matches_df['season'].unique():
    for league in matches_df[matches_df['season'] == season]['league_id'].unique():
        # Filter the DataFrame for the current season and league
        season_league_df = matches_df[(matches_df['season'] == season) & (matches_df['league_id'] == league)]

        # Initialize Team Rankings DataFrame
        Team_rankings_df = pd.DataFrame(columns=[
            'rank', 'points', 'goals_scored', 'goals_conceded', 'goal_difference', 
            'matches_played', 'wins', 'draws', 'losses'
        ])
        Team_rankings_df.index.name = 'team_id'

        # Populate rankings for each match
        for idx, match in season_league_df.iterrows():
            home_team = match['home_team_api_id']
            away_team = match['away_team_api_id']
            home_goals = match['home_team_goal']
            away_goals = match['away_team_goal']
            result = match['match_result']

            # Initialize teams in the ranking table if not present
            for team in [home_team, away_team]:
                if team not in Team_rankings_df.index:
                    Team_rankings_df.loc[team] = {
                        'rank': 0, 'points': 0, 'goals_scored': 0, 
                        'goals_conceded': 0, 'goal_difference': 0, 
                        'matches_played': 0, 'wins': 0, 'draws': 0, 'losses': 0
                    }

            # Update pre-match statistics
            matches_df.at[idx, 'home_team_rank'] = Team_rankings_df.loc[home_team, 'rank']
            matches_df.at[idx, 'away_team_rank'] = Team_rankings_df.loc[away_team, 'rank']
            matches_df.at[idx, 'home_team_points'] = Team_rankings_df.loc[home_team, 'points']
            matches_df.at[idx, 'away_team_points'] = Team_rankings_df.loc[away_team, 'points']
            matches_df.at[idx, 'home_team_goals_scored'] = Team_rankings_df.loc[home_team]['goals_scored']
            matches_df.at[idx, 'away_team_goals_scored'] = Team_rankings_df.loc[away_team]['goals_scored']
            matches_df.at[idx, 'home_team_goals_conceded'] = Team_rankings_df.loc[home_team]['goals_conceded']
            matches_df.at[idx, 'away_team_goals_conceded'] = Team_rankings_df.loc[away_team]['goals_conceded']
            matches_df.at[idx, 'home_team_goal_difference'] = Team_rankings_df.loc[home_team]['goal_difference']
            matches_df.at[idx, 'away_team_goal_difference'] = Team_rankings_df.loc[away_team]['goal_difference']
            matches_df.at[idx, 'home_team_matches_played'] = Team_rankings_df.loc[home_team]['matches_played']
            matches_df.at[idx, 'away_team_matches_played'] = Team_rankings_df.loc[away_team]['matches_played']
            matches_df.at[idx, 'home_team_wins'] = Team_rankings_df.loc[home_team]['wins']
            matches_df.at[idx, 'away_team_wins'] = Team_rankings_df.loc[away_team]['wins']
            matches_df.at[idx, 'home_team_draws'] = Team_rankings_df.loc[home_team]['draws']
            matches_df.at[idx, 'away_team_draws'] = Team_rankings_df.loc[away_team]['draws']
            matches_df.at[idx, 'home_team_losses'] = Team_rankings_df.loc[home_team]['losses']
            matches_df.at[idx, 'away_team_losses'] = Team_rankings_df.loc[away_team]['losses']

            # Update statistics after the match
            Team_rankings_df.at[home_team, 'goals_scored'] += home_goals
            Team_rankings_df.at[home_team, 'goals_conceded'] += away_goals
            Team_rankings_df.at[home_team, 'goal_difference'] = (
                Team_rankings_df.loc[home_team, 'goals_scored'] - Team_rankings_df.loc[home_team, 'goals_conceded']
            )
            Team_rankings_df.at[home_team, 'matches_played'] += 1

            Team_rankings_df.at[away_team, 'goals_scored'] += away_goals
            Team_rankings_df.at[away_team, 'goals_conceded'] += home_goals
            Team_rankings_df.at[away_team, 'goal_difference'] = (
                Team_rankings_df.loc[away_team, 'goals_scored'] - Team_rankings_df.loc[away_team, 'goals_conceded']
            )
            Team_rankings_df.at[away_team, 'matches_played'] += 1

            # Update points and outcomes
            if result == 'Home Win':
                Team_rankings_df.at[home_team, 'points'] += 3
                Team_rankings_df.at[home_team, 'wins'] += 1
                Team_rankings_df.at[away_team, 'losses'] += 1
            elif result == 'Away Win':
                Team_rankings_df.at[away_team, 'points'] += 3
                Team_rankings_df.at[away_team, 'wins'] += 1
                Team_rankings_df.at[home_team, 'losses'] += 1
            else:  # Draw
                Team_rankings_df.at[home_team, 'points'] += 1
                Team_rankings_df.at[away_team, 'points'] += 1
                Team_rankings_df.at[home_team, 'draws'] += 1
                Team_rankings_df.at[away_team, 'draws'] += 1

            # Update ranks
            Team_rankings_df.sort_values(
                by=['points', 'goal_difference', 'goals_scored'], 
                ascending=False, inplace=True
            )
            Team_rankings_df['rank'] = range(1, len(Team_rankings_df) + 1)

# Display the rank columns
matches_df[['home_team_rank', 'away_team_rank']]

Unnamed: 0,home_team_rank,away_team_rank
3049,0.0,0.0
3048,0.0,0.0
3047,0.0,0.0
3046,0.0,0.0
3045,0.0,0.0
...,...,...
2979,15.0,8.0
11478,8.0,5.0
16576,19.0,20.0
16569,18.0,4.0


## Get Team Formations

In [19]:
def formation_extractor(match_api_id_ = None):
    """
    Extracts the formations and number of defenders, midfielders, and attackers for both home and away teams in a given match.

    Parameters:
    match_api_id_ (int): The match API ID to extract the formations for.

    Returns:
    tuple: Contains the formations and number of defenders, midfielders, and attackers for both home and away teams.
    """
    def squad_extractor(match_api_id_ = None):
        """
        Extracts the player positions for both home and away teams in a given match.

        Parameters:
        match_api_id_ (int): The match API ID to extract the player positions for.

        Returns:
        tuple: Contains the y-coordinates of the player positions for both home and away teams.
        """
        # Initialize lists to store player IDs and positions
        home_players_api_id = list()
        away_players_api_id = list()
        home_players_x = list()
        away_players_x = list()
        home_players_y = list()
        away_players_y = list()

        # Extract player IDs and positions for home and away teams
        for i in range(1, 12):
            home_players_api_id.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'home_player_{i}'].values[0])
            away_players_api_id.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'away_player_{i}'].values[0])
            home_players_x.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'home_player_X{i}'].values[0])
            away_players_x.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'away_player_X{i}'].values[0])
            home_players_y.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'home_player_Y{i}'].values[0])
            away_players_y.append(matches_df[matches_df["match_api_id"] == match_api_id_][f'away_player_Y{i}'].values[0])
        return home_players_y, away_players_y

    # Extract player positions for both teams
    players_y = squad_extractor(match_api_id_)
    formations = [None] * 2

    # Determine formations for both teams
    for i in range(2):
        formation_dict = Counter(players_y[i])
        sorted_keys = sorted(formation_dict)
        formation = ''
        for key in sorted_keys[1:-1]:
            y = formation_dict[key]
            formation += '%d-' % y
        formation += '%d' % formation_dict[sorted_keys[-1]]
        formations[i] = formation

    # Extract formations and number of defenders, midfielders, and attackers for both teams
    home_team_formation_ = formations[0]
    away_team_formation_ = formations[1]
    home_team_num_defenders_ = int(home_team_formation_.split('-')[0])
    home_team_num_midfielders_ = sum(int(x) for x in home_team_formation_.split('-')[1:-1])
    home_team_num_attackers_ = int(home_team_formation_.split('-')[-1])
    away_team_num_defenders_ = int(away_team_formation_.split('-')[0])
    away_team_num_midfielders_ = sum(int(x) for x in away_team_formation_.split('-')[1:-1])
    away_team_num_attackers_ = int(away_team_formation_.split('-')[-1])

    return (home_team_formation_, away_team_formation_, home_team_num_defenders_, 
            home_team_num_midfielders_, home_team_num_attackers_, away_team_num_defenders_, 
            away_team_num_midfielders_, away_team_num_attackers_)

# Define the function to process each match
def extract_formation_for_match(row):
    idx, match = row
    match_api_id = match['match_api_id']
    (home_team_formation, away_team_formation, home_team_num_defenders, 
     home_team_num_midfielders, home_team_num_attackers, away_team_num_defenders, 
     away_team_num_midfielders, away_team_num_attackers) = formation_extractor(match_api_id)
    
    return idx, {
        "home_team_formation": home_team_formation,
        "away_team_formation": away_team_formation,
        "home_team_num_defenders": home_team_num_defenders,
        "home_team_num_midfielders": home_team_num_midfielders,
        "home_team_num_attackers": home_team_num_attackers,
        "away_team_num_defenders": away_team_num_defenders,
        "away_team_num_midfielders": away_team_num_midfielders,
        "away_team_num_attackers": away_team_num_attackers
    }

# Ensure the necessary columns exist in the DataFrame
matches_df['home_team_formation'] = None
matches_df['away_team_formation'] = None
matches_df['home_team_num_defenders'] = None
matches_df['home_team_num_midfielders'] = None
matches_df['home_team_num_attackers'] = None
matches_df['away_team_num_defenders'] = None
matches_df['away_team_num_midfielders'] = None
matches_df['away_team_num_attackers'] = None

# Use ThreadPoolExecutor to parallelize the processing
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(extract_formation_for_match, row): row[0] for row in matches_df.iterrows()}

    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Extracting formations"):
        idx, formations = future.result()
        for key, value in formations.items():
            matches_df.at[idx, key] = value

# Convert to integer
columns_to_convert = [
    'home_team_num_defenders', 'home_team_num_midfielders', 'home_team_num_attackers',
    'away_team_num_defenders', 'away_team_num_midfielders', 'away_team_num_attackers'
]

for column in columns_to_convert:
    matches_df[column] = matches_df[column].astype(int)

# Display the formation columns
matches_df[['home_team_formation', 'away_team_formation', 'home_team_num_defenders', 
                'home_team_num_midfielders', 'home_team_num_attackers', 'away_team_num_defenders', 
                'away_team_num_midfielders', 'away_team_num_attackers']].head()

Extracting formations: 100%|██████████████| 16637/16637 [09:08<00:00, 30.34it/s]


Unnamed: 0,home_team_formation,away_team_formation,home_team_num_defenders,home_team_num_midfielders,home_team_num_attackers,away_team_num_defenders,away_team_num_midfielders,away_team_num_attackers
3049,4-4-2,4-5-1,4,4,2,4,5,1
3048,4-4-2,4-4-2,4,4,2,4,4,2
3047,4-4-2,4-4-2,4,4,2,4,4,2
3046,4-5-1,4-3-3,4,5,1,4,3,3
3045,4-4-2,4-4-2,4,4,2,4,4,2


# Save to CSV

In [25]:
# Create the directory
output_dir = "Data"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the matches_df DataFrame to a CSV file
output_path = os.path.join(output_dir, "matches_2.csv")
matches_df.to_csv(output_path, index=False)

print(f"matches_df has been saved to {output_path}")

matches_df has been saved to Data/matches_2.csv
