# Import Packages

In [2]:
import os
import pandas as pd

# Load Match Data

In [4]:
# Read the match data from disk into a DataFrame
matches_csv_path = "Data/matches_2.csv"
matches_df = pd.read_csv(
    filepath_or_buffer=matches_csv_path,
    low_memory=False,
)

# Preview the first rows of the loaded data
matches_df.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,home_team_losses,away_team_losses,home_team_formation,away_team_formation,home_team_num_defenders,home_team_num_midfielders,home_team_num_attackers,away_team_num_defenders,away_team_num_midfielders,away_team_num_attackers
0,4778,4769,4769,2008/2009,1,2008-08-09,483138,9873,9853,1,...,0.0,0.0,4-4-2,4-5-1,4,4,2,4,5,1
1,4777,4769,4769,2008/2009,1,2008-08-09,483137,9874,9855,1,...,0.0,0.0,4-4-2,4-4-2,4,4,2,4,4,2
2,4776,4769,4769,2008/2009,1,2008-08-09,483136,9851,8592,4,...,0.0,0.0,4-4-2,4-4-2,4,4,2,4,4,2
3,4775,4769,4769,2008/2009,1,2008-08-09,483135,8481,8639,0,...,0.0,0.0,4-5-1,4-3-3,4,5,1,4,3,3
4,4774,4769,4769,2008/2009,1,2008-08-09,483134,9829,9847,1,...,0.0,0.0,4-4-2,4-4-2,4,4,2,4,4,2


# Data Cleaning

In [13]:
def process_match_data(df, max_missing=400):
    """
    Preprocess the matches DataFrame without modifying the caller's object.

    Steps performed, in order:
    1. Drop in-game event columns, match result columns and cumulative
       team statistics columns (any that are absent are ignored).
    2. Drop columns with more than `max_missing` missing values.
    3. Drop rows with any missing values.
    4. Remove rows with invalid formations for home and away teams.

    Input:
        df (pd.DataFrame): The input DataFrame to preprocess. It must contain
            the 'home_team_formation' and 'away_team_formation' columns.
        max_missing (int): Largest number of missing values a column may have
            and still be kept. Defaults to 400.

    Output: pd.DataFrame: A new, preprocessed DataFrame; `df` is left untouched.
    """
    # In-game events, match results and cumulative team statistics
    columns_to_drop = [
        'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner',
        'possession', 'id', 'home_team_goal', 'away_team_goal', 'date',
        'goal_difference', 'home_team_points', 'away_team_points',
        'home_team_goals_scored', 'away_team_goals_scored', 'home_team_goals_conceded',
        'away_team_goals_conceded', 'home_team_goal_difference',
        'away_team_goal_difference', 'home_team_matches_played', 'away_team_matches_played',
        'home_team_wins', 'away_team_wins', 'home_team_draws', 'away_team_draws',
        'home_team_losses', 'away_team_losses',
    ]
    # Every step below returns a new object, so the caller's DataFrame is
    # never mutated (the previous in-place drops altered the argument).
    cleaned = df.drop(columns=columns_to_drop, errors='ignore')

    # Drop columns whose number of missing values exceeds the threshold
    missing_counts = cleaned.isnull().sum()
    sparse_columns = missing_counts[missing_counts > max_missing].index
    cleaned = cleaned.drop(columns=sparse_columns)

    # Drop rows with any missing values
    cleaned = cleaned.dropna()

    # Define invalid formations for home and away teams
    invalid_home_formations = ['1-1-1-1-1-1-1-1-1-1', '6-3', '4-2-2-1']
    invalid_away_formations = ['1-1-1-1-1-1-1-1-1-1', '2-3-1-1', '3-3-1-1-1', '3-2-1-1']

    # Keep only rows where both teams have a valid formation
    home_is_valid = ~cleaned["home_team_formation"].isin(invalid_home_formations)
    away_is_valid = ~cleaned["away_team_formation"].isin(invalid_away_formations)

    return cleaned[home_is_valid & away_is_valid]

# Run the cleaning step on the loaded matches and keep the result
matches_df = matches_df.pipe(process_match_data)

# Show the cleaned DataFrame
matches_df

Unnamed: 0,country_id,league_id,season,stage,match_api_id,home_team_api_id,away_team_api_id,home_player_X1,home_player_X2,home_player_X3,...,home_team_rank,away_team_rank,home_team_formation,away_team_formation,home_team_num_defenders,home_team_num_midfielders,home_team_num_attackers,away_team_num_defenders,away_team_num_midfielders,away_team_num_attackers
3,4769,4769,2008/2009,1,483135,8481,8639,1.0,4.0,6.0,...,0.0,0.0,4-5-1,4-3-3,4,5,1,4,3,3
4,4769,4769,2008/2009,1,483134,9829,9847,1.0,2.0,4.0,...,0.0,0.0,4-4-2,4-4-2,4,4,2,4,4,2
9,4769,4769,2008/2009,1,483133,9748,9941,1.0,2.0,4.0,...,0.0,0.0,4-3-3,4-3-3,4,3,3,4,3,3
10,7809,7809,2008/2009,1,499317,9823,9790,1.0,2.0,4.0,...,0.0,0.0,4-1-2-1-2,4-3-3,4,4,2,4,3,3
20,4769,4769,2008/2009,2,483146,9847,9827,1.0,2.0,4.0,...,19.0,6.0,4-5-1,4-4-2,4,5,1,4,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16630,10257,10257,2015/2016,38,2060636,10233,8524,1.0,3.0,5.0,...,10.0,13.0,3-5-2,4-2-3-1,3,5,2,4,5,1
16631,10257,10257,2015/2016,38,2060637,8533,9857,1.0,2.0,4.0,...,9.0,14.0,4-3-1-2,4-2-3-1,4,4,2,4,5,1
16632,1729,1729,2015/2016,38,1987606,8659,8650,1.0,2.0,4.0,...,15.0,8.0,4-2-3-1,4-3-3,4,5,1,4,3,3
16633,10257,10257,2015/2016,38,2060639,8543,8535,1.0,2.0,4.0,...,8.0,5.0,4-3-3,3-4-2-1,4,3,3,3,6,1


# Save to CSV

In [16]:
# Make sure the output directory exists; exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first
output_dir = "Data"
os.makedirs(output_dir, exist_ok=True)

# Save the matches_df DataFrame to a CSV file, without the row index
output_path = os.path.join(output_dir, "matches_3.csv")
matches_df.to_csv(output_path, index=False)

print(f"matches_df has been saved to {output_path}")

matches_df has been saved to Data/matches_3.csv
