In [1]:
import pandas as pd
import json

In [2]:
!wget https://www.football-data.co.uk/new/BRA.csv

--2025-05-26 21:14:31--  https://www.football-data.co.uk/new/BRA.csv
Resolving www.football-data.co.uk (www.football-data.co.uk)... 217.160.0.246
Connecting to www.football-data.co.uk (www.football-data.co.uk)|217.160.0.246|:443... 

connected.
HTTP request sent, awaiting response... 200 OK
Length: 558200 (545K) [text/csv]
Saving to: ‘BRA.csv’


2025-05-26 21:14:33 (515 KB/s) - ‘BRA.csv’ saved [558200/558200]



Transformando o nome dos times para valores numéricos

In [3]:
def map_names_to_ids(df):
    """
        Replaces the team names in the 'Home' and 'Away' columns with integer IDs.

        IDs are assigned by sorting every distinct name found in either column
        and numbering them from 0, so the same set of names always yields the
        same mapping.

        Args:
            df: DataFrame holding the match data; must contain the columns
                'Home' and 'Away'. It is NOT modified.

        Returns:
            A tuple (new_df, mapped_names) where new_df is a copy of df with
            'Home' and 'Away' replaced by IDs, and mapped_names is the
            {team name: id} dictionary that was applied.
    """
    # Combine both columns to get all unique names; missing values are not
    # team names and would make sorted() fail when compared against strings
    all_names = sorted(pd.concat([df['Home'], df['Away']]).dropna().unique())

    # Mapping each name to its position in the sorted list
    mapped_names = {name: i for i, name in enumerate(all_names)}

    # assign() returns a new frame, leaving the caller's DataFrame untouched
    mapped_df = df.assign(
        Home=df['Home'].map(mapped_names),
        Away=df['Away'].map(mapped_names),
    )

    return mapped_df, mapped_names

def save_mapped_ids(mapped_names, target_csv):
    """
        Saves the team-name -> ID mapping as an indented JSON file.

        The JSON path is target_csv with its extension replaced by ".json"
        (e.g. "data/BRA-modified.csv" -> "data/BRA-modified.json").

        Args:
            mapped_names: dictionary {team name: id} to serialise.
            target_csv: path of the CSV the mapping belongs to.
    """
    import os  # local import so the function does not depend on the import cell

    # splitext removes whatever extension is present (or none at all), instead
    # of blindly chopping the last four characters of the path
    base_name, _extension = os.path.splitext(target_csv)
    file_name = base_name + ".json"

    with open(file_name, "w", encoding="utf-8") as file:
        json.dump(mapped_names, file, indent=4)


def add_new_columns(df: pd.DataFrame):
    """
        Adds the twelve statistics columns to df, all filled with 0.

        For the home side ("H") and then the away side ("A") the columns
        W_, D_, L_, GF_, GA_ and GD_ are created, in that order.
        The DataFrame is changed in place and also returned.
    """
    stat_prefixes = ("W", "D", "L", "GF", "GA", "GD")

    for side in ("H", "A"):
        for prefix in stat_prefixes:
            df[f'{prefix}_{side}'] = 0

    return df


def resetTeamStatisticsDict(mapped_names, teamStatistics):
    """
        Sets the statistics of every team ID in mapped_names back to zero.

        teamStatistics is updated in place: each ID found among the values of
        mapped_names receives its own fresh record with the keys
        W (wins), D (draws), L (losses), GF (goals for), GA (goals against)
        and GD (goal difference). Entries for other keys are left untouched.
    """
    stat_keys = ('W', 'D', 'L', 'GF', 'GA', 'GD')

    teamStatistics.update({
        team_id: {key: 0 for key in stat_keys}
        for team_id in mapped_names.values()
    })


def updateStatistics(teamStatistics, home, away, goals_home, goals_away):
    """
        Adds the result of one match to the records of both teams.

        teamStatistics[home] and teamStatistics[away] are updated in place:
        goals for/against and goal difference first, then one win/loss for
        the respective sides, or a draw for both when neither side scored
        more than the other.
    """
    fixtures = (
        (home, goals_home, goals_away),
        (away, goals_away, goals_home),
    )
    for team, scored, conceded in fixtures:
        record = teamStatistics[team]
        record['GF'] += scored
        record['GA'] += conceded
        record['GD'] += scored - conceded

    # Pick which counter each side increments
    if goals_home > goals_away:
        home_outcome, away_outcome = 'W', 'L'
    elif goals_away > goals_home:
        home_outcome, away_outcome = 'L', 'W'
    else:
        home_outcome = away_outcome = 'D'

    teamStatistics[home][home_outcome] += 1
    teamStatistics[away][away_outcome] += 1


def make_statistics(df: pd.DataFrame, mapped_names: dict):
    """
        Fills every match row with the season record of both teams so far.

        For each row, the columns W/D/L/GF/GA/GD suffixed with _H (home team)
        and _A (away team) receive the totals that team had accumulated in the
        current season BEFORE that match, so a row never contains its own
        result. Totals restart from zero whenever the value of 'Season'
        changes from one row to the next, which means the rows are expected
        to be grouped by season and ordered chronologically.

        Args:
            df: match data with the columns 'Season', 'Home', 'Away' (team
                IDs), 'HG' and 'AG' (home / away goals). Modified in place.
            mapped_names: {team name: id} dictionary; its values are the IDs
                for which statistics are tracked.

        Returns:
            The same DataFrame with the twelve statistics columns filled in.
    """
    df = add_new_columns(df)

    # Nothing to accumulate, and df.iloc[0] below would raise on an empty frame
    if df.empty:
        return df

    teamStatistics = {}
    resetTeamStatisticsDict(mapped_names, teamStatistics)

    stat_keys = ("W", "D", "L", "GF", "GA", "GD")
    current_year = df.iloc[0]["Season"]

    for index, row in df.iterrows():
        # A new season starts: every team goes back to zero
        if row["Season"] != current_year:
            current_year = row["Season"]
            resetTeamStatisticsDict(mapped_names, teamStatistics)

        home_id = row["Home"]
        away_id = row["Away"]

        # Saving the statistics from the season until the current game
        for suffix, team_id in (("H", home_id), ("A", away_id)):
            team_record = teamStatistics[team_id]
            for key in stat_keys:
                df.loc[index, f"{key}_{suffix}"] = team_record[key]

        # Updating statistics
        home_goals = row["HG"]
        away_goals = row["AG"]

        # A match without a score must not count as a draw, nor turn the
        # running goal totals into NaN for the rest of the season
        if pd.isna(home_goals) or pd.isna(away_goals):
            continue

        updateStatistics(teamStatistics, home_id, away_id, home_goals, away_goals)

    return df


def update_csv(original_csv, target_csv):
    """
        Builds the processed Brasileirao CSV from the raw one.

        Steps:
            1. Reads original_csv.
            2. Replaces the team names in 'Home' and 'Away' with numeric IDs
               and saves the name -> ID mapping through save_mapped_ids.
            3. Drops the columns that are not used afterwards.
            4. Adds the running season statistics of both teams to every match.
            5. Writes the result to target_csv without the index column.

        Args:
            original_csv: path of the raw CSV file.
            target_csv: path of the CSV file to create.
    """
    df = pd.read_csv(original_csv)

    df, mapped_names = map_names_to_ids(df)

    save_mapped_ids(mapped_names, target_csv)

    # Columns not used afterwards. Per the original note, the last three
    # (BFECH, BFECD, BFECA) have many empty values.
    unused_columns = ["Country", "League", "Date", "Time", "BFECH", "BFECD", "BFECA"]

    # errors="ignore": a download that lacks one of these optional columns
    # should still be processed instead of failing with KeyError
    df = df.drop(columns=unused_columns, errors="ignore")

    df = make_statistics(df, mapped_names)

    df.to_csv(target_csv, index=False)


# Run the whole pipeline: reads data/BRA.csv and writes data/BRA-modified.csv,
# plus the team-name -> ID mapping saved alongside it by save_mapped_ids.
update_csv("data/BRA.csv", "data/BRA-modified.csv")
