In [2]:
!pip install -q statsbombpy
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency reso

In [3]:
import datetime
import pandas as pd

from statsbombpy import sb

from tqdm import tqdm
import json
import random
from datasets import load_dataset

from huggingface_hub import notebook_login

import warnings
warnings.filterwarnings('ignore')

# Funções Auxiliares

In [4]:
def describe_event(row):
    """
    Converte uma linha (evento de futebol) em uma sentença curta em inglês, incluindo:
      - Tempo: "At minute X:YY"
      - Posição, jogador e equipe
      - Localização em campo (x, y), se disponível
      - Tipo do evento com detalhes relevantes
    A sentença resultante é encapsulada com as tags <event> e <action>,
    seguindo o formato:
      <event>[Descrição geral] <action>[Ação]</action></event>
    """
    # Informações básicas: time, jogador e posição
    team_name = row.get("team", "UnknownTeam")
    player_name = row.get("player", "UnknownPlayer")
    position_name = row.get("position", "UnknownPosition")

    # Tipo do evento
    event_type = row.get("type", "UnknownEvent")

    # Tempo do evento (minuto e segundo)
    minute = row.get("minute", 0)
    second = row.get("second", 0)

    # Localização no campo (coordenadas x, y)
    pitch_location = row.get("location", None)
    if isinstance(pitch_location, (list, tuple)) and len(pitch_location) == 2:
        x, y = pitch_location
        location_str = f" at coordinates (x={int(x)}, y={int(y)})"
    else:
        location_str = ""

    # Detalhes específicos do evento
    # Passe
    pass_outcome = row.get("pass_outcome", None)   # ex.: "Complete", "Incomplete"
    pass_cross = row.get("pass_cross", False)        # booleano: True se foi um cruzamento

    # Drible
    dribble_outcome = row.get("dribble_outcome", None)  # ex.: "Complete", "Failed"

    # Chute
    shot_outcome = row.get("shot_outcome", None)      # ex.: "Goal", "Off Target"
    shot_body_part = row.get("shot_body_part", None)    # ex.: "Head", "Right Foot"

    # Monta a parte geral da sentença (histórico do evento)
    history_str = (
        f"At minute {minute}:{second:02d}, {position_name} {player_name} from {team_name}"
        f"{location_str}"
    )

    # Lógica para definir o rótulo da ação, de acordo com o tipo do evento
    evt = str(event_type).lower()
    if evt == "pass":
        if pd.notnull(pass_outcome):
            action_detail = f"made a pass ({pass_outcome})"
        else:
            action_detail = "made a pass"
        if pass_cross:
            action_detail += " (cross)"
    elif evt == "dribble":
        action_detail = f"attempted a dribble ({dribble_outcome})" if pd.notnull(dribble_outcome) else "attempted a dribble"
    elif evt == "shot":
        part_info = f" using {shot_body_part.lower()}" if pd.notnull(shot_body_part) else ""
        if pd.notnull(shot_outcome):
            action_detail = f"took a shot{part_info} ({shot_outcome})"
        else:
            action_detail = f"took a shot{part_info}"
    else:
        action_detail = str(event_type)

    # Monta a sentença final, encapsulando com as tags <event> e <action>
    sentence = f"<event>{history_str} <action>{action_detail}</action></event>"

    return sentence

In [5]:
def gerar_sequencias_n_lances(df, n_lances=3):
    """
    Gera uma lista de tuplas (prompt, proximo_lance) para treinar um modelo de linguagem.

    - prompt: texto resultante da concatenação de n lances em sequência (histórico)
    - proximo_lance: texto do lance seguinte (rótulo) que o modelo deverá aprender a prever

    Nesta versão, utiliza-se uma janela de 3 lances (n_lances = 3).
    """
    sequencias = []

    # Ordena os eventos cronologicamente
    df = df.sort_values(by=["match_id", "period", "minute", "second", "index"],
                        ascending=True).reset_index(drop=True)

    # Percorre o DataFrame utilizando uma janela deslizante de tamanho n_lances
    for i in range(len(df) - n_lances):
        subset = df.iloc[i : i + n_lances]
        proximo = df.iloc[i + n_lances]

        # Converte cada lance da sequência para uma sentença com as tags
        lances_texto = [describe_event(row) for _, row in subset.iterrows()]
        prompt = "\n".join(lances_texto)

        lance_seguinte_texto = describe_event(proximo)

        sequencias.append((prompt, lance_seguinte_texto))

    return sequencias

In [6]:
def split_at_type(sentence):
    """
    Divide uma sentença em duas partes com base na tag <action>.

    Retorna um dicionário com:
      - "previous_part": parte da sentença até antes da abertura da tag <action>
      - "action_label": conteúdo dentro da tag <action> (sem as tags)

    Caso a tag <action> não seja encontrada, retorna a sentença inteira como 'previous_part'
    e 'action_label' vazia.
    """
    start_tag = "<action>"
    end_tag = "</action>"

    if start_tag not in sentence or end_tag not in sentence:
        return {
            "previous_part": sentence,
            "action_label": ""
        }

    # Divide a sentença para extrair o conteúdo entre as tags
    start_idx = sentence.find(start_tag)
    end_idx = sentence.find(end_tag)

    previous_part = sentence[:start_idx]
    action_label = sentence[start_idx + len(start_tag): end_idx].strip()

    return {
        "previous_part": previous_part,
        "action_label": action_label
    }

In [7]:
def convert_pairs_to_dicts(pairs):
    """
    Recebe uma lista de tuplas (history, next_event) e retorna uma lista de dicionários.

    Cada dicionário contém:
      - "history": o prompt completo dos eventos anteriores concatenado com a parte anterior da sentença do próximo evento
      - "action_label": o conteúdo extraído da tag <action> do próximo evento
    """
    results = []
    for (history, next_event) in pairs:
        splitted = split_at_type(next_event)
        event_dict = {
            "history": history + splitted["previous_part"],
            "action_label": splitted["action_label"]
        }
        results.append(event_dict)
    return results

# Organizar dados

In [8]:
# Definindo os parâmetros para La Liga
competition_id = 11
season_ids = [90, 42]  # 90: uma temporada; 42: outra temporada

# Coletando os dados das partidas para as duas temporadas
matches_list = []
for season_id in season_ids:
    matches_df = sb.matches(competition_id=competition_id, season_id=season_id)
    # Adicionando uma coluna 'season_id' para identificar a temporada, se ainda não existir
    matches_df['season_id'] = season_id
    matches_list.append(matches_df)

# Concatenando os DataFrames das duas temporadas
la_liga_matches = pd.concat(matches_list, ignore_index=True)

In [9]:
# Exibindo as primeiras linhas do DataFrame consolidado
print("Visualização inicial dos dados de partidas:")
print(la_liga_matches.shape)
la_liga_matches.head(2)

Visualização inicial dos dados de partidas:
(68, 23)


Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version,season_id
0,3773386,2020-10-31,21:00:00.000,Spain - La Liga,2020/2021,Deportivo Alavés,Barcelona,1,1,available,...,8,Regular Season,Estadio de Mendizorroza,,Pablo Javier Machín Díez,Ronald Koeman,1.1.0,2,2,90
1,3773565,2021-01-09,18:30:00.000,Spain - La Liga,2020/2021,Granada,Barcelona,0,4,available,...,18,Regular Season,Estadio Nuevo Los Cármenes,Ricardo De Burgos Bengoetxea,Diego Martínez Penas,Ronald Koeman,1.1.0,2,2,90


In [10]:
# Inicializando uma lista para armazenar os DataFrames de eventos de cada partida
all_events = []

# Iterando por cada partida para extrair os eventos
for match_id in la_liga_matches['match_id'].unique():
    #print(f"Coletando eventos da partida {match_id}...")
    events_df = sb.events(match_id=match_id)
    # Opcional: adicionar uma coluna com o match_id, caso não esteja presente
    events_df['match_id'] = match_id
    all_events.append(events_df)

In [11]:
# Concatenando os dados de eventos de todas as partidas
events_all = pd.concat(all_events, ignore_index=True)

In [12]:
print(events_all.shape)
events_all.head()

(268088, 113)


Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,...,goalkeeper_shot_saved_off_target,goalkeeper_shot_saved_to_post,shot_saved_off_target,shot_saved_to_post,block_save_block,dribble_no_touch,shot_redirect,shot_follows_dribble,goalkeeper_success_in_play,goalkeeper_lost_in_play
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


# Criar sequências

In [38]:
VERSION_DATASET = '5k'
NUM_SEQ = 5000
N_LANCES = 3

In [39]:
excluded_types = ["Starting XI", "Half Start", "Half End"]

all_sequences = []
n_lances = N_LANCES
MAX_SEQ = 1000
WITH_LIMIT = False

# Ordena os eventos cronologicamente
df_events = events_all.sort_values(by=["match_id", "period", "minute", "second", "index"],
                    ascending=True).reset_index(drop=True)

# Filtra os eventos indesejados
df_events = df_events[~df_events["type"].isin(excluded_types)]

for i in tqdm(range(len(df_events) - n_lances), desc="Processando Janelas"):
    subset = df_events.iloc[i : i + n_lances]
    proximo = df_events.iloc[i + n_lances]

    lances_texto = [describe_event(row) for _, row in subset.iterrows()]
    prompt = "\n".join(lances_texto)
    lance_seguinte_texto = describe_event(proximo)

    all_sequences.append((prompt, lance_seguinte_texto))

    if len(all_sequences) >= MAX_SEQ and WITH_LIMIT:
        print("Parando devido ao limite máximo de sequências.")
        break

Processando Janelas: 100%|██████████| 267405/267405 [02:11<00:00, 2034.76it/s]


In [None]:
# Exemplo de uso (assumindo que df_extract e sb foram definidos anteriormente e que df_extract contém
# as colunas "competition_id" e "season_id" para selecionar os jogos)
'''
excluded_types = ["Starting XI", "Half Start", "Half End"]
MAX_SEQ = 1000
WITH_LIMIT = False
all_sequences = []

for match_id in la_liga_matches['match_id'].unique():
    if len(all_sequences) >= MAX_SEQ and WITH_LIMIT:
        break

    # Carrega os eventos para a partida
    events_df = sb.events(match_id=match_id)

    # Ordena os eventos por período, minuto e segundo
    events_df = events_df.sort_values(by=["period", "minute", "second"]).reset_index(drop=True)

    # Filtra os eventos indesejados
    df_filtered = events_df[~events_df["type"].isin(excluded_types)]

    # Gera sequências com uma janela de 5 lances
    sequencias = gerar_sequencias_n_lances(df_filtered, n_lances=3)

    for seq in sequencias:
        all_sequences.append(seq)
        if len(all_sequences) >= MAX_SEQ and WITH_LIMIT:
            break
'''

In [40]:
len(all_sequences)

267405

In [41]:
sequences_sample = random.sample(all_sequences, NUM_SEQ)

len(sequences_sample)

5000

In [42]:
# Opcional: converte os pares (prompt, próximo_lance) em uma lista de dicionários para uso posterior
sequences_dicts = convert_pairs_to_dicts(sequences_sample)


In [43]:
print(len(sequences_dicts))
sequences_dicts[:3]

5000


[{'history': '<event>At minute 55:57, Left Center Back Clément Lenglet from Barcelona at coordinates (x=76, y=22) <action>made a pass (cross)</action></event>\n<event>At minute 55:58, Left Wing Anssumane Fati from Barcelona at coordinates (x=92, y=8) <action>Ball Receipt*</action></event>\n<event>At minute 55:58, Left Wing Anssumane Fati from Barcelona at coordinates (x=92, y=8) <action>Carry</action></event><event>At minute 56:01, Left Wing Anssumane Fati from Barcelona at coordinates (x=95, y=9) ',
  'action_label': 'made a pass (cross)'},
 {'history': '<event>At minute 18:15, Center Forward Lionel Andrés Messi Cuccittini from Barcelona at coordinates (x=63, y=62) <action>Carry</action></event>\n<event>At minute 18:21, Center Forward Lionel Andrés Messi Cuccittini from Barcelona at coordinates (x=77, y=49) <action>made a pass (cross)</action></event>\n<event>At minute 18:22, Left Midfield Martin Braithwaite Christensen from Barcelona at coordinates (x=84, y=35) <action>Ball Receipt*<

# Divisão Treino, Validação, Teste

In [44]:
random.shuffle(sequences_dicts)

# Define train/validation/test percentages
TRAIN_PCT = 0.8
VALID_PCT = 0.1

# Compute indices for slicing
train_end = int(len(sequences_dicts) * TRAIN_PCT)
valid_end = int(len(sequences_dicts) * (TRAIN_PCT + VALID_PCT))

# Split the data
train_data = sequences_dicts[:train_end]
valid_data = sequences_dicts[train_end:valid_end]
test_data  = sequences_dicts[valid_end:]

In [45]:
# Function to save each list of dictionaries as JSONL
def save_as_jsonl(data, filename):
    with open(filename, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [46]:
# Save them into temporary JSONL files
save_as_jsonl(train_data, "train.jsonl")
save_as_jsonl(valid_data, "valid.jsonl")
save_as_jsonl(test_data,  "test.jsonl")

In [47]:
# Load them as Hugging Face datasets
data_files = {
    "train": "train.jsonl",
    "validation": "valid.jsonl",
    "test": "test.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['history', 'action_label'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['history', 'action_label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['history', 'action_label'],
        num_rows: 500
    })
})


In [48]:
dataset['train'][0]

{'history': '<event>At minute 62:16, Right Center Forward Ante Budimir from Mallorca at coordinates (x=49, y=45) <action>Pressure</action></event>\n<event>At minute 62:16, Left Center Back Clément Lenglet from Barcelona at coordinates (x=71, y=32) <action>made a pass (cross)</action></event>\n<event>At minute 62:17, Left Center Midfield Frenkie de Jong from Barcelona at coordinates (x=76, y=28) <action>Ball Receipt*</action></event><event>At minute 62:17, Left Center Midfield Frenkie de Jong from Barcelona at coordinates (x=76, y=28) ',
 'action_label': 'made a pass (cross)'}

# Upload Dataset to Hub

In [24]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [49]:
dataset.push_to_hub(f"football-events-statsbomb360-la-liga-{N_LANCES}-{VERSION_DATASET}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/muriloms/football-events-statsbomb360-la-liga-3-5k/commit/16479168b7d98e50bceea22803767f55e1cee3f5', commit_message='Upload dataset', commit_description='', oid='16479168b7d98e50bceea22803767f55e1cee3f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/muriloms/football-events-statsbomb360-la-liga-3-5k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='muriloms/football-events-statsbomb360-la-liga-3-5k'), pr_revision=None, pr_num=None)