In [None]:
!pip install -q statsbombpy
#!pip install -q datasets
#!pip install -q huggingface_hub

In [None]:
import datetime
import pandas as pd

from statsbombpy import sb

import json
import random
from datasets import load_dataset

from huggingface_hub import notebook_login

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/auth warnings from the libraries
# imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Criar sequências

In [None]:
def _is_missing(value):
    """Return True when `value` is None or a scalar null (NaN, NaT, pd.NA)."""
    # The is_scalar guard keeps list-like values (e.g. a location pair) away
    # from pd.isna, which would return an array instead of a single bool.
    return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))


def _field(row, key, default=None):
    """Read `key` from `row`, falling back to `default` when absent OR null."""
    # row.get(key, default) alone only covers an absent key; a column that
    # exists but holds NaN would otherwise leak "nan" into the sentence.
    value = row.get(key, default)
    return default if _is_missing(value) else value


def describe_event(row):
    """
    Converts a single row (football event) into a short English sentence.

    The sentence contains:
      - the match clock as "minute:second" (seconds zero-padded to 2 digits)
      - the player's role, name and team
      - the pitch location (x, y) truncated to integers, when `location`
        is a 2-element list/tuple
      - the event type, with extra detail for passes, dribbles and shots

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Must support `.get(key, default)`. Keys read: "team", "player",
        "position", "type", "minute", "second", "location", "pass_outcome",
        "pass_cross", "dribble_outcome", "shot_outcome", "shot_body_part".
        Any of them may be absent or null.

    Returns
    -------
    str
        "At minute M:SS, <position> <player> from <team>[ at coordinates
        (x=X, y=Y)] performed an event of type: <detail>."
    """
    # Team, player, and role/position (null-safe)
    team_name = _field(row, "team", "UnknownTeam")
    player_name = _field(row, "player", "UnknownPlayer")
    position_name = _field(row, "position", "UnknownPosition")

    # Event type
    event_type = _field(row, "type", "UnknownEvent")

    # Time: coerce to int so float-typed columns still satisfy the 02d format
    minute = int(_field(row, "minute", 0))
    second = int(_field(row, "second", 0))

    # (x, y) coordinates on the pitch; anything that is not a pair is omitted
    pitch_location = row.get("location", None)
    if isinstance(pitch_location, (list, tuple)) and len(pitch_location) == 2:
        x, y = pitch_location
        location_str = f" at coordinates (x={int(x)}, y={int(y)})"
    else:
        location_str = ""

    # Base sentence
    # Example: "At minute 10:15, Striker Alice from Team A at coordinates (x=55, y=30)"
    sentence = (
        f"At minute {minute}:{second:02d}, "
        f"{position_name} {player_name} from {team_name}"
        f"{location_str}"
    )

    # Event-specific detail
    evt = str(event_type).lower()

    if evt == "pass":
        pass_outcome = _field(row, "pass_outcome")
        detail = "made a pass" if pass_outcome is None else f"made a pass ({pass_outcome})"

        # NaN is truthy in Python, so a bare `if pass_cross:` would tag every
        # pass whose flag is simply empty as a cross. Require a real True.
        if bool(_field(row, "pass_cross", False)):
            detail += " (cross)"

    elif evt == "dribble":
        dribble_outcome = _field(row, "dribble_outcome")
        if dribble_outcome is None:
            detail = "attempted a dribble"
        else:
            detail = f"attempted a dribble ({dribble_outcome})"

    elif evt == "shot":
        shot_body_part = _field(row, "shot_body_part")
        shot_outcome = _field(row, "shot_outcome")
        part_info = "" if shot_body_part is None else f" using {str(shot_body_part).lower()}"
        if shot_outcome is None:
            detail = f"took a shot{part_info}"
        else:
            detail = f"took a shot{part_info} ({shot_outcome})"

    else:
        # Generic event type: the raw type name is the detail
        detail = event_type

    # Single space after "type:" in every branch so all sentences share one format
    return f"{sentence} performed an event of type: {detail}."


In [None]:
def gerar_sequencias_n_lances(df, n_lances=5):
    """
    Builds (prompt, next_event) text pairs for training a language model.

    Events are ordered by match_id, period, minute, second and index, then a
    window of `n_lances` consecutive events slides over each match with a
    stride of one event:

    - prompt     = the descriptions of the `n_lances` events in the window,
                   joined with "\n"
    - next_event = the description of the event right after the window
                   (what the model should learn to predict)

    Windows never span two different matches: the events that end one match
    say nothing about how the next one starts.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows. Must contain the columns "match_id", "period", "minute",
        "second" and "index", plus whatever `describe_event` reads.
    n_lances : int, default 5
        Number of events in each prompt. Must be >= 1.

    Returns
    -------
    list[tuple[str, str]]
        One (prompt, next_event) pair per window. A match with `n_lances`
        or fewer events contributes nothing.

    Raises
    ------
    ValueError
        If `n_lances` is smaller than 1.
    """
    if n_lances < 1:
        raise ValueError("n_lances must be a positive integer")

    # Make sure the events are in chronological order
    ordered = df.sort_values(
        by=["match_id", "period", "minute", "second", "index"], ascending=True
    ).reset_index(drop=True)

    sequencias = []

    # sort=False keeps the groups in the order produced by the sort above;
    # dropna=False keeps rows whose match_id is null instead of silently
    # discarding them.
    for _, match_events in ordered.groupby("match_id", sort=False, dropna=False):
        # Describe every event exactly once; each description is then reused
        # by every window that contains it.
        descriptions = [describe_event(row) for _, row in match_events.iterrows()]

        for start in range(len(descriptions) - n_lances):
            prompt = "\n".join(descriptions[start:start + n_lances])
            sequencias.append((prompt, descriptions[start + n_lances]))

    return sequencias


In [None]:
# Keep only the rows whose country_name is 'Spain'.
# NOTE(review): `df_europe_last5_male` is not defined in any cell visible
# here — confirm it is created earlier, otherwise Restart & Run All fails.
df_extract = df_europe_last5_male.loc[df_europe_last5_male['country_name'].eq('Spain')]

In [None]:
# Event types dropped before the sliding windows are built
excluded_types = ["Starting XI", "Half Start", "Half End"]

# Upper bound on the number of sequences collected
MAX_SEQ = 2000

# Accumulates every (prompt, next_event) pair
all_sequences = []

for _, competition in df_extract.iterrows():
    # Fetch the matches of this competition/season
    matches_df = sb.matches(
        competition_id=competition['competition_id'],
        season_id=competition['season_id'],
    )

    for match_id in matches_df['match_id'].unique():
        # How many more pairs fit under the cap; stop when none do
        remaining = MAX_SEQ - len(all_sequences)
        if remaining <= 0:
            break

        # Fetch this match's events, ordered by period, minute and second
        events_df = (
            sb.events(match_id=match_id)
            .sort_values(by=["period", "minute", "second"])
            .reset_index(drop=True)
        )

        # Drop the event types we are not interested in
        keep_mask = ~events_df["type"].isin(excluded_types)
        df_filtered = events_df[keep_mask]

        # Build the windows (5 events per prompt)
        sequencias = gerar_sequencias_n_lances(df_filtered, n_lances=5)

        # Take only as many pairs as still fit under the cap
        all_sequences.extend(sequencias[:remaining])

    # Leave the outer loop as well once the cap has been reached
    if len(all_sequences) >= MAX_SEQ:
        break


In [None]:
# Number of (prompt, next_event) pairs collected
len(all_sequences)

2000

In [None]:
def split_at_type(sentence):
    """
    Cuts a sentence at the first occurrence of 'type:'.

    Returns a dict with two keys:
      - "previous_part": everything up to and including 'type:'
      - "action_label":  everything after it, with surrounding whitespace
                         and trailing periods removed

    When 'type:' does not occur, "previous_part" is the whole sentence and
    "action_label" is the empty string.
    """
    marker = "type:"

    # partition splits on the first occurrence only; `found` is empty when
    # the marker is absent.
    head, found, tail = sentence.partition(marker)

    if not found:
        return {"previous_part": sentence, "action_label": ""}

    return {
        "previous_part": head + marker,  # marker stays attached to the prefix
        "action_label": tail.strip().rstrip("."),
    }


In [None]:
def convert_pairs_to_dicts(pairs):
    """
    Maps (history, next_event) tuples to training records.

    Each record is a dict with:
      - "history": the history text, a newline, then the part of the next
        event up to and including "type:"
      - "action_label": the part of the next event after "type:"
    """
    def _to_record(history, next_event):
        # Separate the next event into its prompt prefix and its label
        pieces = split_at_type(next_event)
        return {
            "history": "\n".join((history, pieces["previous_part"])),
            "action_label": pieces["action_label"],
        }

    return [_to_record(history, next_event) for history, next_event in pairs]


In [None]:
# Turn every (history, next_event) pair into a {"history", "action_label"} record
converted = convert_pairs_to_dicts(all_sequences)

In [None]:
# Preview the first five records
converted[:5]

[{'history': 'At minute 0:00, Center Forward Antoine Griezmann from Barcelona at coordinates (x=61, y=40) performed an event of type: made a pass (cross).\nAt minute 0:01, Right Defensive Midfield Sergio Busquets i Burgos from Barcelona at coordinates (x=50, y=45) performed an event of type: Ball Receipt*.\nAt minute 0:01, Right Defensive Midfield Sergio Busquets i Burgos from Barcelona at coordinates (x=49, y=44) performed an event of type: made a pass (cross).\nAt minute 0:02, Left Defensive Midfield Frenkie de Jong from Barcelona at coordinates (x=47, y=29) performed an event of type: Ball Receipt*.\nAt minute 0:02, Left Defensive Midfield Frenkie de Jong from Barcelona at coordinates (x=47, y=29) performed an event of type: Carry.\nAt minute 0:05, Left Defensive Midfield Frenkie de Jong from Barcelona at coordinates (x=48, y=29) performed an event of type:',
  'action_label': 'made a pass (cross)'},
 {'history': 'At minute 0:01, Right Defensive Midfield Sergio Busquets i Burgos fro

# Criar Dataset

In [None]:
# --------------------------------------------------
# 1) Start from the list of {"history": ..., "action_label": ...} records.
#    A shallow copy is taken so that shuffling does not reorder `converted`
#    itself; re-running this cell then always starts from the same order.
# --------------------------------------------------

SEED = 42  # fixed seed so the split is identical on every run

my_data = list(converted)

# Shuffle with a private, seeded generator: reproducible, and the global
# `random` state is left untouched.
# NOTE(review): the records come from stride-1 sliding windows, so neighbouring
# records share most of their history; a random split therefore places
# near-duplicate contexts in train and in validation/test. Consider splitting
# by match instead if the test metrics need to be trustworthy.
random.Random(SEED).shuffle(my_data)

# --------------------------------------------------
# 2) Define train/validation percentages (the test split gets the rest)
# --------------------------------------------------
TRAIN_PCT = 0.8
VALID_PCT = 0.1

# 3) Compute indices for slicing
train_end = int(len(my_data) * TRAIN_PCT)
valid_end = int(len(my_data) * (TRAIN_PCT + VALID_PCT))

# 4) Split the data
train_data = my_data[:train_end]
valid_data = my_data[train_end:valid_end]
test_data  = my_data[valid_end:]

# The three slices must partition the data exactly
assert len(train_data) + len(valid_data) + len(test_data) == len(my_data)


In [None]:
# --------------------------------------------------
# 5) Helper that writes a list of dictionaries as JSON Lines
# --------------------------------------------------
def save_as_jsonl(data, filename):
    """Write each item of `data` to `filename` as one JSON document per line.

    The file is created (or overwritten) as UTF-8, and non-ASCII characters
    are written as-is rather than as \\uXXXX escapes.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

In [None]:
# 6) Write each split to its own temporary JSONL file
for split_rows, jsonl_path in (
    (train_data, "train.jsonl"),
    (valid_data, "valid.jsonl"),
    (test_data, "test.jsonl"),
):
    save_as_jsonl(split_rows, jsonl_path)

In [None]:
# --------------------------------------------------
# 7) Read the three JSONL files back as one Hugging Face DatasetDict
# --------------------------------------------------
# Split name -> JSONL file written in the previous step
data_files = dict(
    train="train.jsonl",
    validation="valid.jsonl",
    test="test.jsonl",
)
dataset = load_dataset("json", data_files=data_files)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['history', 'action_label'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['history', 'action_label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['history', 'action_label'],
        num_rows: 200
    })
})


In [None]:
# Inspect the first training example
dataset['train'][0]

{'history': 'At minute 37:25, Left Center Midfield José Ignacio Peleteiro Ramallo from Deportivo Alavés at coordinates (x=85, y=44) performed an event of type: Ball Recovery.\nAt minute 37:25, Left Center Midfield José Ignacio Peleteiro Ramallo from Deportivo Alavés at coordinates (x=85, y=44) performed an event of type: Carry.\nAt minute 37:26, Left Center Back Clément Lenglet from Barcelona at coordinates (x=30, y=37) performed an event of type: Pressure.\nAt minute 37:27, Left Center Midfield José Ignacio Peleteiro Ramallo from Deportivo Alavés at coordinates (x=91, y=41) performed an event of type: Dispossessed.\nAt minute 37:27, Left Center Back Clément Lenglet from Barcelona at coordinates (x=28, y=38) performed an event of type: Duel.\nAt minute 37:27, Left Center Back Clément Lenglet from Barcelona at coordinates (x=28, y=38) performed an event of type:',
 'action_label': 'Carry'}

# Upload Dataset to Hub

In [None]:
# Interactive Hugging Face Hub login (renders a widget in the notebook)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the train/validation/test splits to the Hub under this repository name
dataset.push_to_hub("tcc-mba-v1-dataset-mini")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/muriloms/tcc-mba-v1-dataset-mini/commit/cc07073880d11a99f4c20eaeb390a2a7404aa92a', commit_message='Upload dataset', commit_description='', oid='cc07073880d11a99f4c20eaeb390a2a7404aa92a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/muriloms/tcc-mba-v1-dataset-mini', endpoint='https://huggingface.co', repo_type='dataset', repo_id='muriloms/tcc-mba-v1-dataset-mini'), pr_revision=None, pr_num=None)