<a href="https://colab.research.google.com/github/oraziotorre/MomentumShiftAI/blob/main/TableTennisDataGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import asyncio
import ast
import os
import pandas as pd
import re
import requests
import pandas as pd
import os
from collections import Counter
import numpy as np
from io import StringIO

# **DATA INGESTION**

In [None]:
!pip install playwright
!playwright install
from playwright.async_api import async_playwright, Page

Prendo delle informazioni da un dataset presente su GitHub e faccio scraping sul sito della federazione per aggiungere le sequenze di punteggio ai dati che abbiamo già a disposizione

In [None]:
async def get_console_logs(page: Page, event_id: str, doc_code: str, n_games: int):
    logs = []

    def handle_console_message(msg):
        # Controlla se il log della console è del tipo [.........]
        if re.match(r'^\[.*\]$', msg.text):
            logs.append(f"{msg.text}")

    # Ascolta gli eventi di log della console
    page.on("console", handle_console_message)

    try:
        # Naviga al sito usando gli identificativi forniti
        url = f"https://worldtabletennis.com/PostMatchCenter?eventId={event_id}&docCode={doc_code}"
        print(f"Navigating to: {url}")
        await page.goto(url)

        # Aspetta che la pagina finisca di caricarsi
        await page.wait_for_load_state("networkidle")

        # Interagisce con i bottoni per ottenere i dati dei game G
        for i in range(2, n_games + 1):  # Itera da G2 a Gn(G1 già è stampato)
            game_label = f"G{i}"
            buttons = page.locator("span.tabHeader", has_text=game_label)
            count = await buttons.count()

            if count == 1:
                # Se c'è un solo bottone, cliccalo
                await buttons.nth(0).click()
            elif count > 1:
                # Se ci sono più bottoni, clicca sul secondo
                await buttons.nth(1).click()
            else:
                # Nessun bottone trovato
                print(f"No '{game_label}' buttons found.")

    except Exception as e:
        print(f"Error navigating to {url}: {e}")

    finally:
        # Salva i log in un file o stampa a schermo
        if len(logs) > 2:
            logs_path = f"matches/{event_id}_{doc_code}_console_logs.txt"
            os.makedirs(os.path.dirname(logs_path), exist_ok=True)

            with open(logs_path, "w", encoding="utf-8") as log_file:
                log_file.write(f"{n_games}\n")
                log_file.write("\n".join(logs))

            print(f"Match stats saved to: {logs_path}")



async def process_files(page: Page):
    """Elabora un file TSV da URL, esegue operazioni sui dati e lo sposta nella cartella OK_tournaments."""

    base_path = "data/tournaments"

    ######## RANGE DEI TORNEI PARTE DA X PERCHE' I TORNEI PRECEDENTI SONO STATI GIA' ELABORATI
    for file_number in range(3134,3200):
      file_name = f"{file_number}.tsv"
      file_url = f"https://raw.githubusercontent.com/daimeng/chiquita/master/data/wtt_cleaned/matches/{file_name}"
      file_path = os.path.join(base_path, file_name)
      ok_path = os.path.join("data", "OK_tournaments", file_name)

      # Creazione delle cartelle se non esistono
      os.makedirs(os.path.dirname(file_path), exist_ok=True)
      os.makedirs(os.path.dirname(ok_path), exist_ok=True)

      # Download del file
      response = requests.get(file_url)
      if response.status_code == 200:
          with open(file_path, "w", encoding="utf-8") as f:
              f.write(response.text)

          print(f"Processing file: {file_path}")

          # Lettura del file TSV
          df = pd.read_csv(file_path, sep='\t')

          for _, row in df.iterrows():
              event_id = row.iloc[0]
              doc_code = row.iloc[1]
              n_games = int(row.iloc[12]) + int(row.iloc[13])
              await get_console_logs(page, str(event_id), str(doc_code), n_games)

          # Spostamento del file nella cartella OK
          os.rename(file_path, ok_path)


async def main():
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()

        # Elabora i file TSV
        await process_files(page)

        await browser.close()


if __name__ == "__main__":
    await(main())

Unisco tutte le informazioni acquisite in un unico csv grezzo

In [None]:
# Funzione per validare la correttezza dei punteggi inseriti nel csv
def check_points_error(points_a, points_x):

    """

        Questa funzione tenta di rimuovere il più rumore possibile nei dati,
        con l'obiettivo di individuare i casi in cui i punteggi non sono validi

    """

    len_a = len(points_a)
    len_x = len(points_x)

    # Un set non può avere meno di 11 punti di gioco(minimo 11-0)
    if len_a < 11 or len_x < 11:
        return True

    # Un set deve avere lo stesso numero di istanti per entrambi i set dei giocatori
    if len(points_a) != len(points_x):
        return True

    # Un set deve finire con almeno uno dei due giocatori a 11 punti minimo
    if points_a[-1] < 11 and points_x[-1] < 11:
        return True

    # Un set deve finire con almeno due punti di distacco
    if abs(points_a[-1] - points_x[-1]) < 2:
        return True

    for i in range(1, len_a):

        # Un set non può avere un balzo di più di un punto fra un istante e un altro
        if abs(points_a[i - 1] - points_a[i]) > 1:
            return True

        if abs(points_x[i - 1] - points_x[i]) > 1:
            return True

        # Un set non può avere un punteggio decrescente
        if points_a[i] < points_a[i - 1]:
            return True
        elif points_x[i] < points_x[i - 1]:
            return True

        # Se un giocatore fa un punto in un istante "i" allora l'altro giocatore non farà punto
        if points_a[i] > points_a[i - 1]:
            if points_x[i] > points_x[i - 1]:
                return True

        if points_x[i] > points_x[i - 1]:
            if points_a[i] > points_a[i - 1]:
                return True

        # Se un giocatore non fa un punto in un istante "i" allora l'altro giocatore non può non fare un punto
        if points_a[i] == points_a[i - 1]:
            if points_x[i] == points_x[i - 1]:
                return True

        if points_x[i] == points_x[i - 1]:
            if points_a[i] == points_a[i - 1]:
                return True

    return False


def points_transformer(points_a, index):

    """

        Consideriamo i punti fatti da un giocatore come il numero
        di istanti in cui il giocatore si trova al punteggio i

    """

    frequency = Counter(points_a)
    result = []
    for i, val in enumerate(sorted(frequency.keys())):
        if i >= index:
            break
        result.append(frequency[val])

    return result


def process_match_log(file_path):

    """

        Questa funzione legge i file di .txt generati dallo scraper download_matches.py
        e genera le righe del dataset rimuovendo i dati errati

    """

    with open(file_path, 'r') as file:
        lines = file.readlines()

    num_sets = int(lines[0].strip())
    match_data = []  # Conterrà le righe da mettere nel dataset
    match_state = [0, 0]  # Stato iniziale dei set del match
    match_points_a = 0  # Numero di punti vinti da "a"
    match_points_x = 0  # Numero di punti vinti da "x"

    for i in range(num_sets):
        try:
            # Prende i puteggi del set "i" presenti nel file .txt considerato
            points_a = eval(lines[2 + i * 2].strip()) if lines[2 + i * 2].strip() else []
            points_x = eval(lines[1 + i * 2].strip()) if lines[1 + i * 2].strip() else []

            # Se nei punteggi è presente un errore allora genera una eccezione
            if check_points_error(points_a, points_x):
                raise ValueError("Errore nei punteggi")

            match_points_a += points_a[-1]  # Prendo il numero di punti fatti da a
            match_points_x += points_x[-1]  # Prendo il numero di punti fatti da x

            # Aggiunge al dataset temporaneo i punteggi dei giocatori e lo stato attuale dei set
            match_data.append({
                "points_a": points_a[1:-1] if len(points_a) > 2 else [],    # Non considero il primo e l'ultimo istante dei punteggi
                "points_x": points_x[1:-1] if len(points_x) > 2 else [],    # Perchè il primo istante è sempre 0 mentre l'ultimo ci dice il risultato finale della partita(DATA LEAKAGE!)
                "match_state": f"{match_state[0]}-{match_state[1]}"
            })

            # In base ai punteggi vedo chi ha vinto il set
            if points_a[-1] > points_x[-1]:
                match_state[0] += 1  # A vince il set
            else:
                match_state[1] += 1  # X vince il set

        except Exception as e:
            # In caso di errore, aggiungo dati vuoti per il set corrente e blocco l'analisi della partita incriminata
            match_data.append({
                "points_a": "",
                "points_x": "",
                "match_state": f"{match_state[0]}-{match_state[1]}"
            })
            break


    # Se il match ha punteggi troppo contrastanti allora lo tolgo dal dataset
    # Assumo che uno dei giocatori deve fare almeno 7 punti in tutta la partita per considerare il match come "equilibrato"
    if match_points_a < 7 or match_points_x < 7:
        match_data = []

    return match_data


def process_file(file_path, skip_header):
    """
    Funzione per leggere e processare un singolo file .tsv.
    """
    try:
        # Leggo il CSV contenente i dati su tutte le partite di un torneo tranne i punteggi che dovrò prendere dai file .txt
        data = pd.read_csv(file_path, sep='\t', skiprows=0 if skip_header else 0)

        all_match_data = []  # Conterrà tutte le righe del torneo analizzato

        # Per ciascuna partita del torneo inserisce il all_match_data le righe complete
        for idx, row in data.iterrows():
            event_id = row['event_id']
            doc = row['doc']
            filename_to_search = f"{event_id}_{doc}_console_logs.txt"
            matches_dir = 'matches'
            file_path_in_matches = os.path.join(matches_dir, filename_to_search)

            if os.path.exists(file_path_in_matches):
                print(f"File trovato: {file_path_in_matches}")
                match_data = process_match_log(file_path_in_matches)

                if match_data:
                    sets_to_win = max(row['res_a'], row['res_x'])  # Calcola i set necessari per vincere
                    for set_data in match_data:
                        all_match_data.append({
                            "event_id": event_id,
                            "match_id": doc,
                            "match_format": row['fmt'],
                            "players_gender": row['gender'],
                            "match_stage": row['stage'],
                            "stage_id": row['stage_id'],
                            "match_duration": row['duration'],
                            "match_start_time": row['start'],
                            "player_id": row['a_id'],
                            "player_2_id": row['b_id'],
                            "opponent_id": row['x_id'],
                            "opponent_2_id": row['y_id'],
                            "player_sets_won": row['res_a'],
                            "opponent_sets_won": row['res_x'],
                            "match_scores": row['scores'],
                            "sets_required_to_win": sets_to_win,
                            "current_match_state": set_data["match_state"],
                            "points_progression": set_data["points_a"],
                            "opponent_points": set_data["points_x"],
                        })

        return pd.DataFrame(all_match_data)

    except Exception as e:
        print(f"Errore durante la lettura del file {file_path}: {e}")
        return pd.DataFrame()


def main():
    tournaments_dir = 'data/OK_tournaments'
    output_file = 'raw_dataset.csv'

    if not os.path.exists(tournaments_dir):
        print(f"Errore: La cartella '{tournaments_dir}' non esiste.")
        return

    first_file = True

    for filename in os.listdir(tournaments_dir):
        if filename.endswith('.tsv'):
            file_path = os.path.join(tournaments_dir, filename)
            data = process_file(file_path, not first_file)

            if not data.empty:
                data.to_csv(output_file, mode='a', index=False, header=first_file)
                first_file = False


if __name__ == '__main__':
    main()

File trovato: matches/3069_TTEWDOUBLES-----------FNL-000100----------_console_logs.txt
File trovato: matches/3069_TTEMSINGLES-----------FNL-000100----------_console_logs.txt
File trovato: matches/3069_TTEMDOUBLES-----------FNL-000100----------_console_logs.txt
File trovato: matches/3069_TTEXDOUBLES-----------FNL-000100----------_console_logs.txt
File trovato: matches/3069_TTEWSINGLES-----------FNL-000100----------_console_logs.txt
File trovato: matches/3069_TTEWDOUBLES-----------SFNL000100----------_console_logs.txt
File trovato: matches/3069_TTEMDOUBLES-----------SFNL000100----------_console_logs.txt
File trovato: matches/3069_TTEWSINGLES-----------SFNL000200----------_console_logs.txt
File trovato: matches/3069_TTEWDOUBLES-----------SFNL000200----------_console_logs.txt
File trovato: matches/3069_TTEXDOUBLES-----------SFNL000100----------_console_logs.txt
File trovato: matches/3069_TTEMSINGLES-----------SFNL000200----------_console_logs.txt
File trovato: matches/3069_TTEWSINGLES-----

# **DATA PREPROCESSING**

In [224]:
# Importiamo il dataset ottenuto tramite le operazioni di scraping
dataset = pd.read_csv("/content/raw_dataset_TT .csv")

dataset

  dataset = pd.read_csv("/content/raw_dataset_TT .csv")


Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,opponent_id,opponent_2_id,player_sets_won,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,points_progression,opponent_points
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,"[0, 0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 6, 6, 6, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ..."
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,"[1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ..."
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,"[1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ..."
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,"[1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ..."
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,133850,,3,0,"11-6,11-6,11-7",3,2-0,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8, 9, 9, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ..."
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,0-0,"[0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]"
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,1-0,"[1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 8, 9, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ..."
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,2-0,"[1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ..."


## **Etichettatura**
Aggiungiamo al dataset una nuova colonna, denominata `set_result`, che indicherà il risultato di ciascun set per ogni partita:

- `1` per identificare la classe dei Vincitori del set.
- `2` per identificare la classe degli Sconfitti del set.

Questa etichettatura risulta particolarmente utile per l'addestramento di modelli come LSTM e Regressione Logistica, in cui l'obiettivo è prevedere, con una determinata probabilità, se un giocatore vincerà o perderà il set in base alle altre caratteristiche presenti nel dataset.

In [225]:
# Funzione per convertire i punteggi da stringa a lista di interi
def convert_to_int_list(points_str):
    if isinstance(points_str, str):
        try:
            return [int(x) for x in points_str.strip('[]').split(',')]
        except ValueError:
            return None
    return None


# Funzione per determinare il risultato del set
def calculate_set_result(points_progression, opponent_points):
    if points_progression:
        if points_progression[-1] > opponent_points[-1]:
            return 1  # Vittoria per il giocatore
        else:
            return 2  # Sconfitta per il giocatore
    else:
        return -1  # Dati non conformi


set_results = []

dataset['points_progression'] = dataset['points_progression'].apply(convert_to_int_list)
dataset['opponent_points'] = dataset['opponent_points'].apply(convert_to_int_list)

for index, row in dataset.iterrows():
    result = calculate_set_result(row['points_progression'], row['opponent_points'])
    set_results.append(result)

# Aggiungo la nuova colonna 'set_result' al DataFrame
dataset['SetWinner'] = set_results

dataset




Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,opponent_id,opponent_2_id,player_sets_won,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,points_progression,opponent_points,SetWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,"[0, 0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 6, 6, 6, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ...",2
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,"[1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ...",1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,"[1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ...",1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,"[1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ...",2
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,117294,109777.0,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,133850,,3,0,"11-6,11-6,11-7",3,2-0,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8, 9, 9, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...",1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,0-0,"[0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]",1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,1-0,"[1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 8, 9, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ...",1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,133850,,3,1,"11-4,11-8,7-11,11-7",3,2-0,"[1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ...",2


## **Analisi dei dati**

Esaminiamo la qualità e la correttezza dei dati, valutandone l'affidabilità e identificando eventuali incongruenze o anomalie che potrebbero influenzare l'addestramento.

### **Controllo del bilanciamento delle classi**
Un primo passo nell'analisi del dataset consiste nel verificare se le classi presenti siano bilanciate.

Per farlo, analizziamo la distribuzione delle istanze con valore `1` e `0` nella colonna `set_result`.

In [226]:
# Numero di elementi per la classe "Vincitori"
print(len(dataset[(dataset['SetWinner'] == 1)]))
# Numero di elementi per la classe "Sconfitti"
print(len(dataset[(dataset['SetWinner'] == 0)]))

# Totale
print(len(dataset))


36610
0
74739


Notiamo che abbiamo un dataset quasi perfettamente bilanciato e di conseguenza non sono necessari ulteriori accorgimenti

### **Analisi delle caratteristiche e delle distribuzioni dei dati**
Usiamo `describe()` per avere una overview sulle distribuzioni dei dati numerici

In [227]:
# Verifico la correttezza dei campi numerici
dataset.describe()

Unnamed: 0,SetWinner
count,74739.0
mean,1.407685
std,0.66816
min,-1.0
25%,1.0
50%,1.0
75%,2.0
max,2.0


Notiamo che nella colonna `set_required_to_win`, si riscontrano dei valori rari (`2` e `5`)  che potrebbero non essere rilevanti per il nostro obiettivo di analisi.

Questi valori potrebbero introdurre rumore o distorcere le inferenze, soprattutto perché la loro frequenza è estremamente bassa rispetto agli altri valori.

### **Controllo dei valori nulli**

Verificare la presenza di valori nulli all'interno del dataset.

In [228]:
# Identificare i valori NaN nel dataset
nan_mask = dataset.isna()

# Conta i valori NaN (valori mancanti) per ciascuna colonna del dataset
nan_count = nan_mask.sum()

# Verifico i valori null del raw_dataset
nan_count

Unnamed: 0,0
event_id,0
match_id,0
match_format,0
players_gender,0
match_stage,0
stage_id,0
match_duration,0
match_start_time,0
player_id,0
player_2_id,56494


Le colonne `points_progression` e `opponent_points` contengono **2490** valori nulli, che richiedono un intervento per la gestione.

Le colonne `player_2_id` e `opponent_2_id` presentano invece **54539** valori nulli, ma ciò è normale, in quanto queste colonne riguardano il secondo giocatore di una squadra, quindi sono nulle nelle partite di singolo.


### **Numero di partite**

Verifico il numero di partite considerate nel dataset

In [229]:
dataset[['event_id', 'match_id']].drop_duplicates().shape[0]

19564

## **Operazione sulle feature**


Facciamo alcune considerazioni sulle caratteristiche del nostro dataset e identifichiamo possibili miglioramenti per ottimizzarlo in vista del prossimo training del modello:

- Eliminazione delle righe in cui sono stati trovati valori nulli e indesiderati nelle colonna `points_progression` e `sets_required_to_win` individuati in fase di analisi

- Le colonne `sets_required_to_win` e `current_match_state` verranno sostituite con due nuove colonne: `final_set_a` e `final_set_b` per rendere il dataset più chiaro e organizzato

- Trasformazione dei valori T nella colonna `match_format` in formati che siano o S o D

- Operazioni sulla colonna `points_progression`, fondamentale per il successivo addestramento dei modelli


-  Le colonne `event_id`,`match_id`, `stage_id`, `match_duration`, `players_gender`, `match_start_time`, `opponent_id`, `opponent_2_id`,`player_sets_won`, `opponent_sets_won`, `match_scores`, `opponent_points`,`match_format`, `player_id`, `player_2_id`, `sets_required_to_win`, `current_match_state` non sono necessarie per il training dei modelli e possono essere cancellate

### **Gestione dei valori nulli e indesiderati**
Dall'analisi precedente, abbiamo constatato che la feature `points_progression` manca di alcuni suoi valori. La strategia che abbiamo applicato è l'eliminazione delle righe corrispondenti.

In [230]:
# Rimuoviamo tutte le righe del dataset in cui la colonna 'points_progression' contiene valori NaN
dataset = dataset.dropna(subset=["points_progression"])

# Verifichiamo che non esistano più righe con valori NaN nella colonna 'points_progression'
dataset[dataset['points_progression'].isnull()]

Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,opponent_id,opponent_2_id,player_sets_won,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,points_progression,opponent_points,SetWinner


In base a ciò che abbiamo appreso precedentemente da `describe()`, rimuoviamo i le righe non desiderate con valore della colonna `sets_required_to_win` uguale a `2` o `5`.

Questi valori rari potrebbero rappresentare formati di gioco non standard (es. partite amichevoli o partite abbreviate per motivi eccezionali), non rilevanti per l'analisi


In [231]:
# Rimuoviamo le righe contenenti set con punteggi rari che non vogliamo considerare
dataset = dataset[~dataset['sets_required_to_win'].isin([2, 5])]

dataset.describe()

Unnamed: 0,SetWinner
count,72183.0
mean,1.492817
std,0.499952
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,2.0


###**Colonne *sets_required_to_win* e *current_match_state***

Crea l'attuale stato dei set vinti nella partite

In [232]:
set_a = dataset['current_match_state'].str.split('-', expand=True)[0].astype(int)
set_b = dataset['current_match_state'].str.split('-', expand=True)[1].astype(int)

# Crea una colonna 'final_set_a' che vale 1 se il giocatore è a un set dalla vittoria, altrimenti 0
dataset['Set1'] = set_a

# Crea una colonna 'final_set_b' che vale 1 se l'avversario è a un set dalla vittoria, altrimenti 0
dataset['Set2'] = set_b

# Riposizionamento delle nuove colonne
is_final_index = dataset.columns.get_loc('current_match_state')
final_set_a_column = dataset.pop('Set1')
final_set_b_column = dataset.pop('Set2')
dataset.insert(is_final_index + 1, 'Set1', final_set_a_column)
dataset.insert(is_final_index + 2, 'Set2', final_set_b_column)

dataset


Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,...,player_sets_won,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,Set1,Set2,points_progression,opponent_points,SetWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,0,0,"[0, 0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 6, 6, 6, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ...",2
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,3,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,0,1,"[1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ...",1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,3,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,1,1,"[1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ...",1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,2,1,"[1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ...",2
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,3,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,2,2,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,...,3,0,"11-6,11-6,11-7",3,2-0,2,0,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8, 9, 9, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...",1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,3,1,"11-4,11-8,7-11,11-7",3,0-0,0,0,"[0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]",1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,3,1,"11-4,11-8,7-11,11-7",3,1-0,1,0,"[1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 8, 9, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ...",1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,3,1,"11-4,11-8,7-11,11-7",3,2-0,2,0,"[1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ...",2


### **Colonna *player_sets_won e opponent_sets_won***

Derivo il vincitore della partita dalle due colonne

In [233]:
dataset['MatchWinner'] = np.where(dataset['player_sets_won'] > dataset['opponent_sets_won'], 1, 2)

dataset

Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,...,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,Set1,Set2,points_progression,opponent_points,SetWinner,MatchWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,0,0,"[0, 0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 6, 6, 6, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ...",2,1
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,0,1,"[1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ...",1,1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,1,1,"[1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ...",1,1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,2,1,"[1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ...",2,1
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,2,2,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,...,0,"11-6,11-6,11-7",3,2-0,2,0,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8, 9, 9, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...",1,1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,0-0,0,0,"[0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]",1,1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,1-0,1,0,"[1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 8, 9, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ...",1,1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,2-0,2,0,"[1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ...",2,1


### **Colonna *match_format***

Modifichiamo la colonna `match_format`, sostituendo tutti i valori `T` (partita di team) con `S` (partita di singolo) o `D` (partita di doppio)

Viene eseguita questa operazione per **evitare la succesiva rimozione di istanze** che potrebbero ritornarci utili nello sviluppo successivo dei nostri modelli.

In [234]:
# Funzione per determinare il match_format quando è Team (Singolo o Doppio)
def determine_match_format(player_2_id):
    if pd.isna(player_2_id):
        return 'S'  # Ritorna 'S' per indicare una partita singola
    else:
        return 'D'  # Ritorna 'D' per indicare una partita doppia


dataset['match_format'] = dataset['player_2_id'].apply(determine_match_format)

dataset

Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,...,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,Set1,Set2,points_progression,opponent_points,SetWinner,MatchWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,0,0,"[0, 0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 6, 6, 6, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ...",2,1
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,0,1,"[1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ...",1,1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,1,1,"[1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ...",1,1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,2,1,"[1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ...",2,1
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,2,2,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,...,0,"11-6,11-6,11-7",3,2-0,2,0,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8, 9, 9, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...",1,1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,0-0,0,0,"[0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]",1,1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,1-0,1,0,"[1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 8, 9, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ...",1,1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,2-0,2,0,"[1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ...",2,1


### **Colonna *points_progression***

#### Trasformazione del formato
Ogni istanza della colonna viene convertita in un vettore binario, dove ogni elemento rappresenta un punto nel set. Il valore del vettore sarà 1 se il giocatore ha fatto un punto in quel particolare istante del set, e 2 se il punto l'ha fatto l'avversario. Questo formato rende il vettore della colonna `points_progression` più maneggevole e adatto all'addestramento dei modelli.

In [235]:
def transform_points_progression(points):
    transformed = []

    for i in range(len(points)):
        points[i] = int(points[i])

        # Per il primo punteggio, aggiungiamo 1 se il punteggio è maggiore di 0, altrimenti 2
        if i == 0:
            transformed.append(1 if points[i] > 0 else 2)
        else:
            transformed.append(1 if points[i] > points[i - 1] else 2)

    return transformed

dataset['points_progression'] = dataset['points_progression'].apply(transform_points_progression)

dataset

Unnamed: 0,event_id,match_id,match_format,players_gender,match_stage,stage_id,match_duration,match_start_time,player_id,player_2_id,...,opponent_sets_won,match_scores,sets_required_to_win,current_match_state,Set1,Set2,points_progression,opponent_points,SetWinner,MatchWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-0,0,0,"[2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, ...","[1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, ...",2,1
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,0-1,0,1,"[1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, ...","[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, ...",1,1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,1-1,1,1,"[1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, ...","[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, ...",1,1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-1,2,1,"[1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, ...","[0, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 9, ...",2,1
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,W,FNL,100,1990,2021-11-15 17:32:48,112463,115009.0,...,2,"8-11,11-8,11-9,7-11,11-8",3,2-2,2,2,"[1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, ...","[0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,M,GP04,600,1136,2025-04-09 04:20:00,123962,,...,0,"11-6,11-6,11-7",3,2-0,2,0,"[2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 6, ...",1,1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,0-0,0,0,"[2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1]","[1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4]",1,1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,1-0,1,0,"[1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, ...","[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 6, 6, ...",1,1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,M,GP04,100,1554,2025-04-08 04:20:00,133652,,...,1,"11-4,11-8,7-11,11-7",3,2-0,2,0,"[1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, ...","[0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, ...",2,1


### **Eliminazione delle colonne non adatte all'addestramento**

Vantaggi di questa operazione:
- **Eliminando colonne non necessarie**, il dataset diventa più semplice e focalizzato sui dati utili.
- Rimuovendo informazioni irrilevanti, l'addestramento risulta più chiaro e veloce, **evitando di appesantire** il modello con dati superflui.

In [236]:
# Definiamo le colonne da eliminare
columns_to_drop = ['stage_id', 'match_duration','match_stage',
                   'players_gender','match_start_time','opponent_2_id',
                  'player_sets_won', 'opponent_sets_won', 'match_scores',
                  'opponent_points', 'player_2_id',
                    'current_match_state']

# Rimuovo le colonne che non mi interessano
dataset = dataset.drop(columns=columns_to_drop, axis=1)

dataset

Unnamed: 0,event_id,match_id,match_format,player_id,opponent_id,sets_required_to_win,Set1,Set2,points_progression,SetWinner,MatchWinner
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,"[2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, ...",2,1
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,1,"[1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, ...",1,1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,1,1,"[1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, ...",1,1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,2,1,"[1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, ...",2,1
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,2,2,"[1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,123962,133850,3,2,0,"[2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, ...",1,1
74735,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,0,0,"[2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1]",1,1
74736,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,1,0,"[1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, ...",1,1
74737,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,0,"[1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, ...",2,1


# **DATA PREPROCESSING 2**

Creo un match_id utile per i successivi calcoli

In [237]:
dataset['event_match_id'] = dataset.groupby(['event_id', 'match_id'], sort=False).ngroup() + 1
dataset

Unnamed: 0,event_id,match_id,match_format,player_id,opponent_id,sets_required_to_win,Set1,Set2,points_progression,SetWinner,MatchWinner,event_match_id
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,"[2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, ...",2,1,1
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,1,"[1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, ...",1,1,1
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,1,1,"[1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, ...",1,1,1
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,2,1,"[1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, ...",2,1,1
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,2,2,"[1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, ...",1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
74734,3088,TTEMSINGLES-----------GP04000600----------,S,123962,133850,3,2,0,"[2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, ...",1,1,19560
74735,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,0,0,"[2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1]",1,1,19561
74736,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,1,0,"[1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, ...",1,1,19561
74737,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,0,"[1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, ...",2,1,19561


Rimuove le partita con set incompleti
!!! ATTENZIONE: SI PUO' EVITARE SE VOGLIARE FARE PREVISIONI SOLO SUI SET E NON SULLE PARTITE

In [238]:
def is_valid_group(group):
    required = int(group['sets_required_to_win'].iloc[0])
    condition_1 = ((group['SetWinner'] == 1) & (group['Set1'] == required - 1)).any()
    condition_2 = ((group['SetWinner'] == 2) & (group['Set2'] == required - 1)).any()
    return condition_1 or condition_2

# Trova tutti i match_id validi
valid_match_ids = dataset.groupby('event_match_id').filter(is_valid_group)['event_match_id'].unique()

# Filtra il dataframe mantenendo solo i match_id validi
dataset = dataset[dataset['event_match_id'].isin(valid_match_ids)]

num_match_id = dataset['event_match_id'].nunique()
print(f"Numero di match: {num_match_id}")

Numero di match: 17008


Vengono Rimosse circa 2000 partite per punteggi incorretti
circa 1500 3 su 5 e circa 500 4 su 7

Effettuo la decomposizione dei Set e creo nuove feature

In [239]:
dataset['points_progression'] = dataset.apply(
    lambda row: row['points_progression'] + [row['SetWinner']] if isinstance(row['points_progression'], list) else [row['SetWinner']],
    axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['points_progression'] = dataset.apply(


In [241]:
dataset['original_index'] = dataset.index

dataset_expanded = dataset.explode('points_progression').reset_index(drop=True)

dataset_expanded['PointWinner'] = dataset_expanded['points_progression']

# PtSet = posizione del punto nella lista originaria (1-based)
dataset_expanded['PtSet'] = dataset_expanded.groupby('original_index').cumcount() + 1

# Pt = contatore progressivo che si azzera con ogni event_match_id
dataset_expanded['Pt'] = dataset_expanded.groupby('event_match_id').cumcount() + 1

def cumulative_1_2_prior(subdataset):
    count_1 = (subdataset['PointWinner'] == 1).cumsum().shift(fill_value=0)
    count_2 = (subdataset['PointWinner'] == 2).cumsum().shift(fill_value=0)
    return count_1.astype(str) + '-' + count_2.astype(str)

dataset_expanded['Pts'] = dataset_expanded.groupby('original_index', group_keys=False).apply(cumulative_1_2_prior)

dataset_expanded['is_last_in_set'] = (
    dataset_expanded.groupby(['event_match_id', 'original_index']).cumcount(ascending=False) == 0
)

# 2. Prima colonna: ultima riga prima del nuovo set e PointWinner == 1
dataset_expanded['WonSetP1'] = ((dataset_expanded['is_last_in_set']) &
                                 (dataset_expanded['PointWinner'] == 1)).astype(int)

# 3. Seconda colonna: ultima riga prima del nuovo set e PointWinner == 2
dataset_expanded['WonSetP2'] = ((dataset_expanded['is_last_in_set']) &
                                 (dataset_expanded['PointWinner'] == 2)).astype(int)

# 4. Terza colonna: Pts del tipo x-y con x >= 10, y < x, PointWinner == 2
dataset_expanded['LostSetP1'] = dataset_expanded['Pts'].str.extract(r'(\d+)-(\d+)').astype(int).apply(
    lambda row: int(row[0] >= 10 and row[1] < row[0]), axis=1)
dataset_expanded['LostSetP1'] &= (dataset_expanded['PointWinner'] == 2)

# 5. Quarta colonna: Pts del tipo x-y con y >= 10, x < y, PointWinner == 1
dataset_expanded['LostSetP2'] = dataset_expanded['Pts'].str.extract(r'(\d+)-(\d+)').astype(int).apply(
    lambda row: int(row[1] >= 10 and row[0] < row[1]), axis=1)
dataset_expanded['LostSetP2'] &= (dataset_expanded['PointWinner'] == 1)

# Converti i booleani in interi
dataset_expanded['LostSetP1'] = dataset_expanded['LostSetP1'].astype(int)
dataset_expanded['LostSetP2'] = dataset_expanded['LostSetP2'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['original_index'] = dataset.index
  dataset_expanded['Pts'] = dataset_expanded.groupby('original_index', group_keys=False).apply(cumulative_1_2_prior)


In [243]:
# Definiamo le colonne da eliminare
columns_to_drop = ['is_last_in_set', 'original_index']

dataset_expanded = dataset_expanded.drop(columns=columns_to_drop, axis=1)

In [242]:
def normalize_deuce(value):
    try:
        x_str, y_str = value.split('-')
        x, y = int(x_str), int(y_str)

        if x >= 10 and y >= 10:
            if x == y:
                return '10-10'
            elif x < y:
                return '10-AD'
            else:  # x > y
                return 'AD-10'
        else:
            return value
    except:
        return value  # In caso di errore (formato imprevisto), restituiamo il valore originale

dataset_expanded['Pts'] = dataset_expanded['Pts'].apply(normalize_deuce)
dataset_expanded

Unnamed: 0,event_id,match_id,match_format,player_id,opponent_id,sets_required_to_win,Set1,Set2,points_progression,SetWinner,...,original_index,PointWinner,PtSet,Pt,Pts,is_last_in_set,WonSetP1,WonSetP2,LostSetP1,LostSetP2
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,...,0,2,1,1,0-0,False,0,0,0,0
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,...,0,2,2,2,0-1,False,0,0,0,0
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,1,2,...,0,1,3,3,0-2,False,0,0,0,0
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,1,2,...,0,1,4,4,1-2,False,0,0,0,0
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,...,0,2,5,5,2-2,False,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222644,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,...,74738,1,14,66,8-5,False,0,0,0,0
1222645,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,...,74738,2,15,67,9-5,False,0,0,0,0
1222646,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,...,74738,2,16,68,9-6,False,0,0,0,0
1222647,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,...,74738,1,17,69,9-7,False,0,0,0,0




In [244]:
dataset = dataset_expanded
dataset

Unnamed: 0,event_id,match_id,match_format,player_id,opponent_id,sets_required_to_win,Set1,Set2,points_progression,SetWinner,MatchWinner,event_match_id,PointWinner,PtSet,Pt,Pts,WonSetP1,WonSetP2,LostSetP1,LostSetP2
0,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,1,1,2,1,1,0-0,0,0,0,0
1,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,1,1,2,2,2,0-1,0,0,0,0
2,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,1,2,1,1,1,3,3,0-2,0,0,0,0
3,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,1,2,1,1,1,4,4,1-2,0,0,0,0
4,2234,TTEWDOUBLES-----------FNL-000100----------,D,112463,117294,3,0,0,2,2,1,1,2,5,5,2-2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222644,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,1,19561,1,14,66,8-5,0,0,0,0
1222645,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,1,19561,2,15,67,9-5,0,0,0,0
1222646,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,1,19561,2,16,68,9-6,0,0,0,0
1222647,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,1,19561,1,17,69,9-7,0,0,0,0




# **SALVATAGGIO**


### **Operazione su *single_matches_dataset***

Attraverso l'analisi della colonna `match_format` generiamo un nuovo dataset contente solo le partite di singolo.

Questo dataset potrà essere successivamente utile nel testing dei modelli.

In [245]:
# Rimuoviamo dal dataset le partite di doppio
rows_to_remove = dataset[dataset['match_format'] == 'D'].index

dataset_singles = dataset.drop(index=rows_to_remove)

dataset_singles

Unnamed: 0,event_id,match_id,match_format,player_id,opponent_id,sets_required_to_win,Set1,Set2,points_progression,SetWinner,MatchWinner,event_match_id,PointWinner,PtSet,Pt,Pts,WonSetP1,WonSetP2,LostSetP1,LostSetP2
95,2234,TTEMTEAM--------------FNL-00010001--------,S,111683,113419,3,0,0,2,1,1,3,2,1,1,0-0,0,0,0,0
96,2234,TTEMTEAM--------------FNL-00010001--------,S,111683,113419,3,0,0,1,1,1,3,1,2,2,0-1,0,0,0,0
97,2234,TTEMTEAM--------------FNL-00010001--------,S,111683,113419,3,0,0,1,1,1,3,1,3,3,1-1,0,0,0,0
98,2234,TTEMTEAM--------------FNL-00010001--------,S,111683,113419,3,0,0,2,1,1,3,2,4,4,2-1,0,0,0,0
99,2234,TTEMTEAM--------------FNL-00010001--------,S,111683,113419,3,0,0,2,1,1,3,2,5,5,2-2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222644,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,1,19561,1,14,66,8-5,0,0,0,0
1222645,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,1,19561,2,15,67,9-5,0,0,0,0
1222646,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,2,1,1,19561,2,16,68,9-6,0,0,0,0
1222647,3088,TTEMSINGLES-----------GP04000100----------,S,133652,133850,3,2,1,1,1,1,19561,1,17,69,9-7,0,0,0,0


In [246]:
# Definiamo le colonne da eliminare
columns_to_drop = ['event_id', 'match_id','match_format','sets_required_to_win']

# Rimuovo le colonne che non mi interessano
dataset = dataset.drop(columns=columns_to_drop, axis=1)
dataset_singles = dataset_singles.drop(columns=columns_to_drop, axis=1)

dataset_singles

Unnamed: 0,player_id,opponent_id,Set1,Set2,points_progression,SetWinner,MatchWinner,event_match_id,PointWinner,PtSet,Pt,Pts,WonSetP1,WonSetP2,LostSetP1,LostSetP2
95,111683,113419,0,0,2,1,1,3,2,1,1,0-0,0,0,0,0
96,111683,113419,0,0,1,1,1,3,1,2,2,0-1,0,0,0,0
97,111683,113419,0,0,1,1,1,3,1,3,3,1-1,0,0,0,0
98,111683,113419,0,0,2,1,1,3,2,4,4,2-1,0,0,0,0
99,111683,113419,0,0,2,1,1,3,2,5,5,2-2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222644,133652,133850,2,1,1,1,1,19561,1,14,66,8-5,0,0,0,0
1222645,133652,133850,2,1,2,1,1,19561,2,15,67,9-5,0,0,0,0
1222646,133652,133850,2,1,2,1,1,19561,2,16,68,9-6,0,0,0,0
1222647,133652,133850,2,1,1,1,1,19561,1,17,69,9-7,0,0,0,0


### **Salvataggio dei nuovi dataset**

In [247]:
# Salviamo il dataset pulito dai valori indesiderati
dataset.to_csv("TT.csv", index=False)

In [248]:
# Salviamo il dataset finale delle partite di singolo
dataset_singles.to_csv("singles_TT.csv", index=False)

MEMO: RICORDATI DI RICONSIDERARE LE SEQUENZE DI PUNTEGGIO CHE HAI TOLTO come i vantaggi E DI GENERARE LE ISTANZE INVERTITE DEL DATASET però tenendo presento il dataleakage in fase di testing, quindi una sequenza nel testing non può essere già vista precedentemente