<a href="https://colab.research.google.com/github/oraziotorre/MomentumShiftAI/blob/main/ModelsDevelopment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [88]:
# Load the dataset produced by the Data Preprocessing step
dataset = pd.read_csv("tennis_2010-now.csv")

In [89]:
# Split 'Pts' (e.g. "15-30", "AD-40", "7-6") into one raw score token per player.
# n=1 keeps the result at two columns even if a malformed value holds extra dashes.
dataset[['Pt1_raw', 'Pt2_raw']] = dataset['Pts'].astype(str).str.split('-', n=1, expand=True)

# Game counters as numbers (unparseable values become NaN)
dataset['Gm1'] = pd.to_numeric(dataset['Gm1'], errors='coerce')
dataset['Gm2'] = pd.to_numeric(dataset['Gm2'], errors='coerce')

# Tiebreak mask: ONLY when the games stand at 6-6
# NOTE(review): a regular game played at 6-6 (advantage set) is also flagged here,
# so its '15'/'30'/'40' tokens end up as NaN - confirm whether the data contains such sets.
is_tiebreak = (dataset['Gm1'] == 6) & (dataset['Gm2'] == 6)

normal_score_map = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}
tiebreak_score_map = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, 'AD': 7}

pt1_normal = dataset['Pt1_raw'].map(normal_score_map)
pt2_normal = dataset['Pt2_raw'].map(normal_score_map)

# Tiebreak points are plain counters, but the lookup table stops at 6, so scores
# such as 7-6 or 10-9 used to become NaN (and the ">= 6" deuce rule below could
# only ever match 6-6). Counters missing from the table are parsed numerically.
# Tokens of the regular-game vocabulary ('15', '30', '40') are deliberately NOT
# parsed as counters, so every value the table already produced is unchanged.
regular_tokens = list(normal_score_map)
pt1_counter = pd.to_numeric(dataset['Pt1_raw'], errors='coerce').where(~dataset['Pt1_raw'].isin(regular_tokens))
pt2_counter = pd.to_numeric(dataset['Pt2_raw'], errors='coerce').where(~dataset['Pt2_raw'].isin(regular_tokens))

pt1_tb = dataset['Pt1_raw'].map(tiebreak_score_map).fillna(pt1_counter)
pt2_tb = dataset['Pt2_raw'].map(tiebreak_score_map).fillna(pt2_counter)

dataset['Pt1'] = np.where(is_tiebreak, pt1_tb, pt1_normal)
dataset['Pt2'] = np.where(is_tiebreak, pt2_tb, pt2_normal)

# IsDeuce for a regular game
normal_deuce = (
    (~is_tiebreak) &
    (
        ((dataset['Pt1'] == 3) & (dataset['Pt2'] == 3)) |   # 40-40
        ((dataset['Pt1'] == 3) & (dataset['Pt2'] == 4)) |   # 40-ADV
        ((dataset['Pt1'] == 4) & (dataset['Pt2'] == 3))     # ADV-40
    )
)

# IsDeuce for a tiebreak: both players on 6 or more and at most one point apart
tiebreak_deuce = (
    (is_tiebreak) &
    (dataset['Pt1'] >= 6) &
    (dataset['Pt2'] >= 6) &
    ((dataset['Pt1'] - dataset['Pt2']).abs() <= 1)
)

# Combine the two cases into a single 0/1 flag
dataset['IsDeuce'] = (normal_deuce | tiebreak_deuce).astype(int)

# The raw tokens were only needed for the mapping above
dataset = dataset.drop(columns=['Pt1_raw', 'Pt2_raw'])

In [3]:
# Drop 'PointType' and the raw 'Pts' string (already parsed into Pt1/Pt2 above).
# Player1, Player2, SetID, SetWinner and PtSet were considered for removal but are
# kept: SetID, PtSet and SetWinner are used later when the sequences are built.
dataset = dataset.drop(columns=['PointType','Pts'])
dataset

Unnamed: 0.1,Unnamed: 0,match_id,Pt,Set1,Set2,Gm1,Gm2,Pts,Player1,Player2,...,serve_ace_1,serve_ace_2,serve_miss2_1,serve_miss1_2,rally_winner_1,rally_winner_2,rally_forced2_1,rally_forced1_2,rally_unforced1_2,rally_unforced2_1
0,0,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,1,0,0,0,0,0-0,106329,100644,...,0,0,0,0,0,0,0,0,0,1
1,1,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,2,0,0,0,0,0-15,106329,100644,...,0,0,0,0,0,0,0,1,0,0
2,2,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,3,0,0,0,0,0-30,106329,100644,...,1,0,0,0,0,0,0,0,0,0
3,3,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,4,0,0,0,0,15-30,106329,100644,...,0,0,0,0,0,0,0,0,0,1
4,4,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,5,0,0,0,0,15-40,106329,100644,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710163,731734,20100108-M-Doha-SF-Roger_Federer-Nikolay_Davyd...,119,0,1,3,5,AD-40,103819,103786,...,0,0,0,0,0,0,1,0,0,0
710164,731735,20100108-M-Doha-SF-Roger_Federer-Nikolay_Davyd...,120,0,1,4,5,0-0,103819,103786,...,0,0,0,0,0,0,0,1,0,0
710165,731736,20100108-M-Doha-SF-Roger_Federer-Nikolay_Davyd...,121,0,1,4,5,0-15,103819,103786,...,0,0,0,0,0,0,0,0,0,1
710166,731737,20100108-M-Doha-SF-Roger_Federer-Nikolay_Davyd...,122,0,1,4,5,0-30,103819,103786,...,0,0,0,0,0,1,0,0,0,0


# **Train-Test split**

In [4]:
# Generazione delle partite inverse per aumentare la dimensione del dataset

def augment_with_symmetric(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` followed by a mirrored copy in which the two players trade places.

    In the mirrored rows every player-1 column exchanges its values with the
    matching player-2 column, the 1/2-coded columns (server and winners) have
    1 and 2 exchanged, and 'match_id' / 'SetID' (when present) are cast to
    string and receive the '_simm' suffix so the mirrored rows form separate
    groups. The input frame is not modified.

    Args:
        df: point-level frame containing every column listed below.

    Returns:
        A new frame with the original rows first and the mirrored rows after
        them, re-indexed from 0.
    """
    # Column pairs that trade values, built in the same order as they are swapped
    numbered_pairs = [(f'{stem}1', f'{stem}2')
                      for stem in ('Set', 'Gm', 'Pt', 'Player', 'Ranking')]
    outcome_pairs = [(f'p1_{event}', f'p2_{event}')
                     for event in ('win_nobreak_point', 'win_break_point',
                                   'lost_nobreak_point', 'lost_break_point')]
    stat_pairs = [
        ('serve_ace_1', 'serve_ace_2'),
        ('serve_miss2_1', 'serve_miss1_2'),
        ('rally_winner_1', 'rally_winner_2'),
        ('rally_forced2_1', 'rally_forced1_2'),
        ('rally_unforced1_2', 'rally_unforced2_1'),
    ]

    mirrored = df.copy()
    for left, right in numbered_pairs + outcome_pairs + stat_pairs:
        # Values are always read from the untouched input, never from the copy
        mirrored[[left, right]] = df[[right, left]].values

    # Columns holding a player number: 1 <-> 2
    flip_player = {1: 2, 2: 1}
    for label in ('Svr', 'PtWinner', 'SetWinner', 'MatchWinner'):
        mirrored[label] = df[label].replace(flip_player)

    # Tag the identifiers of the mirrored rows, if those columns exist
    for id_col in ('match_id', 'SetID'):
        if id_col not in mirrored.columns:
            continue
        mirrored[id_col] = mirrored[id_col].astype(str) + '_simm'

    return pd.concat([df, mirrored], ignore_index=True)

In [97]:
# 1️⃣ Scaling e Normalizzazione

def preprocess_data(train_data, test_data):
    """Scale the score/ranking columns and turn the 1/2-coded columns into 0/1.

    Both scalers are fitted on the training frame only and then applied to the
    test frame. The inputs are copied, so the caller's frames are not modified.

    Args:
        train_data: point-level training frame. Must contain 'Set1', 'Set2',
            'Gm1', 'Gm2', 'Pt1', 'Pt2', 'Ranking1', 'Ranking2', 'IsDeuce',
            'Svr', 'PtWinner', 'SetWinner' and 'MatchWinner'.
        test_data: point-level test frame with the same columns.

    Returns:
        (train_data, test_data, feature_cols): the processed frames and the
        list of column names to feed to the model. Rows whose 'Svr',
        'PtWinner', 'SetWinner' or 'MatchWinner' is not 1 or 2 are removed.
    """
    # Work on copies so the caller's frames stay untouched
    train_data = train_data.copy()
    test_data = test_data.copy()

    # === 1. Column groups ===
    minmax_cols = ['Set1', 'Set2', 'Gm1', 'Gm2', 'Pt1', 'Pt2']
    standard_cols = ['Ranking1', 'Ranking2']
    others = ['Svr', 'PtWinner', 'IsDeuce']

    # === 2. Min-max normalisation of the score counters ===
    minmax_scaler = MinMaxScaler()
    train_data[minmax_cols] = minmax_scaler.fit_transform(train_data[minmax_cols])
    test_data[minmax_cols] = minmax_scaler.transform(test_data[minmax_cols])

    # === 3. Standardisation of the rankings ===
    # The rankings are scaled in the returned frames but are currently not part
    # of feature_cols.
    standard_scaler = StandardScaler()
    train_data[standard_cols] = standard_scaler.fit_transform(train_data[standard_cols])
    test_data[standard_cols] = standard_scaler.transform(test_data[standard_cols])

    feature_cols = minmax_cols + others

    # === 4. Labels: keep rows coded 1/2 in every label column, then map to 0/1 ===
    # (rows are filtered after the scalers were fitted, as before)
    target_cols = ['Svr', 'PtWinner', 'SetWinner', 'MatchWinner']
    # .copy() gives independent frames, so the assignments below do not write
    # into a slice of the previous frame (SettingWithCopyWarning).
    train_data = train_data[train_data[target_cols].isin([1, 2]).all(axis=1)].copy()
    test_data = test_data[test_data[target_cols].isin([1, 2]).all(axis=1)].copy()
    for col in target_cols:
        train_data[col] = train_data[col].astype(int) - 1
        test_data[col] = test_data[col].astype(int) - 1

    # === 5. Final conversion of the model features to float ===
    # This step used to assign the columns to themselves, i.e. it did nothing.
    float_features = {col: float for col in feature_cols}
    train_data = train_data.astype(float_features)
    test_data = test_data.astype(float_features)

    return train_data, test_data, feature_cols

In [119]:
# 2️⃣ Creazione sequenze

def create_sequences(data, feature_cols, target_col='SetWinner'):
    """Turn point-level rows into one padded feature sequence per 'SetID'.

    Rows are ordered by ('SetID', 'PtSet') and grouped by 'SetID'. Each group
    becomes a float32 tensor of its `feature_cols`; its label is the
    `target_col` value of the group's first row.

    Args:
        data: frame holding 'SetID', 'PtSet', `feature_cols` and `target_col`.
        feature_cols: names of the columns used as model inputs.
        target_col: name of the label column.

    Returns:
        (X_padded, y_tensor, lengths): zero-padded inputs of shape
        (groups, longest group, features), int64 labels, and the int64
        un-padded length of every group.

    Raises:
        ValueError: if no sequence could be built (for instance because
            `target_col` is not a column of `data`).
    """
    ordered = data.sort_values(['SetID', 'PtSet']).reset_index(drop=True)

    # One (features, label, length) triple per group that carries the target
    per_group = [
        (
            torch.tensor(chunk[feature_cols].to_numpy(dtype=np.float32), dtype=torch.float32),
            int(chunk[target_col].iloc[0]),   # label read from the first point of the group
            len(chunk),
        )
        for _, chunk in ordered.groupby('SetID')
        if target_col in chunk.columns
    ]

    if len(per_group) == 0:
        raise ValueError("Nessuna sequenza valida trovata nei dati.")

    tensors, labels, sizes = zip(*per_group)

    # Right-pad every sequence with zeros up to the longest one
    X_padded = pad_sequence(list(tensors), batch_first=True)
    y_tensor = torch.tensor(labels, dtype=torch.long)
    lengths = torch.tensor(sizes, dtype=torch.long)

    return X_padded, y_tensor, lengths

In [120]:
# 3️⃣ Pipeline completa + DataLoader

def prepare_datasets(train_data, test_data, target_col='SetWinner', batch_size=32):
    """Run preprocessing and sequence building, and wrap both splits in DataLoaders.

    Args:
        train_data: raw training frame, passed to `preprocess_data`.
        test_data: raw test frame, passed to `preprocess_data`.
        target_col: label column handed to `create_sequences`.
        batch_size: batch size of both loaders.

    Returns:
        (train_loader, test_loader, feature_cols). Every batch is a tuple
        (inputs, targets, lengths) ordered by decreasing length. The training
        loader shuffles, the test loader does not.
    """
    def collate_fn(batch):
        """Stack a list of samples and order the batch from longest to shortest sequence."""
        inputs, targets, lengths = (torch.stack(part) for part in zip(*batch))
        longest_first = torch.argsort(lengths, descending=True)
        return inputs[longest_first], targets[longest_first], lengths[longest_first]

    # Scaling / label encoding of both splits
    train_proc, test_proc, feature_cols = preprocess_data(train_data, test_data)

    # Same steps for each split; only the shuffle flag differs
    loaders = []
    for frame, shuffle in ((train_proc, True), (test_proc, False)):
        padded, labels, seq_lengths = create_sequences(frame, feature_cols, target_col)
        split_dataset = TensorDataset(padded, labels, seq_lengths)
        loaders.append(
            DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
        )
    train_loader, test_loader = loaders

    return train_loader, test_loader, feature_cols

In [121]:
# Collect the unique match identifiers
match_ids = dataset['match_id'].unique()

# Split the match IDs (not the individual points) 80% train / 20% test,
# so that all points of a match end up in the same split
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)

# Select the rows of each split by match_id
train_data = dataset[dataset['match_id'].isin(train_ids)].reset_index(drop=True)
test_data = dataset[dataset['match_id'].isin(test_ids)].reset_index(drop=True)

# Optional augmentation with mirrored matches (currently disabled)
#train_data = augment_with_symmetric(train_data)
#test_data = augment_with_symmetric(test_data)

# Build the DataLoaders; the last line displays the feature columns in use
train_loader, test_loader, feature_cols = prepare_datasets(train_data, test_data, target_col='SetWinner', batch_size=32)
feature_cols

['Set1', 'Set2', 'Gm1', 'Gm2', 'Pt1', 'Pt2', 'Svr', 'PtWinner', 'IsDeuce']

In [123]:
# Pull the first batch from the training DataLoader
batch = next(iter(train_loader))

# The collate function yields (inputs, targets, lengths), in that order.
# The previous names (input, mask, label) mislabelled the target as "Mask" and
# the sequence length as "Label", and shadowed the builtin `input`.
inputs, targets, lengths = batch

print("Input[0][0]:", inputs[0][0])   # first point of the first sequence
print("Target[0]:", targets[0])       # label of the first sequence
print("Length[0]:", lengths[0])       # un-padded length of the first sequence

Input[0]: tensor([0.5000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000])
Mask[0]: tensor(1)
Label[0]: tensor(96)


In [96]:
# Inspect the training split
train_data

Unnamed: 0.1,Unnamed: 0,match_id,Pt,Set1,Set2,Gm1,Gm2,Pts,Player1,Player2,...,serve_miss1_2,rally_winner_1,rally_winner_2,rally_forced2_1,rally_forced1_2,rally_unforced1_2,rally_unforced2_1,Pt1,Pt2,IsDeuce
0,0,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,1,0,0,0,0,0-0,106329,100644,...,0,0,0,0,0,0,1,0.0,0.0,0
1,1,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,2,0,0,0,0,0-15,106329,100644,...,0,0,0,0,1,0,0,0.0,1.0,0
2,2,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,3,0,0,0,0,0-30,106329,100644,...,0,0,0,0,0,0,0,0.0,2.0,0
3,3,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,4,0,0,0,0,15-30,106329,100644,...,0,0,0,0,0,0,1,1.0,2.0,0
4,4,20241229-M-United_Cup-RR-Thiago_Monteiro-Alexa...,5,0,0,0,0,15-40,106329,100644,...,0,0,0,0,0,1,0,1.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568617,731184,20100124-M-Australian_Open-R16-Andy_Roddick-Fe...,295,2,2,5,2,40-AD,104053,103602,...,0,0,0,0,0,1,0,3.0,4.0,1
568618,731185,20100124-M-Australian_Open-R16-Andy_Roddick-Fe...,296,2,2,5,2,40-40,104053,103602,...,0,0,0,0,0,1,0,3.0,3.0,1
568619,731186,20100124-M-Australian_Open-R16-Andy_Roddick-Fe...,297,2,2,5,2,AD-40,104053,103602,...,0,0,0,0,0,0,1,4.0,3.0,1
568620,731187,20100124-M-Australian_Open-R16-Andy_Roddick-Fe...,298,2,2,5,2,40-40,104053,103602,...,0,0,0,0,0,1,0,3.0,3.0,1


# **Model 1**

In [None]:
class LSTMClassifier(nn.Module):
    """Unidirectional LSTM followed by a linear layer over the final hidden state.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: dropout between stacked LSTM layers. Ignored when
            `num_layers` is 1, because nn.LSTM has no layer to apply it to
            and would only emit a warning.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=1, num_classes=2, dropout=0.2):
        super().__init__()
        # nn.LSTM applies dropout on the outputs of every layer except the last
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout, bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        """Return the class logits for a batch of padded sequences.

        Args:
            x: padded inputs, batch first: (batch, time, input_size).
            lengths: un-padded length of every sequence (tensor or list of
                ints), in the same order as `x`. Any device, any order.

        Returns:
            Logits of shape (batch, num_classes), in the same order as `x`.
        """
        # pack_padded_sequence only accepts lengths that live on the CPU, even
        # when x is on the GPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()

        # Packed sequence so the LSTM skips the padding. enforce_sorted=False
        # lets PyTorch sort internally and hand h_n back in the caller's order,
        # so pre-sorted batches still work and unsorted ones no longer raise.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed_input)

        # Last layer's final hidden state; h_n is (num_layers, batch, hidden_size)
        return self.fc(h_n[-1])

In [None]:
def train_lstm(model, train_loader, test_loader, num_epochs=10, lr=1e-3, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Train `model` with Adam and cross-entropy, evaluating after every epoch.

    Args:
        model: module called as `model(X, lengths)` and returning class logits.
        train_loader: iterable of (X, y, lengths) training batches.
        test_loader: iterable of (X, y, lengths) batches passed to
            `evaluate_lstm` at the end of each epoch.
        num_epochs: number of passes over `train_loader`.
        lr: Adam learning rate.
        device: device for the model, inputs and labels.

    Prints the loss summed over the batches of each epoch; the accuracy line
    is printed by `evaluate_lstm`.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()   # evaluate_lstm leaves the model in eval mode
        total_loss = 0.0
        for X, y, lengths in train_loader:
            X, y = X.to(device), y.to(device)
            # Sequence lengths must stay on the CPU: pack_padded_sequence
            # rejects length tensors that live on the GPU.
            lengths = lengths.cpu()

            optimizer.zero_grad()
            outputs = model(X, lengths)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

        evaluate_lstm(model, test_loader, device=device)

In [None]:
def evaluate_lstm(model, data_loader, device='cpu'):
    """Compute and print the accuracy of `model` over `data_loader`.

    Args:
        model: module called as `model(X, lengths)` and returning class logits.
        data_loader: iterable of (X, y, lengths) batches.
        device: device the inputs are moved to (the model is expected to be
            there already).

    Returns:
        The accuracy computed by `accuracy_score`. The model is left in eval mode.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for X, y, lengths in data_loader:
            X = X.to(device)
            # Sequence lengths must stay on the CPU: pack_padded_sequence
            # rejects length tensors that live on the GPU.
            outputs = model(X, lengths.cpu())
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            # .cpu() is a no-op for CPU labels and makes GPU labels convertible
            y_true.extend(y.cpu().numpy())
            y_pred.extend(preds)

    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}")
    return acc

NameError: name 'prepare_datasets' is not defined

In [None]:

# Build the model: one input feature per entry of feature_cols
input_size = len(feature_cols)
model = LSTMClassifier(input_size=input_size)

# Train; evaluate_lstm is run on test_loader after every epoch
train_lstm(model, train_loader, test_loader, num_epochs=10)

# **Model 2**