<a href="https://colab.research.google.com/github/oraziotorre/MomentumShiftAI/blob/main/ModelsDevelopment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import TensorDataset, DataLoader

In [127]:
# Load the dataset produced by the data-preprocessing step.
dataset = pd.read_csv("tennis_2010-now.csv")

In [128]:
# Split the textual point score 'Pts' ("<p1>-<p2>") into one raw token per player.
# n=1 guarantees exactly two columns even if a value contains more than one dash;
# the tokens are kept in local Series so no temporary column is added to `dataset`.
pts_split = dataset['Pts'].astype(str).str.split('-', n=1, expand=True)
pt1_raw = pts_split[0].str.strip()
pt2_raw = pts_split[1].str.strip()

# Game counters as numbers (unparseable values become NaN).
dataset['Gm1'] = pd.to_numeric(dataset['Gm1'], errors='coerce')
dataset['Gm2'] = pd.to_numeric(dataset['Gm2'], errors='coerce')

# A point is flagged as tiebreak ONLY when the game score is 6-6.
# NOTE(review): a set played without a tiebreak would also be flagged at 6-6 — confirm
# whether the dataset contains such sets.
is_tiebreak = (dataset['Gm1'] == 6) & (dataset['Gm2'] == 6)

normal_score_map = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}
tiebreak_score_map = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, 'AD': 7}

pt1_normal = pt1_raw.map(normal_score_map)
pt2_normal = pt2_raw.map(normal_score_map)

# Tiebreak points are plain integers with no upper bound (7-6, 8-7, 10-8, ...), so they
# are parsed numerically; the lookup table is only a fallback for non-numeric tokens
# such as 'AD'. A table-only lookup would turn every score above 6 into NaN.
pt1_tb = pd.to_numeric(pt1_raw, errors='coerce').fillna(pt1_raw.map(tiebreak_score_map))
pt2_tb = pd.to_numeric(pt2_raw, errors='coerce').fillna(pt2_raw.map(tiebreak_score_map))

dataset['Pt1'] = np.where(is_tiebreak, pt1_tb, pt1_normal)
dataset['Pt2'] = np.where(is_tiebreak, pt2_tb, pt2_normal)

# Deuce in a regular game: 40-40, 40-AD or AD-40.
normal_deuce = (
    (~is_tiebreak) &
    (
        ((dataset['Pt1'] == 3) & (dataset['Pt2'] == 3)) |   # 40-40
        ((dataset['Pt1'] == 3) & (dataset['Pt2'] == 4)) |   # 40-AD
        ((dataset['Pt1'] == 4) & (dataset['Pt2'] == 3))     # AD-40
    )
)

# Deuce in a tiebreak: both players on at least 6 points and at most one point apart.
tiebreak_deuce = (
    is_tiebreak &
    (dataset['Pt1'] >= 6) &
    (dataset['Pt2'] >= 6) &
    ((dataset['Pt1'] - dataset['Pt2']).abs() <= 1)
)

# Combine both cases into a single 0/1 flag.
dataset['IsDeuce'] = (normal_deuce | tiebreak_deuce).astype(int)

# Expose the tiebreak mask as a 0/1 column.
dataset['IsTieBreak'] = is_tiebreak.astype(int)

In [129]:
# Print the raw score next to the engineered columns for rows 1470-1504.
preview_cols = ['Pts', 'Set1', 'Set2', 'Pt1', 'Pt2', 'IsDeuce', 'IsTieBreak']
print(dataset[preview_cols].iloc[1470:1505])

        Pts  Set1  Set2  Pt1  Pt2  IsDeuce  IsTieBreak
1470    0-0     0     1  0.0  0.0        0           0
1471   0-15     0     1  0.0  1.0        0           0
1472  15-15     0     1  1.0  1.0        0           0
1473  30-15     0     1  2.0  1.0        0           0
1474  30-30     0     1  2.0  2.0        0           0
1475  40-30     0     1  3.0  2.0        0           0
1476  40-40     0     1  3.0  3.0        1           0
1477  AD-40     0     1  4.0  3.0        1           0
1478    0-0     0     1  0.0  0.0        0           0
1479   15-0     0     1  1.0  0.0        0           0
1480  15-15     0     1  1.0  1.0        0           0
1481  15-30     0     1  1.0  2.0        0           0
1482  30-30     0     1  2.0  2.0        0           0
1483  30-40     0     1  2.0  3.0        0           0
1484    0-0     0     1  0.0  0.0        0           1
1485    0-1     0     1  0.0  1.0        0           1
1486    1-1     0     1  1.0  1.0        0           1
1487    1-

In [130]:
# Remove the 'PointType' column and the raw 'Pts' score string from the dataset.
unused_cols = ['PointType', 'Pts']
dataset = dataset.drop(columns=unused_cols)

# **Train-Test split**

In [131]:
# Generate mirrored matches (players swapped) to enlarge the dataset.

def augment_with_symmetric(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` followed by a copy of itself with the two players swapped.

    In the mirrored copy every per-player column pair is exchanged, every
    column holding a player index has 1 and 2 flipped, and the identifiers
    ``match_id`` / ``SetID`` (when present) receive the suffix ``'_simm'``.
    ``df`` itself is not modified.

    Args:
        df: Point-level frame containing all columns listed below.

    Returns:
        A new frame with the original rows first and the mirrored rows after
        them, re-indexed from 0.

    Raises:
        KeyError: if a paired column or a player-index column is missing.
    """
    # NOTE(review): the serve_miss / rally_forced / rally_unforced names are not
    # symmetric like the other pairs — confirm they match the real column names.
    mirrored_pairs = (
        ('Set1', 'Set2'),
        ('Gm1', 'Gm2'),
        ('Pt1', 'Pt2'),
        ('Player1', 'Player2'),
        ('Ranking1', 'Ranking2'),
        ('p1_win_nobreak_point', 'p2_win_nobreak_point'),
        ('p1_win_break_point', 'p2_win_break_point'),
        ('p1_lost_nobreak_point', 'p2_lost_nobreak_point'),
        ('p1_lost_break_point', 'p2_lost_break_point'),
        ('serve_ace_1', 'serve_ace_2'),
        ('serve_miss2_1', 'serve_miss1_2'),
        ('rally_winner_1', 'rally_winner_2'),
        ('rally_forced2_1', 'rally_forced1_2'),
        ('rally_unforced1_2', 'rally_unforced2_1'),
    )
    player_index_cols = ('Svr', 'PtWinner', 'SetWinner', 'MatchWinner')
    flip_player = {1: 2, 2: 1}

    mirrored = df.copy()

    # Exchange the two columns of each pair, always reading from the untouched input.
    for left, right in mirrored_pairs:
        mirrored[[left, right]] = df[[right, left]].to_numpy()

    # Player 1 becomes player 2 and vice versa.
    for name in player_index_cols:
        mirrored[name] = df[name].replace(flip_player)

    # Tag the identifiers so mirrored rows never share an id with the originals.
    present_ids = [c for c in ('match_id', 'SetID') if c in mirrored.columns]
    for id_col in present_ids:
        mirrored[id_col] = mirrored[id_col].astype(str) + '_simm'

    return pd.concat([df, mirrored], ignore_index=True)

In [97]:
# 1️⃣ Scaling and normalisation

def _encode_binary_targets(frame, target_cols):
    """Keep rows whose ``target_cols`` are all 1 or 2 and shift them to 0/1."""
    valid = frame[target_cols].isin([1, 2]).all(axis=1)
    # Explicit copy: the filtered frame is modified right after, which would
    # otherwise be a chained assignment on a slice.
    frame = frame.loc[valid].copy()
    frame[target_cols] = frame[target_cols].astype(int) - 1
    return frame


def preprocess_data(train_data, test_data):
    """Scale the score features and encode the player-index columns as 0/1.

    Both scalers are fitted on ``train_data`` only and then applied to
    ``test_data``. The inputs are not modified.

    Steps:
        1. Min-max scale ``Set1, Set2, Gm1, Gm2, Pt1, Pt2``.
        2. Standardise ``Ranking1, Ranking2`` (scaled in the returned frames
           but not part of the returned feature list).
        3. Drop rows where any of ``Svr, PtWinner, SetWinner, MatchWinner`` is
           not 1 or 2, then map 1/2 to 0/1.
        4. Cast the feature columns to float.

    Args:
        train_data: Training frame with the columns listed above plus
            ``IsDeuce`` and ``IsTieBreak``.
        test_data: Test frame with the same columns.

    Returns:
        ``(train_data, test_data, feature_cols)``: the processed copies and
        the list of column names to feed to the model.
    """
    # Work on copies so the caller's frames stay untouched.
    train_data = train_data.copy()
    test_data = test_data.copy()

    # === 1. Column groups ===
    minmax_cols = ['Set1', 'Set2', 'Gm1', 'Gm2', 'Pt1', 'Pt2']
    standard_cols = ['Ranking1', 'Ranking2']
    others = ['Svr', 'PtWinner', 'IsDeuce', 'IsTieBreak']

    # === 2. Min-max normalisation (fit on train only) ===
    minmax_scaler = MinMaxScaler()
    train_data[minmax_cols] = minmax_scaler.fit_transform(train_data[minmax_cols])
    test_data[minmax_cols] = minmax_scaler.transform(test_data[minmax_cols])

    # === 3. Standardisation (fit on train only) ===
    standard_scaler = StandardScaler()
    train_data[standard_cols] = standard_scaler.fit_transform(train_data[standard_cols])
    test_data[standard_cols] = standard_scaler.transform(test_data[standard_cols])

    feature_cols = minmax_cols + others

    # === 4. Player-index columns: [1, 2] -> [0, 1] ===
    target_cols = ['Svr', 'PtWinner', 'SetWinner', 'MatchWinner']
    train_data = _encode_binary_targets(train_data, target_cols)
    test_data = _encode_binary_targets(test_data, target_cols)

    # === 5. Final conversion of the features to float ===
    train_data[feature_cols] = train_data[feature_cols].astype(float)
    test_data[feature_cols] = test_data[feature_cols].astype(float)

    return train_data, test_data, feature_cols

In [200]:
# 2️⃣ Sequence creation

def create_sequences(data, feature_cols, target_col='SetWinner',
                     group_col='SetID', order_col='PtSet'):
    """Turn a point-level frame into padded per-group sequences.

    Rows are sorted by ``(group_col, order_col)`` and every group becomes one
    sequence of feature vectors. Shorter sequences are zero-padded at the end.

    Args:
        data: Frame containing ``feature_cols``, ``target_col``, ``group_col``
            and ``order_col``.
        feature_cols: Names of the columns used as per-step features.
        target_col: Column holding the label; the value of the first row of
            each group is used (it is assumed constant within a group).
        group_col: Column identifying a sequence (default: one per set).
        order_col: Column giving the order of the points inside a group.

    Returns:
        ``(X_padded, y_tensor, lengths)`` where ``X_padded`` is a float32
        tensor of shape ``(n_groups, max_len, len(feature_cols))``,
        ``y_tensor`` a long tensor of labels and ``lengths`` a long tensor
        with the unpadded length of each sequence. For empty ``data`` the
        tensors are empty (``X_padded`` has shape ``(0, 0, n_features)``).
    """
    X_sequences = []
    y_labels = []
    lengths = []

    # Order the points inside each group.
    data = data.sort_values([group_col, order_col]).reset_index(drop=True)

    for _, df_group in data.groupby(group_col):
        seq_data = df_group[feature_cols].to_numpy(dtype=np.float32)
        X_sequences.append(torch.tensor(seq_data, dtype=torch.float32))

        # Label of the first point of the group (assumed constant for all points).
        y_labels.append(int(df_group[target_col].iloc[0]))
        lengths.append(len(df_group))

    if not X_sequences:
        # pad_sequence rejects an empty list: return well-formed empty tensors instead.
        return (
            torch.zeros((0, 0, len(feature_cols)), dtype=torch.float32),
            torch.zeros((0,), dtype=torch.long),
            torch.zeros((0,), dtype=torch.long),
        )

    # Zero-pad every sequence to the length of the longest one.
    X_padded = pad_sequence(X_sequences, batch_first=True)
    y_tensor = torch.tensor(y_labels, dtype=torch.long)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return X_padded, y_tensor, lengths

In [202]:
# 3️⃣ Full pipeline + DataLoaders

def prepare_datasets(train_data, test_data, target_col='SetWinner', batch_size=32):
    """Preprocess both frames, build padded sequences and wrap them in DataLoaders.

    Args:
        train_data: Raw training frame, passed to ``preprocess_data``.
        test_data: Raw test frame, passed to ``preprocess_data``.
        target_col: Label column forwarded to ``create_sequences``.
        batch_size: Number of sequences per batch for both loaders.

    Returns:
        ``(train_loader, test_loader, feature_cols)``. The training loader
        shuffles, the test loader does not. Every batch is a tuple
        ``(inputs, targets, lengths)`` ordered by decreasing sequence length.
    """
    train_proc, test_proc, feature_cols = preprocess_data(train_data, test_data)

    def sort_batch_by_length(samples):
        # Stack the per-sample tensors, then reorder the batch longest-first.
        seqs, labels, seq_lens = (torch.stack(part) for part in zip(*samples))
        order = torch.argsort(seq_lens, descending=True)
        return seqs[order], labels[order], seq_lens[order]

    def build_loader(frame, shuffle):
        # (X, y, lengths) tensors -> TensorDataset -> DataLoader
        tensors = create_sequences(frame, feature_cols, target_col)
        return DataLoader(
            TensorDataset(*tensors),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=sort_batch_by_length,
        )

    train_loader = build_loader(train_proc, shuffle=True)
    test_loader = build_loader(test_proc, shuffle=False)

    return train_loader, test_loader, feature_cols

In [203]:
# Collect the unique match identifiers.
match_ids = dataset['match_id'].unique()

# Split the identifiers (not the rows) 80% / 20% with a fixed seed.
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)

# Assign every row to the side its match belongs to.
in_train = dataset['match_id'].isin(train_ids)
in_test = dataset['match_id'].isin(test_ids)
train_data = dataset.loc[in_train].reset_index(drop=True)
test_data = dataset.loc[in_test].reset_index(drop=True)

# Symmetric augmentation is currently disabled:
# train_data = augment_with_symmetric(train_data)
# test_data = augment_with_symmetric(test_data)

train_loader, test_loader, feature_cols = prepare_datasets(
    train_data,
    test_data,
    target_col='SetWinner',
    batch_size=32,
)
feature_cols

['Set1', 'Set2', 'Gm1', 'Gm2', 'Pt1', 'Pt2', 'Svr', 'PtWinner', 'IsDeuce']

In [None]:
# Pull the first batch out of the training DataLoader.
batch = next(iter(train_loader))

# Print, in order: the container type of the batch, how many elements it
# holds, and finally the batch itself.
for item in (type(batch), len(batch), batch):
    print(item)

# **Model 1**

In [196]:
class RealTimeLSTM(nn.Module):
    """LSTM encoder with attention pooling and a 2-class linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers. It is only applied when
            ``num_layers > 1``, since ``nn.LSTM`` has no dropout after the
            last layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            batch_first=True,
            # Avoids the nn.LSTM warning about dropout with a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Kept as a Sequential so existing checkpoints (attention.0.*) still load.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x, lengths):
        """Classify a batch of padded sequences.

        Args:
            x: Padded input of shape ``(batch, time, input_size)``.
            lengths: Unpadded length of each sequence, shape ``(batch,)``;
                every length must be at least 1.

        Returns:
            Logits of shape ``(batch, 2)``.
        """
        lengths = lengths.cpu()  # pack_padded_sequence needs CPU lengths

        # Pack so the LSTM skips the padded steps.
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)  # (batch, max(lengths), hidden)

        # Attention scores, with padded steps masked out BEFORE the softmax.
        # Otherwise padding gets a share of the weights and the result for a
        # sequence depends on how much padding its batch happens to contain.
        scores = self.attention[0](out)                                   # (batch, T, 1)
        steps = torch.arange(out.size(1), device=out.device)
        valid = steps.unsqueeze(0) < lengths.to(out.device).unsqueeze(1)  # (batch, T)
        scores = scores.masked_fill(~valid.unsqueeze(-1), float('-inf'))
        weights = self.attention[1](scores)                               # softmax over time
        context = torch.sum(weights * out, dim=1)

        # Classification
        return self.fc(context)

# 2️⃣ Training and validation function
def train_model(model, train_loader, test_loader, learning_rate=0.001, epochs=20, patience=3,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with Adam, LR-on-plateau scheduling and early stopping.

    After every epoch the model is evaluated on ``test_loader``. The weights
    with the best accuracy so far are kept in memory and written to
    ``checkpoint_path``. They are restored into ``model`` before returning.

    Args:
        model: Module called as ``model(inputs, lengths)`` that returns class logits.
        train_loader: Loader yielding ``(inputs, labels, lengths)`` batches.
        test_loader: Loader with the same batch layout, used for validation.
            NOTE(review): the same data drives early stopping and the reported
            accuracy — consider a separate validation split.
        learning_rate: Initial Adam learning rate.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without improvement before stopping.
        checkpoint_path: File the best ``state_dict`` is saved to.

    Returns:
        ``model`` itself, loaded with the best weights found (unchanged if
        no epoch was run).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # `verbose` is deprecated in recent PyTorch releases: LR changes are reported manually below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=2
    )

    best_accuracy = 0
    best_state = None  # in-memory copy of the best weights
    counter = 0

    # Guard the averages against empty datasets.
    n_train = max(len(train_loader.dataset), 1)
    n_val = max(len(test_loader.dataset), 1)

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for inputs, labels, lengths in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch value is a per-sample mean.
            train_loss += loss.item() * inputs.size(0)

        # Validation
        model.eval()
        correct = 0
        total = 0
        val_loss = 0
        with torch.no_grad():
            for inputs, labels, lengths in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                predicted = outputs.argmax(dim=1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        # Calculate metrics
        train_loss = train_loss / n_train
        val_loss = val_loss / n_val
        accuracy = 100 * correct / total if total else 0.0

        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Accuracy: {accuracy:.2f}%')

        # Early stopping. The first epoch always counts as an improvement so
        # that a checkpoint of THIS run exists.
        if best_state is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            counter = 0
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            torch.save(best_state, checkpoint_path)
            print('Model saved!')
        else:
            counter += 1
            if counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(accuracy)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f'Learning rate reduced to {lr_after:.2e}')

    # Restore the best weights of this run. Nothing is loaded when no epoch
    # ran, rather than picking up a stale file left by an earlier run.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model




# Reasonable hyper-parameters
input_size = len(feature_cols)  # one input per feature column
hidden_size = 64
num_layers = 1

# Build the model explicitly so the cell does not rely on a `model` variable
# left over in the kernel from an earlier (or deleted) cell.
model = RealTimeLSTM(input_size, hidden_size, num_layers)

# Train with early stopping
model = train_model(model, train_loader, test_loader,
                    learning_rate=1e-3,
                    patience=5,
                    epochs=50)

KeyboardInterrupt: 

# **Model 2**