In [18]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import torch

# Загружаем датасет
atp_matches_2024 = pd.read_csv('atp_matches_2024.csv')
atp_matches_2023 = pd.read_csv('atp_matches_2023.csv')
atp_matches_2022 = pd.read_csv('atp_matches_2022.csv')

atp_matches_qual_chall_2023 = pd.read_csv('atp_matches_qual_chall_2023.csv')
atp_matches_qual_chall_2024 = pd.read_csv('atp_matches_qual_chall_2024.csv')
atp_matches_qual_chall_2022 = pd.read_csv('atp_matches_qual_chall_2022.csv')

df = pd.concat([atp_matches_2024,atp_matches_2023, atp_matches_qual_chall_2024, atp_matches_qual_chall_2023, atp_matches_2022, atp_matches_qual_chall_2022], ignore_index=True)

df['tourney_name'] = df['tourney_name'].apply(lambda x: 'Davis Cup' if x.startswith('Davis Cup') else x)
df = df[df['tourney_name']!='Davis Cup']
df = df[df['tourney_name']!='United Cup']
df = df[df['surface'].isin(['Hard','Clay','Grass'])]
df = df[df['best_of'] == 3]
df = df[df['winner_rank'].notnull() & df['loser_rank'].notnull() & df['winner_hand'].notnull() & df['loser_hand'].notnull()]

# --- Подготовка данных ---
# сортировка по дате и номеру матча
df = df.sort_values(by=["tourney_date", "match_num"]).reset_index(drop=True)

# Унифицируем формат даты (в int может быть YYYYMMDD)
df["tourney_date"] = pd.to_datetime(df["tourney_date"], format="%Y%m%d")

# Убираем матчи без подач
df = df[(df["w_svpt"] > 0) & (df["l_svpt"] > 0) & (df["w_1stIn"] > 0) & (df["l_1stIn"] > 0)]



def extract_relative_features(row, player="winner"):
    """
    row: строка датафрейма (один матч)
    player: 'winner' или 'loser' (для кого считаем историю)
    Возвращает разницу статистики игрока и его соперника
    """
    if player == "winner":
        p, o = "w_", "l_"
        pr, or_ = "winner", "loser"
    else:
        p, o = "l_", "w_"
        pr, or_ = "loser", "winner"
    
    feats = {}
    # Эйсы и двойные
    feats["aces_diff"] = row[f"{p}ace"] - row[f"{o}ace"]
    feats["df_diff"] = row[f"{p}df"] - row[f"{o}df"]

    # % первой подачи
    feats["first_in_diff"] = (row[f"{p}1stIn"]/row[f"{p}svpt"]) - \
                             (row[f"{o}1stIn"]/row[f"{o}svpt"])

    # % очков на первой подаче
    feats["first_won_diff"] = (row[f"{p}1stWon"]/row[f"{p}1stIn"]) - \
                              (row[f"{o}1stWon"]/row[f"{o}1stIn"])

    def safe_div(num, denom):
        return num / denom if denom != 0 else 0.0

    # процент выигранных вторых подач
    feats["second_won_diff"] = safe_div(row[f"{p}2ndWon"], row[f"{p}svpt"] - row[f"{p}1stIn"]) - \
                            safe_div(row[f"{o}2ndWon"], row[f"{o}svpt"] - row[f"{o}1stIn"])


    # % отбитых брейков
    # процент сохранённых брейкпоинтов (bpSaved / bpFaced)
    def safe_bp_ratio(saved, faced):
        return saved / faced if faced > 0 else 0.0

    feats["bp_saved_diff"] = safe_bp_ratio(row[f"{p}bpSaved"], row[f"{p}bpFaced"]) - \
                            safe_bp_ratio(row[f"{o}bpSaved"], row[f"{o}bpFaced"])

    # сколько было брейкпоинтов
    feats["bp_diff"] = row[f"{p}bpFaced"] - row[f"{o}bpFaced"]

    # 🎯 разница по общему количеству очков (Total points won)
    player_points = row[f"{p}1stWon"] + row[f"{p}2ndWon"]
    opp_points = row[f"{o}1stWon"] + row[f"{o}2ndWon"]
    feats["points_diff"] = player_points - opp_points

    return feats


# Построим историю матчей для каждого игрока
player_history = {}

for idx, row in df.iterrows():
    for pl in ["winner", "loser"]:
        pid = row[f"{pl}_id"]
        feats = extract_relative_features(row, pl)
        feats["date"] = row["tourney_date"]
        feats["result"] = 1 if pl == "winner" else 0  # результат матча для этого игрока
        
        if pid not in player_history:
            player_history[pid] = []
        player_history[pid].append(feats)


# Dataset для Siamese LSTM
class TennisSiameseDataset(Dataset):
    def __init__(self, matches_df, player_history, n_matches=5):
        self.matches_df = matches_df
        self.player_history = player_history
        self.n_matches = n_matches
        self.samples = []
        
        for idx, row in matches_df.iterrows():
            p1, p2 = row["winner_id"], row["loser_id"]
            label = 1  # победа игрока 1 (winner)
            
            # История
            hist1 = [h for h in player_history[p1] if h["date"] < row["tourney_date"]]
            hist2 = [h for h in player_history[p2] if h["date"] < row["tourney_date"]]
            
            if len(hist1) >= self.n_matches and len(hist2) >= self.n_matches:
                # победитель (label=1)
                self.samples.append((p1, p2, row["tourney_date"], 1, row))
                # проигравший (label=0) — зеркально
                self.samples.append((p2, p1, row["tourney_date"], 0, row))
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        p1, p2, date, label, row = self.samples[idx]
        hist1 = [h for h in self.player_history[p1] if h["date"] < date][-self.n_matches:]
        hist2 = [h for h in self.player_history[p2] if h["date"] < date][-self.n_matches:]
            
        def to_vec_list(hist):
            feat_names = [k for k in hist[0].keys() if k not in ["date","result"]]
            arr = []
            for h in hist:
                arr.append([h[f] for f in feat_names])
            # паддинг, если меньше n_matches
            while len(arr) < self.n_matches:
                arr.insert(0, [0]*len(feat_names))
            return np.array(arr, dtype=np.float32)
            
        X1 = to_vec_list(hist1)
        X2 = to_vec_list(hist2)

        # 🔑 сразу преобразуем в тензоры
        X1 = torch.tensor(X1, dtype=torch.float32)
        X2 = torch.tensor(X2, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.float32)  # если BCEWithLogitsLoss
        # label = torch.tensor(label, dtype=torch.long)   # если CrossEntropyLoss
            
        # 🎾 матчевые признаки
        match_feats = torch.tensor([
            row["winner_rank"] if p1 == row["winner_id"] else row["loser_rank"],
            row["loser_rank"] if p2 == row["loser_id"] else row["winner_rank"],
            int(row["winner_hand"] == row["loser_hand"])
        ], dtype=torch.float32) 
            
        return X1, X2, label, match_feats


# Создаем датасет
dataset = TennisSiameseDataset(df, player_history, n_matches=10)
len(dataset)


42242

In [21]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split

# === 1. Разделение на train/test ===
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

all_train_features = []
all_match_features = []

for i in range(len(train_dataset)):
    X1, X2, _, match_feats = train_dataset[i]
    all_train_features.append(X1)
    all_train_features.append(X2)
    all_match_features.append(match_feats)

# Конкатим временные признаки
all_train_features = torch.cat(all_train_features, dim=0)  # [N*seq_len, n_feats]
all_match_features = torch.stack(all_match_features, dim=0)  # [N, n_match_feats]

# Считаем mean/std
mean_seq = all_train_features.mean(dim=0, keepdim=True)
std_seq = all_train_features.std(dim=0, keepdim=True)

mean_match = all_match_features.mean(dim=0, keepdim=True)
std_match = all_match_features.std(dim=0, keepdim=True)

# Нормализация
def normalize_dataset(ds, mean_seq, std_seq, mean_match, std_match):
    normed = []
    for i in range(len(ds)):
        X1, X2, y, match_feats = ds[i]
        X1 = (X1 - mean_seq) / (std_seq + 1e-10)
        X2 = (X2 - mean_seq) / (std_seq + 1e-10)
        match_feats = (match_feats - mean_match) / (std_match + 1e-10)
        normed.append((X1, X2, y, match_feats))
    return normed

train_data = normalize_dataset(train_dataset, mean_seq, std_seq, mean_match, std_match)
test_data  = normalize_dataset(test_dataset, mean_seq, std_seq, mean_match, std_match)


train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [45]:
# === 2. Модель Siamese LSTM ===
class SiameseLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1, match_dim=0, use_match_feats=True):
        super().__init__()
        self.use_match_feats = use_match_feats

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        #self.lstm = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)

        if self.use_match_feats:
            self.fc1 = nn.Linear(hidden_dim*2 + match_dim, 1)
        else:
            self.fc1 = nn.Linear(hidden_dim*2, 1)

        #self.fc2 = nn.Linear(64, 1)

    def forward_once(self, x):
        _, (h, _) = self.lstm(x)
        #_, h = self.lstm(x)
        return h[-1]

    def forward(self, x1, x2, match_feats=None):
        h1 = self.forward_once(x1)
        h2 = self.forward_once(x2)

        if self.use_match_feats:
            if match_feats.dim() == 3:
                match_feats = match_feats.squeeze(1)
            combined = torch.cat([h1, h2, match_feats], dim=1)
            #x = torch.relu(self.fc1(combined))
            return self.fc1(combined).squeeze(-1)
        else:
            combined = torch.cat([h1, h2], dim=1)
            return self.fc1(combined).squeeze(-1)



# === 3. Инициализация модели ===
sample_X1, sample_X2, _, sample_match_feats = dataset[0]
input_dim = sample_X1.shape[1]
match_dim = sample_match_feats.shape[0]

model = SiameseLSTM(
    input_dim=input_dim,
    hidden_dim=32,
    match_dim=match_dim,
    use_match_feats=True  # match_dim можно оставить 0
)
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5, verbose=True
)

In [41]:
import torch
import torch.nn as nn

class SiameseTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, num_heads=2, num_layers=1, match_dim=3, max_len=30):
        super().__init__()
        self.hidden_dim = hidden_dim

        # линейная проекция признаков в пространство трансформера
        self.input_proj = nn.Linear(input_dim, hidden_dim)

        # CLS токен
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))

        # позиционный энкодинг (обучаемый)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len+1, hidden_dim))

        # энкодер
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim*2,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # финальные слои
        self.norm = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim*2 + match_dim, 1)

    def encode(self, x):
        # x: [batch, seq_len, input_dim]
        b, seq_len, _ = x.size()

        # проекция в скрытое пространство
        x = self.input_proj(x)

        # CLS токен
        cls_tokens = self.cls_token.expand(b, -1, -1)  # [b, 1, hidden_dim]
        x = torch.cat([cls_tokens, x], dim=1)

        # позиционный энкодинг
        x = x + self.pos_embedding[:, :x.size(1), :]

        # трансформер
        out = self.transformer(x)

        # берём CLS
        return self.norm(out[:, 0, :])  # [b, hidden_dim]

    def forward(self, x1, x2, match_feats):
        z1 = self.encode(x1)
        z2 = self.encode(x2)

        if match_feats.dim() == 3:
            match_feats = match_feats.squeeze(1)

        z = torch.cat([z1, z2, match_feats], dim=1)
        return self.fc(z).squeeze(-1)
        
# === 3. Инициализация модели ===
sample_X1, sample_X2, _, sample_match_feats = dataset[0]
input_dim = sample_X1.shape[1]
match_dim = sample_match_feats.shape[0]
model = SiameseTransformer(
    input_dim=input_dim
)
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5, verbose=True
)


In [52]:
import math
import torch
import torch.nn as nn

def sinusoidal_positional_encoding(max_len, hidden_dim):
    pe = torch.zeros(max_len, hidden_dim)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * -(math.log(10000.0) / hidden_dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)  # [1, max_len, hidden_dim]


class SiameseTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, num_heads=2, num_layers=1, match_dim=3, max_len=30):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.max_len = max_len

        # линейная проекция признаков
        self.input_proj = nn.Linear(input_dim, hidden_dim)

        # фиксированные sin/cos позиционные эмбеддинги
        pe = sinusoidal_positional_encoding(max_len, hidden_dim)
        self.register_buffer("pos_embedding", pe, persistent=False)

        # энкодер
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 2,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # финальные слои
        self.norm = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2 + match_dim, 1)

    def encode(self, x):
        # x: [batch, seq_len, input_dim]
        b, seq_len, _ = x.size()

        # проекция
        x = self.input_proj(x)

        # позиционный энкодинг
        pos_emb = self.pos_embedding[:, :seq_len, :]
        x = x + pos_emb

        # трансформер
        out = self.transformer(x)

        # mean pooling по токенам
        pooled, _ = out.max(dim=1)

        return self.norm(pooled)  # [b, hidden_dim]

    def forward(self, x1, x2, match_feats):
        z1 = self.encode(x1)
        z2 = self.encode(x2)

        if match_feats.dim() == 3:
            match_feats = match_feats.squeeze(1)

        z = torch.cat([z1, z2, match_feats], dim=1)
        return self.fc(z).squeeze(-1)

model = SiameseTransformer(
    input_dim=input_dim
)
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5, verbose=True
)


In [None]:
# === 4. Обучение ===
for epoch in range(100):
    model.train()
    total_loss = 0
    for X1, X2, y, match_feats in train_loader:
        X1, X2, y, match_feats = X1, X2, y.float(), match_feats
        optimizer.zero_grad()
        outputs = model(X1, X2, match_feats)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

    # Validation loss
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X1, X2, y, match_feats in test_loader:
            outputs = model(X1, X2, match_feats)
            loss = criterion(outputs, y.float())
            val_loss += loss.item()
    avg_val_loss = val_loss / len(test_loader)

    #scheduler.step(avg_val_loss)  # 🔑 обновляем lr

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {avg_val_loss:.4f} | "
        f"LR: {optimizer.param_groups[0]['lr']:.5f}"
    )

Epoch 1, Loss: 0.6737
Epoch 01 | Train Loss: 0.6737 | Val Loss: 0.6655 | LR: 0.00100
Epoch 2, Loss: 0.6614
Epoch 02 | Train Loss: 0.6614 | Val Loss: 0.6587 | LR: 0.00100
Epoch 3, Loss: 0.6603
Epoch 03 | Train Loss: 0.6603 | Val Loss: 0.6559 | LR: 0.00100
Epoch 4, Loss: 0.6589
Epoch 04 | Train Loss: 0.6589 | Val Loss: 0.6549 | LR: 0.00100
Epoch 5, Loss: 0.6576
Epoch 05 | Train Loss: 0.6576 | Val Loss: 0.6560 | LR: 0.00100
Epoch 6, Loss: 0.6560
Epoch 06 | Train Loss: 0.6560 | Val Loss: 0.6547 | LR: 0.00100
Epoch 7, Loss: 0.6556
Epoch 07 | Train Loss: 0.6556 | Val Loss: 0.6559 | LR: 0.00100
Epoch 8, Loss: 0.6539
Epoch 08 | Train Loss: 0.6539 | Val Loss: 0.6553 | LR: 0.00100
Epoch 9, Loss: 0.6523
Epoch 09 | Train Loss: 0.6523 | Val Loss: 0.6549 | LR: 0.00100
Epoch 10, Loss: 0.6511
Epoch 10 | Train Loss: 0.6511 | Val Loss: 0.6520 | LR: 0.00100
Epoch 11, Loss: 0.6486
Epoch 11 | Train Loss: 0.6486 | Val Loss: 0.6543 | LR: 0.00100
Epoch 12, Loss: 0.6454
Epoch 12 | Train Loss: 0.6454 | Val Loss

In [None]:
#LSTM
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X1, X2, y, match_feats in test_loader:
        outputs = model(X1, X2, match_feats)
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).long()
        correct += (preds == y).sum().item()
        total += y.size(0)

print(f"Test Accuracy: {correct/total:.2%}")

Test Accuracy: 74.39%


In [54]:
#Transformers
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X1, X2, y, match_feats in test_loader:
        outputs = model(X1, X2, match_feats)
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).long()
        correct += (preds == y).sum().item()
        total += y.size(0)

print(f"Test Accuracy: {correct/total:.2%}")

Test Accuracy: 64.61%
