In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# === 1. Load all input files ===
winners = pd.read_csv('winners.csv')
teams = pd.read_csv('teams_updated.csv')
drivers = pd.read_csv('drivers_updated.csv')
laps = pd.read_csv('fastest_laps_updated.csv')

# Print the first rows of every table to inspect columns and dtypes.
for _frame in (winners, teams, drivers, laps):
    print(_frame.head())

# === 2. Data integration and feature engineering ===
# Left-join driver, team and fastest-lap columns onto the winners table.
# This assumes the key columns carry the same name in both tables.
df = (
    winners
    .merge(drivers, on='driverId', how='left')
    .merge(teams, on='constructorId', how='left')
    .merge(laps, on=['raceId', 'driverId'], how='left')
)

# Examples of usable features:
# ['year', 'round', 'circuitId', 'constructorId', 'driverId', 'grid', 'fastestLapTime', 'team_name', 'driver_name']
# target: driverId (the driver name would work as well)

# Label encoding (string -> int) so the model never receives raw text.
le_driver = LabelEncoder()
le_team = LabelEncoder()
le_circuit = LabelEncoder()
# Intended for lap times stored as strings; not used by the code below.
le_fastestLapTime = LabelEncoder()

# Each encoder is fit on the string form of its source column and the
# resulting integer codes are stored in a new "*_enc" column.
for _encoder, _source_col, _encoded_col in (
    (le_driver, 'driverId', 'driver_enc'),
    (le_team, 'constructorId', 'team_enc'),
    (le_circuit, 'circuitId', 'circuit_enc'),
):
    df[_encoded_col] = _encoder.fit_transform(df[_source_col].astype(str))

# fastestLapTime: lap times stored as strings such as "m:ss.xxx" are converted
# to seconds (float); missing or unparseable values become 0.0.
def lap_time_to_seconds(x):
    """Convert a lap time to a number of seconds.

    Accepted inputs are ``"m:ss.xxx"`` and ``"h:mm:ss.xxx"`` strings, plain
    numeric strings (already in seconds) and numeric values.

    Args:
        x: The raw lap-time value (string, number, ``None`` or NaN).

    Returns:
        The lap time in seconds as a float. ``0.0`` is returned when the
        value is missing, has more than three ``:``-separated parts, is not
        finite, or cannot be parsed as numbers.
    """
    try:
        if pd.isnull(x):
            return 0.0
    except (TypeError, ValueError):
        # pd.isnull returns an array for list-like input, whose truth value
        # is ambiguous; such values are not lap times.
        return 0.0

    parts = x.strip().split(':') if isinstance(x, str) else [x]
    if len(parts) > 3:
        return 0.0

    try:
        seconds = 0.0
        # Fold the parts left to right so "h:mm:ss", "m:ss" and "ss" all work.
        for part in parts:
            seconds = seconds * 60 + float(part)
    except (TypeError, ValueError):
        return 0.0

    # Strings such as "nan" or "inf" parse as floats but are not usable times.
    return seconds if np.isfinite(seconds) else 0.0
# Derive a numeric seconds column from the raw fastest-lap values.
lap_seconds = df['fastestLapTime'].apply(lap_time_to_seconds)
df['fastestLapTime_sec'] = lap_seconds

# Feature selection (add or remove columns as needed).
feature_cols = ['team_enc', 'circuit_enc', 'grid', 'year', 'fastestLapTime_sec']
# Target: the integer-encoded driver; features: the selected columns as an array.
y = df['driver_enc'].values
X = df[feature_cols].values

# === 3. Build the time-series windows required by the LSTM ===
def create_sequences(X, y, time_steps=5):
    """Slice X into overlapping windows and pair each with the following label.

    Window ``i`` holds rows ``i`` .. ``i + time_steps - 1`` of ``X`` and its
    target is ``y[i + time_steps]``.

    Args:
        X: Row-indexable feature array.
        y: Label array aligned with the rows of ``X``.
        time_steps: Number of consecutive rows per window.

    Returns:
        A tuple ``(Xs, ys)`` of NumPy arrays holding ``len(X) - time_steps``
        windows and targets (both empty when there are not enough rows).
    """
    n_windows = len(X) - time_steps
    Xs = [X[i:i + time_steps] for i in range(n_windows)]
    ys = [y[i + time_steps] for i in range(n_windows)]
    return np.array(Xs), np.array(ys)


time_steps = 5

# Decide the chronological train/test split on window indices first, so the
# scaler below can be fit without looking at any test-only rows.
window_ids = np.arange(len(X) - time_steps)
train_ids, test_ids = train_test_split(
    window_ids, test_size=0.2, random_state=42, shuffle=False
)

# Standardization: fit only on rows that appear in training windows. The last
# training window starts at train_ids[-1] and covers time_steps rows. Fitting
# on every row would leak test-set statistics into the training features.
scaler = StandardScaler()
scaler.fit(X[:train_ids[-1] + time_steps])
X = scaler.transform(X)

# Build the windows from the scaled features and apply the split chosen above.
X_seq, y_seq = create_sequences(X, y, time_steps)
X_train, X_test = X_seq[train_ids], X_seq[test_ids]
y_train, y_test = y_seq[train_ids], y_seq[test_ids]

In [None]:
# === 4. Build the LSTM model ===
class LSTMModel(nn.Module):
    """LSTM classifier that scores classes from the last time step of a sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the last time step's LSTM
            output before the linear layer. Defaults to 0.2.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
        """
        out, _ = self.lstm(x)
        # Only the output at the final time step feeds the classifier head.
        out = self.dropout(out[:, -1, :])
        return self.fc(out)

# Model hyper-parameters; the feature and class counts are read from the data.
hidden_size = 64
num_layers = 2
input_size = X_train.shape[2]
num_classes = np.unique(y).size

# === 5. Convert the data to tensors and prepare for training ===
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Features become float32 tensors and labels int64 tensors, created on `device`.
X_train_torch, X_test_torch = (
    torch.tensor(features, dtype=torch.float32, device=device)
    for features in (X_train, X_test)
)
y_train_torch, y_test_torch = (
    torch.tensor(labels, dtype=torch.long, device=device)
    for labels in (y_train, y_test)
)

# === 6. Training loop ===
num_epochs = 30
for epoch in range(num_epochs):
    # One optimisation step over the whole training tensor (no mini-batches).
    model.train(True)
    optimizer.zero_grad()
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    loss.backward()
    optimizer.step()

    # Validation on the held-out tensors with gradient tracking disabled.
    model.train(False)
    with torch.no_grad():
        test_outputs = model(X_test_torch)
        # test_loss is computed here but is not part of the printed line.
        test_loss = criterion(test_outputs, y_test_torch)
        predicted = test_outputs.argmax(dim=1)
        acc = predicted.eq(y_test_torch).float().mean()
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Test Acc: {acc:.4f}")

# === 7. Predict and decode ===
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_torch)
    _, predicted = torch.max(test_outputs, 1)
    # Map the encoded class indices back to the original driverId labels.
    predicted_driver = le_driver.inverse_transform(predicted.cpu().numpy())
    true_driver = le_driver.inverse_transform(y_test)
    # The header below reads "first 10 predictions vs. ground truth".
    print("前10筆預測 vs. 真實：")
    # Iterating over slices stops early when the test set holds fewer than
    # 10 samples, instead of raising IndexError.
    for pred_id, true_id in zip(predicted_driver[:10], true_driver[:10]):
        print(f"Predicted driverId: {pred_id}, True driverId: {true_id}")
