In [11]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# === 1. 讀取資料 ===
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')

# === 2. 欄位整理與年份解析 ===
winners['year'] = pd.to_datetime(winners['Date']).dt.year

# === 3. 合併資料 ===
# 合併 driver 資訊 (Winner, Car, year)
df = winners.merge(
    drivers,
    left_on=['Winner', 'Car', 'year'],
    right_on=['Driver', 'Car', 'year'],
    how='left',
    suffixes=('', '_driver')
)

# 合併最快圈速 (Grand Prix, Winner/Driver, Car, year)
df = df.merge(
    laps,
    left_on=['Grand Prix', 'Winner', 'Car', 'year'],
    right_on=['Grand Prix', 'Driver', 'Car', 'year'],
    how='left',
    suffixes=('', '_lap')
)

# 合併車隊資訊 (Car, year)
df = df.merge(
    teams,
    left_on=['Car', 'year'],
    right_on=['Team', 'year'],
    how='left',
    suffixes=('', '_team')
)


In [12]:
# === 4. Label Encoding 與特徵工程 ===
le_driver = LabelEncoder()
le_car = LabelEncoder()
le_gp = LabelEncoder()
le_team = LabelEncoder()

df['winner_enc'] = le_driver.fit_transform(df['Winner'].astype(str))
df['car_enc'] = le_car.fit_transform(df['Car'].astype(str))
df['gp_enc'] = le_gp.fit_transform(df['Grand Prix'].astype(str))
df['team_enc'] = le_team.fit_transform(df['Team'].astype(str)) if 'Team' in df else 0

# 處理最快圈速時間（mm:ss.xxx 轉 float秒）
def lap_time_to_seconds(x):
    try:
        if pd.isnull(x):
            return 0.0
        if isinstance(x, float) or isinstance(x, int):
            return float(x)
        m, s = str(x).split(':')
        return float(m)*60 + float(s)
    except:
        return 0.0
df['fastestLapTime_sec'] = df['Time'].apply(lap_time_to_seconds) if 'Time' in df else 0.0

# 處理 Laps, PTS 缺失值與數值轉換
df['Laps'] = pd.to_numeric(df['Laps'], errors='coerce').fillna(0)
df['PTS'] = pd.to_numeric(df['PTS'], errors='coerce').fillna(0)

# === 5. 選擇特徵與標準化 ===
feature_cols = ['car_enc', 'gp_enc', 'year', 'Laps', 'PTS', 'fastestLapTime_sec']
feature_cols = [col for col in feature_cols if col in df.columns]
X = df[feature_cols].fillna(0).values
y = df['winner_enc'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

# === 6. 製作 LSTM 時序資料 ===
def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i + time_steps])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 5
X_seq, y_seq = create_sequences(X, y, time_steps)
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42, shuffle=False)

# === 7. LSTM Model 定義 ===
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, num_classes)
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

input_size = X_train.shape[2]
hidden_size = 64
num_layers = 2
num_classes = len(np.unique(y))
model = LSTMModel(input_size, hidden_size, num_layers, num_classes)

# === 8. 資料格式轉換與訓練準備 ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

X_train_torch = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_torch = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_torch = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_torch = torch.tensor(y_test, dtype=torch.long).to(device)

# === 9. 訓練迴圈 ===
num_epochs = 15000
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    loss.backward()
    optimizer.step()
    # 驗證
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_torch)
        test_loss = criterion(test_outputs, y_test_torch)
        _, predicted = torch.max(test_outputs, 1)
        acc = (predicted == y_test_torch).float().mean()
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Test Acc: {acc:.4f}")

# === 10. 預測與車手名稱還原 ===
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_torch)
    _, predicted = torch.max(test_outputs, 1)
    predicted_driver = le_driver.inverse_transform(predicted.cpu().numpy())
    true_driver = le_driver.inverse_transform(y_test)
    print("前10筆預測 vs. 真實：")
    for i in range(10):
        print(f"Predicted Winner: {predicted_driver[i]}, True Winner: {true_driver[i]}")


Epoch [1/15000], Train Loss: 4.7619, Test Acc: 0.0000
Epoch [2/15000], Train Loss: 4.7545, Test Acc: 0.0000
Epoch [3/15000], Train Loss: 4.7474, Test Acc: 0.0000
Epoch [4/15000], Train Loss: 4.7412, Test Acc: 0.0000
Epoch [5/15000], Train Loss: 4.7333, Test Acc: 0.0000
Epoch [6/15000], Train Loss: 4.7246, Test Acc: 0.0000
Epoch [7/15000], Train Loss: 4.7155, Test Acc: 0.0000
Epoch [8/15000], Train Loss: 4.7082, Test Acc: 0.0000
Epoch [9/15000], Train Loss: 4.6985, Test Acc: 0.0000
Epoch [10/15000], Train Loss: 4.6859, Test Acc: 0.0000
Epoch [11/15000], Train Loss: 4.6717, Test Acc: 0.0045
Epoch [12/15000], Train Loss: 4.6619, Test Acc: 0.0271
Epoch [13/15000], Train Loss: 4.6452, Test Acc: 0.0769
Epoch [14/15000], Train Loss: 4.6268, Test Acc: 0.0905
Epoch [15/15000], Train Loss: 4.6040, Test Acc: 0.0950
Epoch [16/15000], Train Loss: 4.5827, Test Acc: 0.0769
Epoch [17/15000], Train Loss: 4.5565, Test Acc: 0.0498
Epoch [18/15000], Train Loss: 4.5241, Test Acc: 0.0181
Epoch [19/15000], T

In [13]:
print(df['Winner'].value_counts())
print(df[feature_cols].isnull().sum())
print(df[feature_cols].describe())


Winner
Lewis  Hamilton         103
Michael  Schumacher      91
Max  Verstappen          58
Sebastian  Vettel        53
Alain  Prost             51
                       ... 
Carlos  Pace              1
Jo  Bonnier               1
Rodger  Ward              1
Jimmy  Bryan              1
Lando  Norris             1
Name: count, Length: 115, dtype: int64
car_enc               0
gp_enc                0
year                  0
Laps                  0
PTS                   0
fastestLapTime_sec    0
dtype: int64
           car_enc       gp_enc         year         Laps          PTS  \
count  1110.000000  1110.000000  1110.000000  1110.000000  1110.000000   
mean     33.131532    22.135135  1992.092793    64.479279   132.987838   
std      16.245046    13.908361    20.443139    20.516791   136.271880   
min       0.000000     0.000000  1950.000000     0.000000     0.000000   
25%      17.000000    10.000000  1976.000000    54.000000    42.000000   
50%      36.000000    19.000000  1994.000000 