In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import xgboost as xgb
import gradio as gr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ==== 1. Load the raw tables ====
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')

# The season is derived from each race date so it can be used as a join key.
winners['year'] = pd.to_datetime(winners['Date']).dt.year

# ==== 2. Join the tables (keyed on Winner + Car + year) ====
# Every join is a left join, so each row of `winners` is kept even when the
# other tables have no matching record.
df = (
    winners
    .merge(
        drivers,
        how='left',
        left_on=['Winner', 'Car', 'year'],
        right_on=['Driver', 'Car', 'year'],
    )
    .merge(
        laps,
        how='left',
        left_on=['Grand Prix', 'Winner', 'Car', 'year'],
        right_on=['Grand Prix', 'Driver', 'Car', 'year'],
    )
    .merge(
        teams,
        how='left',
        left_on=['Car', 'year'],
        right_on=['Team', 'year'],
    )
)

# ==== 3. Simple feature design ====
label_col = 'Winner'
main_features = [
    'Grid',      # starting grid slot
    'Laps',      # laps completed
    'Time',      # race time
    'Position',  # finishing position in that race
]

# Guarantee every feature column exists and is numeric: a column that is
# absent after the joins is created as all zeros, and values that cannot be
# parsed as numbers (or are missing) become 0.
for col in main_features:
    if col not in df.columns:
        df[col] = 0
for col in main_features:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Integer-encode the winner names; `le_label` maps codes back to names.
le_label = LabelEncoder().fit(df[label_col].astype(str))
df['Winner_enc'] = le_label.transform(df[label_col].astype(str))

# ==== 4. Sequence features from a driver's previous N races ====
def get_driver_seq_features(df, driver, year, n_seq=3):
    """Return the feature rows of ``driver``'s most recent earlier entries.

    Rows of ``df`` whose ``label_col`` value equals ``driver`` and whose
    ``year`` is strictly smaller than ``year`` are ordered by ``year``
    descending; the ``main_features`` values of the first ``n_seq`` of them
    are returned.  When fewer than ``n_seq`` such rows exist, rows of zeros
    are appended so the result still has ``n_seq`` rows.

    Reads the module-level ``label_col`` and ``main_features``.
    """
    is_earlier_entry = (df[label_col] == driver) & (df['year'] < year)
    history = df[is_earlier_entry].sort_values('year', ascending=False)

    n_missing = n_seq - len(history)
    if n_missing <= 0:
        return history.head(n_seq)[main_features].values

    # Not enough history: pad the bottom with zero rows.
    padding = np.zeros((n_missing, len(main_features)))
    return np.vstack([history[main_features].values, padding])

# One sample per row of df: the winner's padded history, the season as the
# only static feature, and the encoded winner as the target.
n_seq = 3
race_rows = [race for _, race in df.iterrows()]
X_seq = np.stack([
    get_driver_seq_features(df, race[label_col], race['year'], n_seq)
    for race in race_rows
])
X_static = np.array([[race['year']] for race in race_rows])
y = np.array([race['Winner_enc'] for race in race_rows])

# ==== 5. Drop labels that occur only once, then re-encode ====
# Only labels with more than one sample are kept; the stratified split
# further down is applied to the re-encoded labels.
vc = pd.Series(y).value_counts()
repeat_winners = vc[vc > 1]
valid_labels = repeat_winners.index.tolist()
mask = np.isin(y, valid_labels)
X_seq_valid = X_seq[mask]
X_static_valid = X_static[mask]
y_valid_raw = y[mask]

# Re-index the surviving labels to 0..K-1; `le_valid.classes_` holds the
# corresponding `le_label` codes.
le_valid = LabelEncoder().fit(y_valid_raw)
y_valid = le_valid.transform(y_valid_raw)

# ==== 6. LSTM feature extraction ====
class LSTMFeatureExtractor(nn.Module):
    """Wrap a batch-first ``nn.LSTM`` and expose its final hidden state."""

    def __init__(self, input_size, hidden_size=16, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the last layer's final hidden state.

        For a batched input ``(batch, seq_len, input_size)`` the result has
        shape ``(batch, hidden_size)``.
        """
        _sequence_output, final_state = self.lstm(x)
        hidden, _cell = final_state
        return hidden[-1]

# The LSTM below is never trained in this cell: it acts as a fixed, randomly
# initialised projection of each history sequence.  Seeding PyTorch before it
# is constructed makes its weights -- and therefore the features handed to
# XGBoost -- reproducible from run to run, in line with the fixed
# random_state used for the split.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm = LSTMFeatureExtractor(input_size=len(main_features)).to(device)
lstm.eval()  # inference only; no parameter of this module is ever updated
with torch.no_grad():
    seq_tensor = torch.tensor(X_seq_valid, dtype=torch.float32).to(device)
    lstm_features = lstm(seq_tensor).cpu().numpy()

# Final design matrix: LSTM hidden state followed by the static columns.
X_all_valid = np.hstack([lstm_features, X_static_valid])
X_train, X_test, y_train, y_test = train_test_split(
    X_all_valid, y_valid, test_size=0.2, random_state=42, stratify=y_valid
)

# ==== 7. Train XGBoost ====
model = xgb.XGBClassifier(
    n_estimators=120, learning_rate=0.1, max_depth=4,
    objective='multi:softprob', num_class=len(le_valid.classes_)
)
model.fit(X_train, y_train)

# ==== 8. Analysis / visualisation ====
def top3_predict_probs(X):
    """Return the most probable winners for the first row of ``X``.

    Args:
        X: Feature matrix accepted by ``model.predict_proba``; only the
            first row's probabilities are used.

    Returns:
        A list of ``(driver_name, probability)`` tuples ordered from most to
        least probable.  It holds three entries, or fewer when the model
        has fewer than three classes.
    """
    probs = model.predict_proba(X)[0]
    top3 = np.argsort(probs)[::-1][:3]
    # Two-step decode, done once for the whole batch:
    # model class index -> original `le_label` code -> driver name.
    original_codes = le_valid.classes_[top3].astype(int)
    driver_names = le_label.inverse_transform(original_codes)
    return [(name, float(probs[i])) for name, i in zip(driver_names, top3)]

def predict_and_analysis(driver_name, year, grid, laps, time, position):
    """Predict the most likely winners for ``driver_name``'s history in ``year``.

    The driver's previous entries (strictly before ``year``) are turned into
    a sequence, passed through the LSTM, joined with the year and scored by
    the XGBoost model.

    Args:
        driver_name: Driver whose history is looked up in ``df``.
        year: Season to predict for; converted with ``int()`` because the
            UI may deliver it as a float.
        grid, laps, time, position: Accepted so the signature matches the
            UI wiring, but not used by the prediction.

    Returns:
        A multi-line string listing up to three drivers with their
        predicted probabilities.
    """
    season = int(year)
    feats_seq = get_driver_seq_features(df, driver_name, season, n_seq)

    # Add the batch axis in NumPy first; building a tensor from a Python
    # list of arrays is slow and triggers a PyTorch warning.
    seq_batch = np.asarray(feats_seq, dtype=np.float32)[np.newaxis, ...]
    X_seq_input = torch.tensor(seq_batch, dtype=torch.float32).to(device)
    with torch.no_grad():
        lstm_feat = lstm(X_seq_input).cpu().numpy()

    # Use the same integer season as the history lookup above.
    X_static_input = np.array([[season]])
    X_input = np.hstack([lstm_feat, X_static_input])

    probs = model.predict_proba(X_input)[0]
    top = np.argsort(probs)[::-1][:3]
    # Map model class index -> original `le_label` code -> driver name.
    orig_names = le_label.inverse_transform(le_valid.classes_[top].astype(int))

    # Iterate over what is actually available instead of assuming 3 classes.
    txt = "\n".join(f"{name}: 機率 {probs[i]:.2%}" for name, i in zip(orig_names, top))
    return f"預測奪冠機率：\n{txt}"

# ==== 9. Gradio web UI ====
with gr.Blocks() as demo:
    gr.Markdown("# F1 冠軍預測（LSTM + XGBoost）")
    with gr.Row():
        # NOTE: `laps` below rebinds the module-level name that held the
        # fastest-laps DataFrame; that table is not read again in this cell.
        driver = gr.Dropdown(list(le_label.classes_), label="車手")
        year = gr.Number(label="年份", value=2024)
        grid = gr.Number(label="起跑位", value=1)
        laps = gr.Number(label="圈數", value=56)
        time = gr.Number(label="比賽時間（秒）", value=5400)
        position = gr.Number(label="名次", value=1)
    btn = gr.Button("預測")
    output = gr.Textbox(label="預測結果")
    btn.click(
        predict_and_analysis,
        [driver, year, grid, laps, time, position],
        output
    )

    # Prediction-vs-truth spot check on the held-out split.
    def show_analysis():
        """Compare predictions with true labels on random held-out rows.

        Samples up to 10 distinct rows of ``X_test`` (fewer when the split
        is smaller) and returns one text line per row.  Returns an empty
        string when the split is empty.
        """
        # Sampling without replacement cannot draw more rows than exist.
        n_show = min(10, len(X_test))
        if n_show == 0:
            return ""
        idx = np.random.choice(len(X_test), n_show, replace=False)
        pred_sub = model.predict(X_test[idx])

        def to_names(class_ids):
            # model class index -> original `le_label` code -> driver name
            codes = le_valid.classes_[np.asarray(class_ids, dtype=int)]
            return le_label.inverse_transform(codes.astype(int))

        orig = to_names(y_test[idx])
        pred = to_names(pred_sub)
        txt = "\n".join([f"第{i+1}筆：預測 {p} | 真實 {t}" for i, (p, t) in enumerate(zip(pred, orig))])
        return txt

    with gr.Row():
        gr.Markdown("## 驗證集 10 筆：預測 vs. 真實")
        analysis_btn = gr.Button("隨機取10筆驗證")
        analysis_out = gr.Textbox()
        analysis_btn.click(show_analysis, [], analysis_out)

demo.launch()


* Running on local URL:  http://127.0.0.1:7907

To create a public link, set `share=True` in `launch()`.




In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import top_k_accuracy_score
import gradio as gr

# === 1. Multi-task LSTM model ===
class MultiTaskLSTMClassifier(nn.Module):
    """Shared batch-first LSTM encoder feeding three linear heads.

    The heads produce logits for the driver, team and position targets from
    the LSTM's final hidden state.
    """

    def __init__(self, input_size, hidden_size, num_classes_driver, num_classes_team, num_classes_pos):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.driver_head = nn.Linear(in_features=hidden_size, out_features=num_classes_driver)
        self.team_head = nn.Linear(in_features=hidden_size, out_features=num_classes_team)
        self.pos_head = nn.Linear(in_features=hidden_size, out_features=num_classes_pos)

    def forward(self, x):
        """Return ``(driver_logits, team_logits, position_logits)`` for ``x``."""
        _sequence_output, (hidden, _cell) = self.lstm(x)
        summary = hidden[-1]
        heads = (self.driver_head, self.team_head, self.pos_head)
        return tuple(head(summary) for head in heads)

# === 2. Load and prepare the data ===
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')

winners['year'] = pd.to_datetime(winners['Date']).dt.year
df = winners.merge(drivers, left_on=['Winner', 'Car', 'year'], right_on=['Driver', 'Car', 'year'], how='left')
# Explicit suffixes: when a non-key column exists on both sides, the left copy
# keeps its plain name and the copy from the lap / team table gets "_lap" /
# "_team".  With pandas' default ("_x", "_y") the "*_lap" / "*_team" columns
# used below are never created, which is what raised KeyError: 'PTS_team'.
# NOTE(review): this assumes the lap table shares a 'Time' column and the team
# table shares 'PTS' / 'Pos' columns with the left side -- confirm against the CSVs.
df = df.merge(
    laps,
    left_on=['Grand Prix', 'Winner', 'Car', 'year'],
    right_on=['Grand Prix', 'Driver', 'Car', 'year'],
    how='left',
    suffixes=('', '_lap'),
)
df = df.merge(
    teams,
    left_on=['Car', 'year'],
    right_on=['Team', 'year'],
    how='left',
    suffixes=('', '_team'),
)

suffixed_cols = ['PTS_team', 'Time_lap', 'Pos_team']
main_features = ['PTS', 'Grid', 'Laps', 'Time', 'Position', 'PTS_team', 'Pos_team']

# Same guard as the first cell: a column that is still absent after the joins
# is created as all zeros instead of raising KeyError further down.
for col in dict.fromkeys(suffixed_cols + main_features):
    if col not in df.columns:
        df[col] = 0

for col in suffixed_cols:
    df[col] = df[col].fillna(0)

# Force every feature to numeric; values that cannot be parsed as numbers
# (or are missing) become 0.
for col in main_features:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# String versions of the three prediction targets.
df['Winner_enc'] = df['Winner'].astype(str)
df['Team_enc'] = df['Team'].astype(str)
df['Position_enc'] = df['Position'].astype(str)

# Keep only rows whose driver, team and position values each occur more
# than once in the data.
multi_driver = df['Winner_enc'].value_counts()
multi_driver = multi_driver[multi_driver > 1].index
multi_team = df['Team_enc'].value_counts()
multi_team = multi_team[multi_team > 1].index
multi_position = df['Position_enc'].value_counts()
multi_position = multi_position[multi_position > 1].index

df_filtered = df[
    (df['Winner_enc'].isin(multi_driver)) &
    (df['Team_enc'].isin(multi_team)) &
    (df['Position_enc'].isin(multi_position))
].copy()

n_seq = 3  # length of each history sequence


def get_driver_seq_features(df, driver, year, n_seq=3):
    """Return ``n_seq`` rows of ``main_features`` from ``driver``'s earlier entries.

    Rows of ``df`` whose ``Winner_enc`` equals ``driver`` and whose ``year``
    is strictly smaller than ``year`` are ordered by ``year`` descending and
    the first ``n_seq`` are taken.  If fewer exist, rows of zeros are
    appended to reach ``n_seq`` rows.

    Reads the module-level ``main_features``.
    """
    same_driver = df['Winner_enc'] == driver
    earlier_season = df['year'] < year
    past = df[same_driver & earlier_season].sort_values('year', ascending=False)

    window = past.iloc[:n_seq][main_features].values
    shortfall = n_seq - len(window)
    if shortfall <= 0:
        return window

    # Not enough history: pad the bottom with zero rows.
    return np.vstack([window, np.zeros((shortfall, len(main_features)))])

# One sample per row of df_filtered: the driver's padded history, the season
# as a static feature, and the three string targets.
samples = [sample for _, sample in df_filtered.iterrows()]

X_seq = np.stack([
    get_driver_seq_features(df_filtered, sample['Winner_enc'], sample['year'], n_seq)
    for sample in samples
])
X_static = np.array([[sample['year']] for sample in samples])
y_driver = np.array([sample['Winner_enc'] for sample in samples])
y_team = np.array([sample['Team_enc'] for sample in samples])
y_position = np.array([sample['Position_enc'] for sample in samples])

# Integer-encode each target; the fitted encoders are kept to decode
# predictions later.
le_driver, le_team, le_position = LabelEncoder(), LabelEncoder(), LabelEncoder()
y_driver_enc = le_driver.fit_transform(y_driver)
y_team_enc = le_team.fit_transform(y_team)
y_position_enc = le_position.fit_transform(y_position)

# === 3. Train the model, or load a saved checkpoint ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiTaskLSTMClassifier(
    input_size=len(main_features),
    hidden_size=64,
    num_classes_driver=len(le_driver.classes_),
    num_classes_team=len(le_team.classes_),
    num_classes_pos=len(le_position.classes_)
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

X_tensor = torch.tensor(X_seq, dtype=torch.float32).to(device)
y_driver_tensor = torch.tensor(y_driver_enc, dtype=torch.long).to(device)
y_team_tensor = torch.tensor(y_team_enc, dtype=torch.long).to(device)
y_pos_tensor = torch.tensor(y_position_enc, dtype=torch.long).to(device)

model_path = "./model/lstm_multi_task.pt"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

if os.path.exists(model_path):
    print("🚀 載入已訓練模型")
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    # NOTE(review): an existing checkpoint is loaded as-is; one saved for a
    # different set of classes will not match this model -- delete the file
    # to retrain after the data changes.
    model.load_state_dict(torch.load(model_path, map_location=device))
else:
    # Full-batch training: every step uses the whole dataset, and the loss
    # is the unweighted sum of the three heads' cross-entropies.
    for epoch in range(300):
        model.train()
        optimizer.zero_grad()
        out_driver, out_team, out_pos = model(X_tensor)
        loss = (
            criterion(out_driver, y_driver_tensor)
            + criterion(out_team, y_team_tensor)
            + criterion(out_pos, y_pos_tensor)
        )
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 50 == 0:
            # Training-set accuracy of the driver head, computed from the
            # logits of this step's forward pass (before the weight update).
            with torch.no_grad():
                probs = out_driver.detach().softmax(dim=1).cpu().numpy()
                acc1 = top_k_accuracy_score(y_driver_enc, probs, k=1)
                acc5 = top_k_accuracy_score(y_driver_enc, probs, k=5)
            print(f"Epoch {epoch+1}/300 | Loss: {loss.item():.4f} | Top-1: {acc1:.3f} | Top-5: {acc5:.3f}")

    torch.save(model.state_dict(), model_path)
    print("✅ 模型儲存完成")

# === 4. Inference ===
# Driver -> team lookup built from the filtered data.  When a driver appears
# with several teams, the pair that comes last after drop_duplicates() wins.
driver_to_team_map = (
    df_filtered[['Winner_enc', 'Team_enc']]
    .drop_duplicates()
    .set_index('Winner_enc')['Team_enc']
    .to_dict()
)


def predict_position(year, pts, grid, laps, time, position, pts_team, pos_team):
    """Return a text report of the model's top driver / team / position guesses.

    The seven numeric inputs are arranged in the same order as
    ``main_features`` and repeated ``n_seq`` times to form the input
    sequence.

    Args:
        year: Accepted so the signature matches the UI wiring; not used.
        pts, grid, laps, time, position, pts_team, pos_team: Feature values
            for the race to score.

    Returns:
        A multi-line string with up to five ranked entries.  Each entry
        shows a driver with its probability, the team mapped to that driver
        with that team's probability, and the i-th most likely position
        label with its probability.
    """
    input_feat = np.array([[pts, grid, laps, time, position, pts_team, pos_team]])
    input_seq = np.repeat(input_feat, n_seq, axis=0)[np.newaxis, ...]
    input_seq_tensor = torch.tensor(input_seq, dtype=torch.float32).to(device)

    model.eval()
    with torch.no_grad():
        out_driver, out_team, out_pos = model(input_seq_tensor)
        probs_driver = torch.softmax(out_driver, dim=1).cpu().numpy()[0]
        probs_team = torch.softmax(out_team, dim=1).cpu().numpy()[0]
        probs_pos = torch.softmax(out_pos, dim=1).cpu().numpy()[0]

    # Slicing yields fewer than five entries when a head has fewer classes.
    top_driver_idx = np.argsort(probs_driver)[::-1][:5]
    top_pos_idx = np.argsort(probs_pos)[::-1][:5]

    top_driver_names = le_driver.inverse_transform(top_driver_idx)
    try:
        top_pos_labels = le_position.inverse_transform(top_pos_idx)
    except ValueError:
        top_pos_labels = [f"未知({i})" for i in top_pos_idx]

    # Class index of every team, so the probability reported next to a team
    # name is that team's own probability.
    team_class_index = {str(name): k for k, name in enumerate(le_team.classes_)}

    output_lines = []
    for i, (driver, driver_idx) in enumerate(zip(top_driver_names, top_driver_idx)):
        team = driver_to_team_map.get(driver, "未知車隊")
        if team in team_class_index:
            team_prob = probs_team[team_class_index[team]]
        else:
            team_prob = 0.0  # no team class corresponds to this name

        if i < len(top_pos_idx):
            pos = top_pos_labels[i]
            pos_prob = probs_pos[top_pos_idx[i]]
        else:
            # The position head has fewer classes than ranks shown.
            pos = "未知"
            pos_prob = 0.0

        output_lines.append(
            f"排名預測 #{i+1}：\n"
            f"駕駛：{driver} (機率 {probs_driver[driver_idx]:.2%})\n"
            f"車隊：{team} (機率 {team_prob:.2%})\n"
            f"排名：{pos} (機率 {pos_prob:.2%})\n"
            "-------------------------"
        )
    return "\n".join(output_lines)

# === 5. Gradio UI ===
# Eight numeric inputs in a single row, a button, and a text box showing the
# report returned by predict_position.
with gr.Blocks() as demo:
    gr.Markdown("# F1 冠軍及排名預測 (LSTM 多任務)")
    with gr.Row():
        year_input = gr.Number(value=2024, label="年份")
        pts_input = gr.Number(value=25, label="PTS")
        grid_input = gr.Number(value=1, label="起跑位")
        laps_input = gr.Number(value=56, label="圈數")
        time_input = gr.Number(value=5400, label="比賽時間（秒）")
        position_input = gr.Number(value=1, label="名次")
        pts_team_input = gr.Number(value=400, label="車隊分數")
        pos_team_input = gr.Number(value=1, label="車隊名次")

    # Listed in the order of predict_position's parameters.
    feature_inputs = [
        year_input,
        pts_input,
        grid_input,
        laps_input,
        time_input,
        position_input,
        pts_team_input,
        pos_team_input,
    ]
    predict_btn = gr.Button("預測")
    output = gr.Textbox(lines=20, label="預測結果")
    predict_btn.click(fn=predict_position, inputs=feature_inputs, outputs=output)

demo.launch()


KeyError: 'PTS_team'