In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import xgboost as xgb
import gradio as gr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ==== 1. 讀取資料 ====
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')

winners['year'] = pd.to_datetime(winners['Date']).dt.year

# ==== 2. 合併資料（以 Winner+Car+year 為主鍵）====
df = winners.merge(drivers, left_on=['Winner', 'Car', 'year'], right_on=['Driver', 'Car', 'year'], how='left')
df = df.merge(laps, left_on=['Grand Prix', 'Winner', 'Car', 'year'], right_on=['Grand Prix', 'Driver', 'Car', 'year'], how='left')
df = df.merge(teams, left_on=['Car', 'year'], right_on=['Team', 'year'], how='left')

main_features = ['Grid', 'Laps', 'Time', 'Position']
label_col = 'Winner'

for col in main_features:
    if col not in df.columns:
        df[col] = 0
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Label Encoding
le_label = LabelEncoder()
df['Winner_enc'] = le_label.fit_transform(df[label_col].astype(str))
le_gp = LabelEncoder()
df['GrandPrix_enc'] = le_gp.fit_transform(df['Grand Prix'].astype(str))
le_team = LabelEncoder()
df['Team_enc'] = le_team.fit_transform(df['Car'].astype(str)) # 用Car代表Team

def get_driver_seq_features(df, driver, year, n_seq=3):
    df_hist = df[(df[label_col] == driver) & (df['year'] < year)].sort_values('year', ascending=False)
    if len(df_hist) >= n_seq:
        feats = df_hist.head(n_seq)[main_features].values
    else:
        feats = np.vstack([df_hist[main_features].values, np.zeros((n_seq-len(df_hist), len(main_features)))])
    return feats

X_seq, X_static, y = [], [], []
n_seq = 3
for i, row in df.iterrows():
    X_seq.append(get_driver_seq_features(df, row[label_col], row['year'], n_seq))
    # 靜態特徵為 [year, GrandPrix編碼, Team編碼]
    X_static.append([row['year'], row['GrandPrix_enc'], row['Team_enc']])
    y.append(row['Winner_enc'])
X_seq = np.stack(X_seq)
X_static = np.array(X_static)
y = np.array(y)

vc = pd.Series(y).value_counts()
valid_labels = vc[vc > 1].index.tolist()
mask = [yy in valid_labels for yy in y]
X_seq_valid = X_seq[mask]
X_static_valid = X_static[mask]
y_valid_raw = y[mask]
le_valid = LabelEncoder()
y_valid = le_valid.fit_transform(y_valid_raw)

class LSTMFeatureExtractor(nn.Module):
    def __init__(self, input_size, hidden_size=16, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        return h_n[-1]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm = LSTMFeatureExtractor(input_size=len(main_features)).to(device)
with torch.no_grad():
    lstm_features = lstm(torch.tensor(X_seq_valid, dtype=torch.float32).to(device)).cpu().numpy()

X_all_valid = np.hstack([lstm_features, X_static_valid])
X_train, X_test, y_train, y_test = train_test_split(
    X_all_valid, y_valid, test_size=0.2, random_state=42, stratify=y_valid
)

model = xgb.XGBClassifier(
    n_estimators=120, learning_rate=0.1, max_depth=4,
    objective='multi:softprob', num_class=len(le_valid.classes_)
)
model.fit(X_train, y_train)

mean_laps = int(df['Laps'].mean())
mean_time = int(df['Time'].mean())
mean_position = int(df['Position'].mean()) if df['Position'].mean() > 0 else 1

def predict_and_analysis(driver_name, year, grid, grand_prix, team):
    # 歷史三場特徵
    feats_seq = get_driver_seq_features(df, driver_name, int(year), n_seq)
    feats_seq[-1] = [grid, mean_laps, mean_time, mean_position]
    X_seq_input = torch.tensor([feats_seq], dtype=torch.float32).to(device)
    with torch.no_grad():
        lstm_feat = lstm(X_seq_input).cpu().numpy()
    # Grand Prix、Team 編碼
    gp_enc = le_gp.transform([grand_prix])[0]
    team_enc = le_team.transform([team])[0]
    X_static_input = np.array([[year, gp_enc, team_enc]])
    X_input = np.hstack([lstm_feat, X_static_input])
    probs = model.predict_proba(X_input)
    top3 = np.argsort(probs[0])[::-1][:3]
    orig_names = [le_label.inverse_transform([int(le_valid.classes_[i])])[0] for i in top3]
    probs_top = probs[0][top3]
    txt = "\n".join([f"{orig_names[i]}: 機率 {probs_top[i]:.2%}" for i in range(3)])
    return f"預測TOP3：\n{txt}"

with gr.Blocks() as demo:
    gr.Markdown("# F1 冠軍預測（LSTM + XGBoost）")
    with gr.Row():
        driver = gr.Dropdown(list(le_label.classes_), label="車手")
        year = gr.Number(label="年份", value=2024)
        grid = gr.Number(label="起跑位", value=1)
        grand_prix = gr.Dropdown(list(le_gp.classes_), label="Grand Prix")
        team = gr.Dropdown(list(le_team.classes_), label="Team")
    btn = gr.Button("預測")
    output = gr.Textbox(label="預測結果")
    btn.click(
        predict_and_analysis,
        [driver, year, grid, grand_prix, team],
        output
    )

    def show_analysis():
        idx = np.random.choice(len(X_test), 10, replace=False)
        Xsub = X_test[idx]
        ysub = y_test[idx]
        pred_sub = model.predict(Xsub)
        orig = [le_label.inverse_transform([int(le_valid.classes_[yy])])[0] for yy in ysub]
        pred = [le_label.inverse_transform([int(le_valid.classes_[yy])])[0] for yy in pred_sub]
        txt = "\n".join([f"第{i+1}筆：預測 {p} | 真實 {t}" for i, (p, t) in enumerate(zip(pred, orig))])
        return txt

    with gr.Row():
        gr.Markdown("## 驗證集 10 筆：預測 vs. 真實")
        analysis_btn = gr.Button("隨機取10筆驗證")
        analysis_out = gr.Textbox()
        analysis_btn.click(show_analysis, [], analysis_out)

demo.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




  X_seq_input = torch.tensor([feats_seq], dtype=torch.float32).to(device)
