In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import torch
import torch.nn as nn
import gradio as gr
from collections import Counter

# ===== 1. Data merging and column preparation =====
# Load the four source tables used throughout the notebook.
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')

# The race year is one of the join keys for every lookup table below.
winners['year'] = pd.to_datetime(winners['Date']).dt.year

# Left joins: a winners row is kept even when a lookup table has no match
# (the looked-up columns are then NaN).
df = pd.merge(
    winners,
    drivers,
    how='left',
    left_on=['Winner', 'Car', 'year'],
    right_on=['Driver', 'Car', 'year'],
)
# Columns of `laps` that clash with existing names receive the `_lap` suffix.
df = pd.merge(
    df,
    laps,
    how='left',
    left_on=['Grand Prix', 'Winner', 'Car', 'year'],
    right_on=['Grand Prix', 'Driver', 'Car', 'year'],
    suffixes=('', '_lap'),
)
# Columns of `teams` that clash with existing names receive the `_team` suffix.
df = pd.merge(
    df,
    teams,
    how='left',
    left_on=['Car', 'year'],
    right_on=['Team', 'year'],
    suffixes=('', '_team'),
)

# Replace nationality text with integer codes (missing values are encoded
# through their string form).
df['Nationality'] = LabelEncoder().fit_transform(df['Nationality'].astype(str))

def time_to_seconds(t):
    """Convert a lap-time string such as ``"1:23.456"`` to seconds.

    Accepted layouts are ``SS.fff``, ``M:SS.fff`` and ``H:MM:SS.fff``. The
    part after the dot is read as a decimal fraction of a second, so it may
    have any number of digits (``"1:23.4"`` is 83.4 s, not 83.004 s).

    Args:
        t: Lap time as text, or any value whose ``str()`` has one of the
            layouts above. Missing values (``None`` / ``NaN``) are allowed.

    Returns:
        The time in seconds as a float, or ``0`` when ``t`` is missing or
        does not match one of the accepted layouts.
    """
    if pd.isna(t):
        return 0

    # Everything before the last colon is whole minutes / hours; the last
    # field holds the seconds together with their decimal fraction.
    *leading, last = str(t).strip().split(':')
    if len(leading) > 2:
        return 0

    # Validate up front so malformed text yields the same 0 fallback as a
    # missing value instead of raising from int()/float().
    well_formed = last.replace('.', '', 1).isdecimal() and all(
        field.isdecimal() for field in leading
    )
    if not well_formed:
        return 0

    seconds = float(last)
    # Walk outwards from the seconds field: minutes weigh 60, hours 3600.
    for weight, field in zip((60, 3600), reversed(leading)):
        seconds += weight * int(field)
    return seconds
# Turn the fastest-lap time column into a numeric number of seconds.
df['Time_lap'] = df['Time_lap'].apply(time_to_seconds)

# Integer-encode the Grand Prix name; `le_gp` is kept so the same mapping can
# be applied to user input later.
le_gp = LabelEncoder()
df['Grand_Prix_code'] = le_gp.fit_transform(df['Grand Prix'].astype(str))

xgb_features = ['Pos', 'PTS', 'Grand_Prix_code', 'Time_lap', 'Pos_team', 'PTS_team']

# If the suffixed team columns are absent, reuse the unsuffixed columns so
# every name in `xgb_features` exists.
for team_col, fallback_col in (('Pos_team', 'Pos'), ('PTS_team', 'PTS')):
    if team_col not in df:
        df[team_col] = df[fallback_col]

# Force every model feature to float; unparsable or missing entries become 0.
for feature in xgb_features:
    numeric = pd.to_numeric(df[feature], errors='coerce')
    df[feature] = numeric.fillna(0).astype(float)

# Encode every winner name as an integer label.
le_all_drivers = LabelEncoder()
df['Winner_label_full'] = le_all_drivers.fit_transform(df['Winner'])
X_xgb = df[xgb_features]
y_full = df['Winner_label_full']

# Keep only labels that occur at least twice (needed for the stratified
# split performed later).
y_series = pd.Series(y_full)
value_counts = y_series.value_counts()
multi_winner_labels = value_counts[value_counts >= 2].index
mask = y_series.isin(multi_winner_labels)
X_xgb_filtered = X_xgb.loc[mask]
y_filtered_raw = y_series.loc[mask].values

# Names of the retained drivers, plus a second encoder that re-numbers the
# retained labels contiguously from 0.
multi_winner_driver_names = le_all_drivers.inverse_transform(multi_winner_labels)
le_y = LabelEncoder()
y_filtered = le_y.fit_transform(y_filtered_raw)

# ===== 2. LSTM features =====
# Each sample is a window of a driver's N previous rows in `df`; the label is
# the driver of the row that follows the window.
N = 5
seq_features = ['PTS', 'Pos', 'Time_lap']
sequences, sequence_labels_raw = [], []
for driver in multi_winner_driver_names:
    # kind='stable' keeps rows of the same year in their existing order; the
    # default sort algorithm is not stable and may shuffle them, which would
    # scramble the windows.
    # NOTE(review): assumes the rows of `df` are already in chronological
    # order within a year — TODO confirm against winners.csv.
    df_driver = df[df['Winner'] == driver].sort_values('year', kind='stable')
    # Extract the arrays once per driver instead of once per window.
    driver_values = df_driver[seq_features].to_numpy(dtype=float)
    driver_labels = df_driver['Winner_label_full'].to_numpy()
    for i in range(N, len(df_driver)):
        # Every row here belongs to a driver taken from
        # `multi_winner_driver_names`, so its label is always one of
        # `multi_winner_labels`; no per-row membership check is needed.
        sequences.append(driver_values[i - N:i])
        sequence_labels_raw.append(driver_labels[i])
X_lstm = np.array(sequences)
# Map the full-label ids onto the contiguous ids used by the classifiers.
sequence_labels = le_y.transform(sequence_labels_raw)
y_lstm = np.array(sequence_labels)
# Drop classes with a single sequence (the later split is stratified).
lstm_counts = Counter(y_lstm)
keep_labels = {k for k, v in lstm_counts.items() if v >= 2}
keep_idx = [i for i, label in enumerate(y_lstm) if label in keep_labels]
X_lstm_filtered = X_lstm[keep_idx]
y_lstm_filtered = y_lstm[keep_idx]

# ===== 3. XGBoost / LSTM models =====
# Stratified 80/20 split of the tabular features, with a fixed seed.
xgb_split = train_test_split(
    X_xgb_filtered,
    y_filtered,
    test_size=0.2,
    random_state=42,
    stratify=y_filtered,
)
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = xgb_split

# Gradient-boosted classifier with library-default hyperparameters.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train_xgb, y_train_xgb)

class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by a linear layer producing class logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step."""
        hidden_states, _ = self.lstm(x)
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# One input per sequence feature, one output logit per class known to `le_y`.
input_size = len(seq_features)
hidden_size = 64
num_classes = len(le_y.classes_)
model_lstm = LSTMClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=1e-3)

# Training is skipped when no sequences survived filtering; the model then
# keeps its initial weights.
if len(X_lstm_filtered) > 0:
    X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
        X_lstm_filtered,
        y_lstm_filtered,
        test_size=0.2,
        random_state=42,
        stratify=y_lstm_filtered,
    )
    X_train_lstm_torch = torch.tensor(X_train_lstm, dtype=torch.float32)
    y_train_lstm_torch = torch.tensor(y_train_lstm, dtype=torch.long)

    # Ten full-batch gradient steps over the whole training split.
    model_lstm.train()
    for epoch in range(10):
        output = model_lstm(X_train_lstm_torch)
        loss = criterion(output, y_train_lstm_torch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# ===== 4. Sources for the year / Grand Prix / driver / team pick lists =====
def _sorted_unique(column):
    """Return the distinct values of *column* as an ascending Python list."""
    return sorted(column.unique().tolist())


all_years = _sorted_unique(df['year'])
all_grand_prix = _sorted_unique(df['Grand Prix'])
# Driver and team names come from the raw lookup tables, not from `df`.
all_drivers_list = _sorted_unique(drivers['Driver'].dropna())
all_teams_list = _sorted_unique(teams['Team'].dropna())

def get_driver_features(driver, team, year, grand_prix, grid_pos):
    """Build the six-value XGBoost feature row for one driver entry.

    The values are ordered as in ``xgb_features``: position (taken from
    ``grid_pos``), driver points, Grand Prix code, fastest-lap seconds, team
    position and team points.

    Args:
        driver: Driver name, matched against ``df['Driver']``.
        team: Team name, matched against ``df['Car']``.
        year: Season, matched against ``df['year']``.
        grand_prix: Grand Prix name, matched against ``df['Grand Prix']``.
        grid_pos: Value used for the first feature; converted with ``float``.

    Returns:
        A list of six numbers. When no row of ``df`` matches all four keys,
        every value except the grid position and the Grand Prix code is 0.
    """
    # A Grand Prix the encoder has never seen falls back to code 0.
    if grand_prix in le_gp.classes_:
        gp_code = le_gp.transform([grand_prix])[0]
    else:
        gp_code = 0

    selector = (
        (df['Driver'] == driver)
        & (df['Car'] == team)
        & (df['year'] == year)
        & (df['Grand Prix'] == grand_prix)
    )
    matches = df[selector]
    if len(matches) == 0:
        return [float(grid_pos), 0, gp_code, 0, 0, 0]

    # Only the first matching row is used.
    record = matches.iloc[0]

    def _value_or_zero(column):
        # Missing values count as 0.
        value = record[column]
        return 0 if pd.isna(value) else float(value)

    return [
        float(grid_pos),
        _value_or_zero('PTS'),
        gp_code,
        _value_or_zero('Time_lap'),
        _value_or_zero('Pos_team'),
        _value_or_zero('PTS_team'),
    ]

def predict_race_block(year, grand_prix, *driver_inputs):
    """Rank the entered drivers by their blended XGBoost/LSTM win probability.

    Args:
        year: Season forwarded to ``get_driver_features``.
        grand_prix: Grand Prix name forwarded to ``get_driver_features``.
        *driver_inputs: Flat sequence of ``driver, team, grid_pos`` triples,
            one triple per grid slot. Any number of complete triples is
            accepted; an incomplete trailing triple is ignored. Slots whose
            driver is empty are skipped.

    Returns:
        A list of ``(driver, probability)`` tuples sorted by descending
        probability. Drivers that are not in ``multi_winner_driver_names``
        (and therefore unknown to both models) are listed with 0.0.
    """
    # NOTE(review): the LSTM is fed an all-zero placeholder sequence rather
    # than the driver's real history, so its output is the same for every
    # driver. Evaluate it once, without autograd tracking.
    model_lstm.eval()
    with torch.no_grad():
        lstm_seq = torch.zeros((1, N, len(seq_features)), dtype=torch.float32)
        prob_lstm = torch.softmax(model_lstm(lstm_seq), dim=1).numpy()[0]

    results = []
    # Walk the flat argument list three values at a time; zip stops at the
    # last complete triple, so a short argument list cannot raise IndexError.
    triples = zip(driver_inputs[0::3], driver_inputs[1::3], driver_inputs[2::3])
    for driver, team, grid_pos in triples:
        if not driver:
            continue
        if driver not in multi_winner_driver_names:
            results.append((driver, 0.0))
            continue

        features = get_driver_features(driver, team, year, grand_prix, grid_pos)
        prob_xgb = model_xgb.predict_proba(np.array([features]))[0]

        # Name -> full label id -> contiguous class id shared by both models.
        full_label = le_all_drivers.transform([driver])[0]
        driver_label = le_y.transform([full_label])[0]

        # Unweighted average of the two models' probabilities for this driver.
        final_prob = (prob_xgb[driver_label] + prob_lstm[driver_label]) / 2
        results.append((driver, float(final_prob)))
    return sorted(results, key=lambda x: x[1], reverse=True)

# Gradio UI: a year / Grand Prix selector, twenty driver-team-grid rows, a
# results table and a button that runs `predict_race_block`.
with gr.Blocks() as demo:
    gr.Markdown("# F1 冠軍預測（可自由組合車手、車隊與起跑位）")
    with gr.Row():
        year_input = gr.Dropdown(all_years, value=all_years[-1], label="年份")
        grand_prix_input = gr.Dropdown(all_grand_prix, value=all_grand_prix[-1], label="Grand Prix")
    gr.Markdown("請依序選擇車手、車隊、起跑位（預設前20位車手）")

    # One row per grid slot; the grid-position dropdown defaults to the
    # slot's own number.
    driver_dropdowns, team_dropdowns, pos_dropdowns = [], [], []
    for i in range(20):
        with gr.Row():
            driver_dropdowns.append(gr.Dropdown(all_drivers_list, label=f"車手{i+1}"))
            team_dropdowns.append(gr.Dropdown(all_teams_list, label=f"車隊{i+1}"))
            pos_dropdowns.append(
                gr.Dropdown(list(range(1, 21)), value=i+1, label=f"排位{i+1}")
            )
    out_table = gr.Dataframe(headers=["車手", "奪冠機率"], type="array")

    # The callback receives year, Grand Prix, then driver/team/grid triples
    # in row order.
    inputs_list = [year_input, grand_prix_input]
    for row_components in zip(driver_dropdowns, team_dropdowns, pos_dropdowns):
        inputs_list.extend(row_components)

    predict_button = gr.Button("預測冠軍機率")
    predict_button.click(
        predict_race_block,
        inputs=inputs_list,
        outputs=out_table,
    )
demo.launch()


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


