In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import gradio as gr
import random

# 資料讀取
winners = pd.read_csv('./data/winners.csv')
drivers = pd.read_csv('./data/drivers_updated.csv')
laps = pd.read_csv('./data/fastest_laps_updated.csv')
teams = pd.read_csv('./data/teams_updated.csv')
winners['year'] = pd.to_datetime(winners['Date']).dt.year

# 時間轉換函數
def time_to_seconds(t):
    if pd.isna(t): return 0
    t = str(t)
    if ':' in t:
        parts = t.split(':')
        if len(parts) == 2:
            m, s = parts
            return int(m)*60 + float(s)
        elif len(parts) == 3:
            h, m, s = parts
            return int(h)*3600 + int(m)*60 + float(s)
    try:
        return float(t)
    except:
        return 0
laps['Time_sec'] = laps['Time'].apply(time_to_seconds)

# 特徵統計
driver_gp_win_count = winners.groupby(['Winner', 'Grand Prix']).size().unstack(fill_value=0)
team_gp_win_count = winners.groupby(['Car', 'Grand Prix']).size().unstack(fill_value=0)
driver_gp_best_lap = laps.groupby(['Driver', 'Grand Prix'])['Time_sec'].min().unstack(fill_value=0)
driver_year_pts = drivers.groupby(['Driver', 'year'])['PTS'].max().unstack(fill_value=0)
team_year_pts = teams.groupby(['Team', 'year'])['PTS'].max().unstack(fill_value=0)
driver_total_win = winners.groupby('Winner').size()
team_total_win = winners.groupby('Car').size()

# 編碼器
le_gp = LabelEncoder()
le_gp.fit(winners['Grand Prix'])
le_driver = LabelEncoder()
le_driver.fit(drivers['Driver'])
le_team = LabelEncoder()
le_team.fit(teams['Team'])

# 車隊與車手列表
all_teams_list = sorted(teams['Team'].dropna().unique().tolist())
all_drivers_list = sorted(drivers['Driver'].dropna().unique().tolist())
all_years = sorted(winners['year'].unique().tolist())
all_grand_prix = sorted(winners['Grand Prix'].unique().tolist())

# 建立訓練資料與標籤，並同時儲存車手名稱供後續使用
feature_rows = []
labels = []
group_sizes = []
driver_names_list = []

for (year, gp), group in winners.groupby(['year', 'Grand Prix']):
    drivers_info = []
    labels_group = []
    for _, row in group.iterrows():
        driver = row['Winner']
        team = row['Car']

        d_gp_win = driver_gp_win_count.loc[driver, gp] if (driver in driver_gp_win_count.index and gp in driver_gp_win_count.columns) else 0
        t_gp_win = team_gp_win_count.loc[team, gp] if (team in team_gp_win_count.index and gp in team_gp_win_count.columns) else 0
        best_lap = driver_gp_best_lap.loc[driver, gp] if (driver in driver_gp_best_lap.index and gp in driver_gp_best_lap.columns) else 0
        pts = driver_year_pts.loc[driver, year] if (driver in driver_year_pts.index and year in driver_year_pts.columns) else 0
        pts_team = team_year_pts.loc[team, year] if (team in team_year_pts.index and year in team_year_pts.columns) else 0
        d_total_win = driver_total_win.loc[driver] if driver in driver_total_win.index else 0
        t_total_win = team_total_win.loc[team] if team in team_total_win.index else 0

        drivers_info.append((driver, team, d_gp_win, t_gp_win, best_lap, pts, pts_team, d_total_win, t_total_win))
        labels_group.append(1)

    # 負樣本：同年該場其他車手
    others = drivers[(drivers['year']==year) & (~drivers['Driver'].isin([d[0] for d in drivers_info]))]
    for _, other in others.iterrows():
        driver = other['Driver']
        team = other['Car']

        d_gp_win = driver_gp_win_count.loc[driver, gp] if (driver in driver_gp_win_count.index and gp in driver_gp_win_count.columns) else 0
        t_gp_win = team_gp_win_count.loc[team, gp] if (team in team_gp_win_count.index and gp in team_gp_win_count.columns) else 0
        best_lap = driver_gp_best_lap.loc[driver, gp] if (driver in driver_gp_best_lap.index and gp in driver_gp_best_lap.columns) else 0
        pts = driver_year_pts.loc[driver, year] if (driver in driver_year_pts.index and year in driver_year_pts.columns) else 0
        pts_team = team_year_pts.loc[team, year] if (team in team_year_pts.index and year in team_year_pts.columns) else 0
        d_total_win = driver_total_win.loc[driver] if driver in driver_total_win.index else 0
        t_total_win = team_total_win.loc[team] if team in team_total_win.index else 0

        drivers_info.append((driver, team, d_gp_win, t_gp_win, best_lap, pts, pts_team, d_total_win, t_total_win))
        labels_group.append(0)

    for d in drivers_info:
        year_code = year
        gp_code = le_gp.transform([gp])[0]
        driver_code = le_driver.transform([d[0]])[0] if d[0] in le_driver.classes_ else -1
        team_code = le_team.transform([d[1]])[0] if d[1] in le_team.classes_ else -1

        feature_rows.append([
            year_code, gp_code, driver_code, team_code,
            d[2], d[3], d[4], d[5], d[6], d[7], d[8]
        ])
        driver_names_list.append(d[0])
    labels.extend(labels_group)
    group_sizes.append(len(drivers_info))

df_feat = pd.DataFrame(feature_rows, columns=[
    'year', 'gp_code', 'driver_code', 'team_code',
    'driver_gp_win', 'team_gp_win', 'best_lap', 'pts', 'pts_team', 'driver_total_win', 'team_total_win'
])
df_feat['Driver'] = driver_names_list

X = df_feat.drop(columns=['Driver']).values
y = np.array(labels)
group_sizes = np.array(group_sizes)

# 分割資料集
indices = np.arange(len(group_sizes))
np.random.seed(42)
np.random.shuffle(indices)
split_idx = int(len(group_sizes)*0.8)
train_idx = indices[:split_idx]
test_idx = indices[split_idx:]

def subset_by_group_idx(X, y, group_sizes, idxs):
    group_bounds = np.cumsum(group_sizes)
    X_list, y_list = [], []
    start = 0
    for i, end in enumerate(group_bounds):
        if i in idxs:
            X_list.append(X[start:end])
            y_list.append(y[start:end])
        start = end
    return np.vstack(X_list), np.hstack(y_list), np.array([group_sizes[i] for i in idxs])

X_train, y_train, train_groups = subset_by_group_idx(X, y, group_sizes, train_idx)
X_test, y_test, test_groups = subset_by_group_idx(X, y, group_sizes, test_idx)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'map',
    'eta': 0.1,
    'max_depth': 6,
    'verbosity': 1
}
model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train'), (dtest, 'test')])

def random_sample_predictions(model, X, y, group_sizes, df_feat, le_driver, le_gp, sample_num=10):
    group_bounds = np.cumsum(group_sizes)
    n_groups = len(group_sizes)
    sampled_idx = random.sample(range(n_groups), sample_num)
    start_idx = 0
    results = []
    for i, end_idx in enumerate(group_bounds):
        if i in sampled_idx:
            X_group = X[start_idx:end_idx]
            dgroup = xgb.DMatrix(X_group)
            preds = model.predict(dgroup)
            exp_preds = np.exp(preds - np.max(preds))
            probs = exp_preds / exp_preds.sum()
            group_data = df_feat.iloc[start_idx:end_idx]
            drivers = group_data['Driver'].values
            grand_prix_code = group_data['gp_code'].iloc[0]
            grand_prix_name = le_gp.inverse_transform([grand_prix_code])[0]
            max_idx = np.argmax(probs)
            pred_winner = drivers[max_idx]

            true_idx = np.where(y[start_idx:end_idx]==1)[0]
            if len(true_idx) > 0:
                true_winner = drivers[true_idx[0]]
            else:
                true_winner = "Unknown"

            results.append({
                'year': group_data['year'].iloc[0],
                'Grand Prix': grand_prix_name,
                'Predicted Winner': pred_winner,
                'True Winner': true_winner,
                'Probability': probs[max_idx]
            })
        start_idx = end_idx
    return results

def print_sampled_results():
    results = random_sample_predictions(model, X_test, y_test, test_groups, df_feat, le_driver, le_gp, sample_num=10)
    out = ""
    for res in results:
        out += f"{res['year']} {res['Grand Prix']}\n"
        out += f"Predicted Winner: {res['Predicted Winner']} , True Winner: {res['True Winner']} , Probability: {res['Probability']:.4f}\n\n"
    return out

def predict_prob(year, grand_prix, *inputs):
    drivers = []
    teams = []
    for i in range(20):
        drivers.append(inputs[i*2])
        teams.append(inputs[i*2+1])
    feats = []
    for d, t in zip(drivers, teams):
        d_gp_win = driver_gp_win_count.get(grand_prix, {}).get(d, 0) if grand_prix in driver_gp_win_count.columns else 0
        t_gp_win = team_gp_win_count.get(grand_prix, {}).get(t, 0) if grand_prix in team_gp_win_count.columns else 0
        best_lap = driver_gp_best_lap.get(d, {}).get(grand_prix, 0) if grand_prix in driver_gp_best_lap.columns else 0
        pts = driver_year_pts.get(d, {}).get(year, 0) if year in driver_year_pts.columns else 0
        pts_team = team_year_pts.get(t, {}).get(year, 0) if year in team_year_pts.columns else 0
        d_total_win = driver_total_win.get(d, 0)
        t_total_win = team_total_win.get(t, 0)
        year_code = year
        gp_code = le_gp.transform([grand_prix])[0] if grand_prix in le_gp.classes_ else 0
        driver_code = le_driver.transform([d])[0] if d in le_driver.classes_ else -1
        team_code = le_team.transform([t])[0] if t in le_team.classes_ else -1
        feats.append([
            year_code, gp_code, driver_code, team_code,
            d_gp_win, t_gp_win, best_lap, pts, pts_team, d_total_win, t_total_win
        ])
    feats = np.array(feats)
    dmatrix = xgb.DMatrix(feats)
    preds = model.predict(dmatrix)
    exp_preds = np.exp(preds - np.max(preds))
    probs = exp_preds / exp_preds.sum()
    return sorted(zip(drivers, probs), key=lambda x: x[1], reverse=True)

with gr.Blocks() as demo:
    gr.Markdown("# F1 冠軍預測多樣性學習")
    with gr.Row():
        year_input = gr.Dropdown(all_years, value=all_years[-1], label="年份")
        grand_prix_input = gr.Dropdown(all_grand_prix, value=all_grand_prix[-1], label="分站")
    driver_dropdowns, team_dropdowns = [], []
    for i in range(20):
        with gr.Row():
            driver_dd = gr.Dropdown(all_drivers_list, label=f"車手{i+1}")
            team_dd = gr.Dropdown(all_teams_list, label=f"車隊{i+1}")
            driver_dropdowns.append(driver_dd)
            team_dropdowns.append(team_dd)
    out_table = gr.Dataframe(headers=["車手", "奪冠機率"], type="array")
    result_box = gr.Textbox(label="隨機10場測試預測結果", lines=15)
    inputs_list = [year_input, grand_prix_input]
    for i in range(20):
        inputs_list += [driver_dropdowns[i], team_dropdowns[i]]
    gr.Button("預測冠軍機率").click(predict_prob, inputs=inputs_list, outputs=out_table)
    gr.Button("列印隨機10場測試預測").click(print_sampled_results, outputs=result_box)
demo.launch()


[0]	train-map:0.79405	test-map:0.75961
[1]	train-map:0.80674	test-map:0.79269
[2]	train-map:0.82029	test-map:0.79219
[3]	train-map:0.81647	test-map:0.78003
[4]	train-map:0.82229	test-map:0.78626
[5]	train-map:0.82697	test-map:0.78634
[6]	train-map:0.82020	test-map:0.79298
[7]	train-map:0.83038	test-map:0.80364
[8]	train-map:0.83823	test-map:0.80023
[9]	train-map:0.83845	test-map:0.80773
[10]	train-map:0.83907	test-map:0.81100
[11]	train-map:0.83986	test-map:0.81415
[12]	train-map:0.84178	test-map:0.80822
[13]	train-map:0.84639	test-map:0.80372
[14]	train-map:0.84730	test-map:0.81047
[15]	train-map:0.84762	test-map:0.81460
[16]	train-map:0.84884	test-map:0.81126
[17]	train-map:0.84951	test-map:0.81152
[18]	train-map:0.85342	test-map:0.81212
[19]	train-map:0.85285	test-map:0.80935
[20]	train-map:0.85191	test-map:0.80785
[21]	train-map:0.85361	test-map:0.80785
[22]	train-map:0.85398	test-map:0.81250
[23]	train-map:0.85587	test-map:0.82038
[24]	train-map:0.85634	test-map:0.81588
[25]	train

