# 2024 1st place solution
Silver Based Predictions

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy.stats import linregress
from tqdm import tqdm

# Make the project root importable so Hydra can import the `conf` config module.
# Guarded so that re-running this cell does not keep appending duplicate entries.
WORKSPACE_ROOT = os.path.abspath("/workspace")
if WORKSPACE_ROOT not in sys.path:
    sys.path.append(WORKSPACE_ROOT)

with initialize_config_module(version_base=None, config_module="conf"):
    cfg = compose(
        config_name="base.yaml",
        overrides=[
            "exp=exp002"  # override the default experiment: exp001 -> exp002
        ],
    )

    # Resolve interpolations in place, then show the final configuration.
    OmegaConf.resolve(cfg)
    print("Omega conf is below")
    print(OmegaConf.to_yaml(cfg))

# Keys and values can be inspected like this:
#   print(cfg.dir.keys())
#   print(cfg.exp)

INPUT_PATH = cfg.dir.input_dir
OUTPUT_PATH = cfg.dir.output_dir
SRC_PATH = cfg.dir.src_path
CONF_PATH = cfg.dir.conf_path

# Expose project sources to later imports (again without duplicating on re-run).
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

Omega conf is below
dir:
  input_dir: ../input
  output_dir: ../output/2025-02-15_20-29-48
  src_path: ../src
  conf_path: ../conf
model:
  epochs: 10
  lr: 0.001
exp:
  model:
    hidden_size: 100
    learning_rate: 0.1
    batch_size: 640
    num_epochs: 50



In [5]:
# Load every CSV found in INPUT_PATH into `dfs`, keyed by file name without ".csv".
# os.listdir() returns entries in arbitrary order, so sort for a stable result.
files = sorted(os.listdir(INPUT_PATH))

# Strip only the trailing ".csv". str.replace() would also delete ".csv" occurring
# earlier in the name, and the path rebuilt below would then miss the real file.
df_names = [f[: -len(".csv")] for f in files if f.endswith(".csv")]

dfs = {}
for name in df_names:
    filepath = os.path.join(INPUT_PATH, name + ".csv")
    dfs[name] = pd.read_csv(filepath, low_memory=False, encoding="latin-1")

# Disabled loader for the Silver ratings files (hard-coded Kaggle paths), kept for
# reference only:
#   msilver = pd.read_csv("/kaggle/input/msilver/msilver.csv")
#   msilver = msilver.iloc[:, [0, 13]]
#   msilver.columns = ["TeamName", "Power"]
#   wsilver = pd.read_csv("/kaggle/input/wsilver/wsilver.csv")
#   wsilver = wsilver.iloc[:, [0, 10]]
#   wsilver.columns = ["TeamName", "Power"]
#
#   print(msilver.head())
#   print(wsilver.head())


# Hand-built ratings table, one record per (tournament, team name) entry.
# NOTE(review): this looks like placeholder data standing in for the real ratings — confirm.
rating_records = [
    {"Tournament": "W", "TeamName": "South Carolina", "TeamID": 3376, "Power": 150},
    {"Tournament": "W", "TeamName": "north carolina", "TeamID": 3399, "Power": 120},
    {"Tournament": "M", "TeamName": "uconn", "TeamID": 1163, "Power": 130},
    {"Tournament": "M", "TeamName": "Houston", "TeamID": 1200, "Power": 135},
    {"Tournament": "M", "TeamName": "conn. something", "TeamID": 1163, "Power": 128},
]
tourney_seeds_with_ratings = pd.DataFrame(rating_records)

# Women's teams whose name contains "carolina" (case-insensitive).
is_womens = tourney_seeds_with_ratings["Tournament"].eq("W")
mentions_carolina = tourney_seeds_with_ratings["TeamName"].str.contains("carolina", case=False)
filtered_df_w = tourney_seeds_with_ratings[is_womens & mentions_carolina]
print(filtered_df_w)

# Men's teams whose name contains "conn" (case-insensitive).
is_mens = tourney_seeds_with_ratings["Tournament"].eq("M")
mentions_conn = tourney_seeds_with_ratings["TeamName"].str.contains("conn", case=False)
filtered_df_m = tourney_seeds_with_ratings[is_mens & mentions_conn]
print(filtered_df_m)

# Manual override: force Power to 200 for the two selected TeamIDs.
# This runs after the filtered frames above were taken, so they keep the old values.
overridden_team_ids = [3376, 1163]
gets_override = tourney_seeds_with_ratings["TeamID"].isin(overridden_team_ids)
tourney_seeds_with_ratings.loc[gets_override, "Power"] = 200

print(tourney_seeds_with_ratings)


  Tournament        TeamName  TeamID  Power
0          W  South Carolina    3376    150
1          W  north carolina    3399    120
  Tournament         TeamName  TeamID  Power
2          M            uconn    1163    130
4          M  conn. something    1163    128
  Tournament         TeamName  TeamID  Power
0          W   South Carolina    3376    200
1          W   north carolina    3399    120
2          M            uconn    1163    200
3          M          Houston    1200    135
4          M  conn. something    1163    200


In [None]:
import numpy as np
import pandas as pd
import math
from math import erf, sqrt

# Standard normal CDF, Φ(x)
def pnorm(x):
    """Return Φ(x), the standard normal cumulative distribution function at ``x``.

    Evaluated through the error function as ``(1 + erf(x / sqrt(2))) / 2``.
    ``x`` must be a scalar that ``math.erf`` accepts.
    """
    scaled = x / math.sqrt(2)
    return (1 + erf(scaled)) / 2


# --- msilver_wpct: win probability for the "M" bracket ---
def msilver_wpct(pwr1, pwr2):
    """Return the probability that the team rated ``pwr1`` beats the team rated ``pwr2``.

    The rating difference (treated as the predicted point margin) is divided by
    11.0 and passed through the standard normal CDF.
    """
    return pnorm((pwr1 - pwr2) / 11.0)


# --- wsilver_wpct: win probability for the "W" bracket ---
def wsilver_wpct(pwr1, pwr2, home=0):
    """Return the probability that the team rated ``pwr1`` beats the team rated ``pwr2``.

    ``home`` is 1 when team 1 plays at home, 0 on a neutral court and -1 when
    team 1 is away; it shifts the rating difference by 2.73 points per unit
    before the margin is scaled by 11.5 and passed through the normal CDF.
    """
    home_edge = 2.73 * home
    return pnorm((pwr1 - pwr2 + home_edge) / 11.5)


# --- wpct_function: dispatch on the bracket ---
def wpct_function(bracket, pwr1, pwr2, home=0):
    """Return team 1's win probability using the model for ``bracket``.

    ``"M"`` uses ``msilver_wpct`` (which ignores ``home``); every other value
    falls through to ``wsilver_wpct`` with the given ``home`` indicator.
    """
    if bracket != "M":
        return wsilver_wpct(pwr1, pwr2, home)
    return msilver_wpct(pwr1, pwr2)

# -------------------------------------------------------
# DataFrame processing from here on (the equivalent of R's dplyr).
# This example assumes tourney_seeds, sample_submission and
# tourney_seeds_with_ratings already exist.
# -------------------------------------------------------

# -- R: tourney_seeds %>% mutate(team_region = substr(Seed, 1, 1),
#                                team_rank = as.numeric(substr(Seed, 2, 3)))
seed_codes = tourney_seeds["Seed"]
tourney_seeds["team_region"] = seed_codes.str.slice(0, 1)  # first character
tourney_seeds["team_rank"] = pd.to_numeric(seed_codes.str.slice(1, 3))  # characters 2-3 as a number

# -- R:
# sample_submission %>%
#   mutate(Round = substr(Slot, 2, 2),
#          Region = ifelse(Round <= 4, substr(Slot, 3, 3), substr(Slot, 3, 4)),
#          Highest_Rank = substr(Slot, 4, 4))
#
# R's `substr(Slot, 2, 2)` is the 2nd character of Slot; Python strings are
# 0-indexed, so that is the slice [1:2]. The extracted text is then converted
# to a number. `assign` returns a new frame, leaving sample_submission untouched.
games_to_predict = sample_submission.assign(
    Round=lambda submission: pd.to_numeric(submission["Slot"].str.slice(1, 2))
)

def calc_region(row):
    """Return the region code embedded in ``row["Slot"]``.

    Mirrors R's ``ifelse(Round <= 4, substr(Slot, 3, 3), substr(Slot, 3, 4))``:
    for rounds up to 4 the region is the 3rd character of the slot, for later
    rounds it is the 3rd and 4th characters.
    """
    region_end = 3 if row["Round"] <= 4 else 4
    return row["Slot"][2:region_end]

# Region comes from the row-wise helper; Highest_Rank is the 4th character of Slot.
region_by_row = games_to_predict.apply(calc_region, axis=1)
fourth_slot_char = games_to_predict["Slot"].str.slice(3, 4)
games_to_predict["Region"] = region_by_row
games_to_predict["Highest_Rank"] = fourth_slot_char

# -- R: full_join(games_to_predict, tourney_seeds, by=c("Tournament"))
# In pandas a full join is a merge with how="outer". Joining on Tournament alone
# pairs every game slot with every seeded team of the same tournament.
games_to_predict_with_teams = games_to_predict.merge(
    tourney_seeds,
    how="outer",
    on="Tournament",
)

# -------------------------------------------------------
# Filtering: region based
# -------------------------------------------------------
# R:
#  games_to_predict_filtered_by_region =
#    games_to_predict_with_teams %>%
#      filter(
#         (Round <= 4 & Region == team_region) |
#         (Region == "WX" & team_region %in% c("W", "X")) |
#         (Region == "YZ" & team_region %in% c("Y", "Z")) |
#         (Region == "CH")
#      )
round_number = games_to_predict_with_teams["Round"]
slot_region = games_to_predict_with_teams["Region"]
team_region = games_to_predict_with_teams["team_region"]

# Rounds up to 4: the team must belong to the slot's own region.
same_region = round_number.le(4) & slot_region.eq(team_region)
# Combined-region slots accept teams from either of their two regions.
wx_slot = slot_region.eq("WX") & team_region.isin(["W", "X"])
yz_slot = slot_region.eq("YZ") & team_region.isin(["Y", "Z"])
# The "CH" slot accepts every team.
ch_slot = slot_region.eq("CH")

region_match = same_region | wx_slot | yz_slot | ch_slot
games_to_predict_filtered_by_region = games_to_predict_with_teams.loc[region_match].copy()

# -------------------------------------------------------
# Filtering: slot based
# -------------------------------------------------------
# R:
#   mutate(first_round_game = pmin(team_rank, 17-team_rank),
#          second_round_game = pmin(first_round_game, 9-first_round_game),
#          third_round_game = pmin(second_round_game, 5-second_round_game)) %>%
#   filter(
#     Round >= 4 |
#     (Round == 1 & Highest_Rank == first_round_game) |
#     (Round == 2 & Highest_Rank == second_round_game) |
#     (Round == 3 & Highest_Rank == third_round_game)
#   )

df_slot = games_to_predict_filtered_by_region.copy()

# Fold each seed rank onto the game number it plays in rounds 1-3
# (element-wise minimum, the vectorized equivalent of R's pmin).
df_slot["first_round_game"] = np.minimum(df_slot["team_rank"], 17 - df_slot["team_rank"])
df_slot["second_round_game"] = np.minimum(
    df_slot["first_round_game"], 9 - df_slot["first_round_game"]
)
df_slot["third_round_game"] = np.minimum(
    df_slot["second_round_game"], 5 - df_slot["second_round_game"]
)

# Highest_Rank is the 4th slot character. It is a digit for slots like "R1W1" but
# a letter for late-round slots such as "R5WX" or "R6CH", which are still present
# in this frame. astype(int) would raise on those, so coerce non-numeric values to
# NaN instead; NaN never equals a game number, and those rows are kept through
# the Round >= 4 condition anyway.
highest_rank = pd.to_numeric(df_slot["Highest_Rank"], errors="coerce")

cond_round_ge4 = (df_slot["Round"] >= 4)
cond_round1 = (df_slot["Round"] == 1) & (highest_rank == df_slot["first_round_game"])
cond_round2 = (df_slot["Round"] == 2) & (highest_rank == df_slot["second_round_game"])
cond_round3 = (df_slot["Round"] == 3) & (highest_rank == df_slot["third_round_game"])

games_to_predict_filtered_by_region_slot = df_slot[
    cond_round_ge4 | cond_round1 | cond_round2 | cond_round3
].copy()

# -------------------------------------------------------
# adding in HFA (home-field advantage) for women's bracket
# -------------------------------------------------------
# R:
#   mutate(at_home = ifelse(Round <= 2 & Tournament == "W" & team_rank <= 4, "yes", "no"))
def check_at_home(row):
    """Return ``"yes"`` if the row's team is flagged as playing at home, else ``"no"``.

    A team is flagged only when all three hold: the round is 1 or 2, the
    tournament is ``"W"`` and the team's seed rank is 4 or better.
    """
    plays_at_home = (
        row["Round"] <= 2 and row["Tournament"] == "W" and row["team_rank"] <= 4
    )
    return "yes" if plays_at_home else "no"

# `assign` works on a copy, so the slot-filtered frame itself is left unchanged.
games_to_predict_with_possible_winners = games_to_predict_filtered_by_region_slot.assign(
    at_home=lambda games: games.apply(check_at_home, axis=1)
)

# -------------------------------------------------------
# Finally, attach each team's rating.
# R:
#   left_join(..., tourney_seeds_with_ratings %>% select(TeamID, Power), by="TeamID")
# -------------------------------------------------------
# The ratings table may list the same TeamID more than once (e.g. under several
# spellings of the team name). Joining it as-is would duplicate game rows and
# corrupt the pairwise matchups built later, so keep one rating per TeamID
# (the first occurrence wins).
team_power = tourney_seeds_with_ratings[["TeamID", "Power"]].drop_duplicates(subset="TeamID")

games_to_predict_with_possible_winners = pd.merge(
    games_to_predict_with_possible_winners,
    team_power,
    on="TeamID",
    how="left",
    validate="many_to_one",  # guard: the join must never multiply game rows
)

# This completes the processing equivalent to the R code.


In [None]:
import numpy as np
import pandas as pd
import random

# -------------------------
# wpct_function as defined in the R code, implemented here as:
#   bracket=="M" -> msilver_wpct
#   bracket=="W" -> wsilver_wpct
# These definitions replace the same-named helpers from the earlier cell.
# -------------------------

def msilver_wpct(pwr1, pwr2):
    """Return the probability that the team rated ``pwr1`` beats the team rated ``pwr2``.

    The rating difference (treated as the predicted point margin) is divided by
    11.0 and passed through the standard normal CDF,
    ``0.5 * (1 + erf(t / sqrt(2)))``.
    """
    pred_pt_margin = (pwr1 - pwr2)
    tscore = pred_pt_margin / 11.0
    # Use the stdlib `math` module: the `np.math` alias was removed from NumPy,
    # so `np.math.erf` raises AttributeError on current releases.
    # (scipy.stats.norm.cdf(tscore) would be an equivalent alternative.)
    return 0.5 * (1 + math.erf(tscore / math.sqrt(2)))

def wsilver_wpct(pwr1, pwr2, home=0):
    """Return the probability that the team rated ``pwr1`` beats the team rated ``pwr2``.

    ``home`` is 1 when team 1 plays at home, 0 on a neutral court and -1 when
    team 1 is away; it shifts the rating difference by 2.73 points per unit.
    The shifted margin is divided by 11.5 and passed through the standard
    normal CDF.
    """
    # home = 1 (home), 0 (neutral), -1 (away)
    hfa = 2.73 * home
    tscore = (pwr1 - pwr2 + hfa) / 11.5
    # Use the stdlib `math` module: the `np.math` alias was removed from NumPy,
    # so `np.math.erf` raises AttributeError on current releases.
    return 0.5 * (1 + math.erf(tscore / math.sqrt(2)))

def wpct_function(bracket, pwr1, pwr2, home=0):
    """Return team 1's win probability using the model that matches ``bracket``.

    ``"M"`` selects ``msilver_wpct``, which takes no home indicator; any other
    value selects ``wsilver_wpct`` and forwards ``home``.
    """
    use_mens_model = bracket == "M"
    return (
        msilver_wpct(pwr1, pwr2)
        if use_mens_model
        else wsilver_wpct(pwr1, pwr2, home)
    )


# ------------------------------------------------
# Bracket simulation (port of the R for-loop)
# ------------------------------------------------

number_of_brackets = 5000

# Dedicated, seeded generator so that Restart & Run All reproduces the same brackets.
SIMULATION_SEED = 42
sim_rng = random.Random(SIMULATION_SEED)

# Loop-invariant lookups, built once instead of once per bracket and round.
games_by_round = {
    rnd: games_to_predict_with_possible_winners[
        games_to_predict_with_possible_winners["Round"] == rnd
    ]
    for rnd in range(1, 7)
}
seed_by_team = dict(zip(tourney_seeds["TeamID"], tourney_seeds["Seed"]))

# One frame per simulated bracket. They are concatenated once after the loop,
# because growing a DataFrame with pd.concat inside the loop is quadratic.
bracket_frames = []

for bracket_num in range(1, number_of_brackets + 1):
    # Teams entering round 1.  R: filter(Round == 1) %>% pull(TeamID)
    possible_winners = games_by_round[1]["TeamID"].tolist()

    # Results of every round of this bracket.
    round_frames = []

    # At most six rounds (R: for(rnd in 1:6)).
    for rnd in range(1, 7):
        # Keep only teams still alive in this round.
        # R: filter(Round == rnd, TeamID %in% possible_winners)
        round_games = games_by_round[rnd]
        round_possible_winners = round_games[round_games["TeamID"].isin(possible_winners)]

        # Pair the teams sharing a slot (TeamID.x vs TeamID.y) via a self-join.
        # R: left_join(..., by=c("Tournament","Slot")) then filter(TeamID.x < TeamID.y),
        # which keeps each pairing once and drops a team paired with itself.
        round_possible_winners_wide = pd.merge(
            round_possible_winners,
            round_possible_winners,
            on=["Tournament", "Slot"],
            how="inner",
            suffixes=(".x", ".y"),
        )
        round_possible_winners_wide = round_possible_winners_wide[
            round_possible_winners_wide["TeamID.x"] < round_possible_winners_wide["TeamID.y"]
        ].copy()

        # Home indicator, as in R's case_when (first matching condition wins):
        #   at_home.x == "yes" ->  1
        #   at_home.y == "yes" -> -1
        #   otherwise          ->  0
        round_possible_winners_wide["home"] = np.select(
            [
                round_possible_winners_wide["at_home.x"] == "yes",
                round_possible_winners_wide["at_home.y"] == "yes",
            ],
            [1, -1],
            default=0,
        )

        # Win probability of team x.
        # R: mutate(wpct = wpct_function(Tournament, Power.x, Power.y, home))
        round_possible_winners_wide["wpct"] = [
            wpct_function(tournament, power_x, power_y, home)
            for tournament, power_x, power_y, home in zip(
                round_possible_winners_wide["Tournament"],
                round_possible_winners_wide["Power.x"],
                round_possible_winners_wide["Power.y"],
                round_possible_winners_wide["home"],
            )
        ]

        # A missing rating yields a NaN probability; fail with a clear message
        # rather than sampling from an undefined distribution.
        if round_possible_winners_wide["wpct"].isna().any():
            raise ValueError(
                f"Missing Power rating for a team in round {rnd}; cannot simulate the game."
            )

        # Draw each winner: team x wins with probability wpct, otherwise team y.
        # R: sample(x=c(TeamID.x, TeamID.y), size=1, prob=c(wpct, 1-wpct))
        winners = [
            team_x if sim_rng.random() < p_x else team_y
            for team_x, team_y, p_x in zip(
                round_possible_winners_wide["TeamID.x"],
                round_possible_winners_wide["TeamID.y"],
                round_possible_winners_wide["wpct"],
            )
        ]

        # Only the winners advance to the next round.
        possible_winners = winners

        # Record each winner's Seed as the slot's "Team".
        # R: left_join(data.frame(TeamID=winners), tourney_seeds, by="TeamID") %>% pull(Seed)
        round_possible_winners_wide["Team"] = [
            seed_by_team.get(team_id, np.nan) for team_id in winners
        ]

        # R: new_predicted_games = select(Tournament, Slot, Team)
        round_frames.append(round_possible_winners_wide[["Tournament", "Slot", "Team"]])

    # R: predicted_games = rbind(predicted_games, new_predicted_games)
    predicted_games = pd.concat(round_frames, ignore_index=True)

    # Tag the games with the simulation number.
    predicted_games["Bracket"] = bracket_num
    bracket_frames.append(predicted_games)

# Combine all simulated brackets in a single concat.
sim_brackets_columns = ["Tournament", "Slot", "Team", "Bracket"]
if bracket_frames:
    sim_brackets = pd.concat(bracket_frames, ignore_index=True)[sim_brackets_columns]
else:
    sim_brackets = pd.DataFrame(columns=sim_brackets_columns)

# Elapsed-time measurement is omitted (the R code used Sys.time() - t).

# ------------------------------------------------
# CSV output (equivalent of R's write.csv())
# ------------------------------------------------
# RowId is a 1-based running index over all simulated games.
sim_brackets["RowId"] = range(1, len(sim_brackets) + 1)
submission_columns = ["RowId", "Tournament", "Bracket", "Slot", "Team"]
sim_brackets_output = sim_brackets.loc[:, submission_columns].copy()
sim_brackets_output.to_csv("submission.csv", index=False)

# ------------------------------------------------
# Example summaries (equivalent of R's dplyr group_by / summarize)
# ------------------------------------------------
def _r6ch_counts(brackets, tournament):
    """Return the R6CH rows of ``tournament`` and how often each Team value fills that slot.

    The second element has columns ``Team`` and ``n``, sorted by ``n`` descending.
    """
    r6ch_rows = brackets[
        (brackets["Tournament"] == tournament) & (brackets["Slot"] == "R6CH")
    ]
    counts = (
        r6ch_rows.groupby("Team")
        .size()
        .reset_index(name="n")
        .sort_values("n", ascending=False)
    )
    return r6ch_rows, counts


def _ratings_by_power(ratings, tournament):
    """Return the ratings rows of ``tournament`` sorted by Power, highest first."""
    return ratings[ratings["Tournament"] == tournament].sort_values("Power", ascending=False)


# W: counts for the R6CH slot
sim_brackets_w_R6CH, count_w = _r6ch_counts(sim_brackets, "W")
print("W R6CH top teams:")
print(count_w.head())

# Women's highest Power ratings (for reference only)
tourney_seeds_with_ratings_w = _ratings_by_power(tourney_seeds_with_ratings, "W")
print("Top 5 Power (W):")
print(tourney_seeds_with_ratings_w.head())

# M: counts for the R6CH slot
sim_brackets_m_R6CH, count_m = _r6ch_counts(sim_brackets, "M")
print("M R6CH top teams:")
print(count_m.head())

# Men's highest Power ratings (for reference only)
tourney_seeds_with_ratings_m = _ratings_by_power(tourney_seeds_with_ratings, "M")
print("Top 5 Power (M):")
print(tourney_seeds_with_ratings_m.head())
