In [None]:
#Optuna CatBoost March Mania 2025 [0.15102]
#https://www.kaggle.com/code/alexkalita/optuna-catboost-march-mania-2025-0-15102/notebook#Modeling

# EDA
import numpy as np 
import pandas as pd  
import seaborn as sns 
import matplotlib.pyplot as plt 

#MLモデル、スケジューラ、チューニング、評価関数
import sklearn
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *

#その他
import os
import re
from collections import Counter


In [None]:
DATA_PATH = "./data/"
for filename in sorted(os.listdir(DATA_PATH)):
    print(filename)

Cities.csv
Conferences.csv
MConferenceTourneyGames.csv
MGameCities.csv
MMasseyOrdinals.csv
MNCAATourneyCompactResults.csv
MNCAATourneyDetailedResults.csv
MNCAATourneySeedRoundSlots.csv
MNCAATourneySeeds.csv
MNCAATourneySlots.csv
MRegularSeasonCompactResults.csv
MRegularSeasonDetailedResults.csv
MSeasons.csv
MSecondaryTourneyCompactResults.csv
MSecondaryTourneyTeams.csv
MTeamCoaches.csv
MTeamConferences.csv
MTeamSpellings.csv
MTeams.csv
SampleSubmissionStage1.csv
SampleSubmissionStage2.csv
SeedBenchmarkStage1.csv
WConferenceTourneyGames.csv
WGameCities.csv
WNCAATourneyCompactResults.csv
WNCAATourneyDetailedResults.csv
WNCAATourneySeeds.csv
WNCAATourneySlots.csv
WRegularSeasonCompactResults.csv
WRegularSeasonDetailedResults.csv
WSeasons.csv
WSecondaryTourneyCompactResults.csv
WSecondaryTourneyTeams.csv
WTeamConferences.csv
WTeamSpellings.csv
WTeams.csv


In [48]:
#　seed
#　conncat()でdfを結合できる
df_seeds = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv"),
],ignore_index=True)

df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [49]:
#　season_results
df_season_results = pd.concat([
    pd.read_csv(DATA_PATH + "MRegularSeasonCompactResults.csv"),
    pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv"),
], ignore_index=True)

#　NumOtとWLocを削除
df_season_results.drop(['NumOT', 'WLoc'],axis=1, inplace=True)

#　ScoreGap行を作成
df_season_results['ScoreGap'] = df_season_results['WScore'] - df_season_results['LScore']

df_season_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,ScoreGap
0,1985,20,1228,81,1328,64,17
1,1985,25,1106,77,1354,70,7
2,1985,25,1112,63,1223,56,7
3,1985,25,1165,70,1432,54,16
4,1985,25,1192,86,1447,74,12


In [50]:
# num_win行を作成
# df.groupby()で指定したカラムを元に並べ替えた表を所得
num_win = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()
num_win = num_win[['Season', 'WTeamID', 'DayNum',]].rename(columns={'DayNum': 'NumWins', 'WTeamID': 'TeamID'})                                                            

num_win.head()

Unnamed: 0,Season,TeamID,NumWins
0,1985,1102,5
1,1985,1103,9
2,1985,1104,21
3,1985,1106,10
4,1985,1108,19


In [51]:
num_loss = df_season_results.groupby(['Season','LTeamID']).count().reset_index()
num_loss = num_loss[['Season', 'LTeamID', 'DayNum',]].rename(columns={'DayNum': 'NumLosses', 'LTeamID': 'TeamID'})

num_loss.head()

Unnamed: 0,Season,TeamID,NumLosses
0,1985,1102,19
1,1985,1103,14
2,1985,1104,9
3,1985,1106,14
4,1985,1108,6


In [52]:
#　gap_win行を作成
gap_win = df_season_results.groupby(['Season', 'WTeamID']).mean().reset_index()
gap_win = gap_win[['Season', 'WTeamID', 'ScoreGap',]].rename(columns={'ScoreGap': 'GapWins', 'WTeamID': 'TeamID'})

gap_win.head()

Unnamed: 0,Season,TeamID,GapWins
0,1985,1102,10.0
1,1985,1103,7.555556
2,1985,1104,13.190476
3,1985,1106,9.5
4,1985,1108,13.842105


In [53]:
# gap_loss行を作成
gap_loss = df_season_results.groupby(['Season', 'LTeamID']).mean().reset_index()
gap_loss = gap_loss[['Season', 'LTeamID', 'ScoreGap']].rename(columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID"})

gap_loss.head()

Unnamed: 0,Season,TeamID,GapLosses
0,1985,1102,9.947368
1,1985,1103,9.857143
2,1985,1104,4.777778
3,1985,1106,13.285714
4,1985,1108,10.666667


In [54]:
# groupby().count()でSeasonとWTeamIDの重複を解消し、SeasonとTeamIDのみのdfを作成
df_features_season_w = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns= {'WTeamID': 'TeamID'})
df_features_season_l = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})

df_features_season_l.head()

Unnamed: 0,Season,TeamID
0,1985,1102
1,1985,1103
2,1985,1104
3,1985,1106
4,1985,1108


In [55]:
df_features_season = pd.concat([df_features_season_w, df_features_season_l], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

In [56]:
#マージ
df_features_season = df_features_season.merge(num_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(num_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_loss, on=['Season', 'TeamID'], how='left')

#dfの欠損値を埋める
df_features_season.fillna(0, inplace=True)  

df_features_season.head()

Unnamed: 0,Season,TeamID,NumWins,NumLosses,GapWins,GapLosses
0,1985,1102,5.0,19.0,10.0,9.947368
1,1985,1103,9.0,14.0,7.555556,9.857143
2,1985,1104,21.0,9.0,13.190476,4.777778
3,1985,1106,10.0,14.0,9.5,13.285714
4,1985,1108,19.0,6.0,13.842105,10.666667


In [57]:
df_features_season['WinRatio'] = df_features_season['NumWins'] / (df_features_season['NumWins'] + df_features_season['NumLosses'])
df_features_season['GapAvg'] = ((df_features_season['NumWins'] * df_features_season['GapWins'] - df_features_season['NumLosses'] * df_features_season['GapLosses']) / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season.drop(['NumWins', 'NumLosses', 'GapWins', 'GapLosses'], axis=1, inplace=True)

In [58]:
df_tourney_results = pd.concat([
    pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv"),
    pd.read_csv(DATA_PATH + "MNCAATourneyCompactResults.csv"),
], ignore_index=True)
df_tourney_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

In [59]:
#特徴量エンジニアリング
df = df_tourney_results.copy()
df = df[df['Season'] >= 2016].reset_index(drop=True)

df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,2016,137,3107,61,3196,59
1,2016,137,3113,74,3308,52
2,2016,137,3120,68,3385,57
3,2016,137,3124,89,3225,59
4,2016,137,3177,97,3241,67


In [60]:
df = pd.merge(df, df_seeds, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID']).drop('TeamID', axis=1).rename(columns={'Seed':'SeedW'})
df = pd.merge(df, df_seeds, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID']).drop('TeamID', axis=1).rename(columns={'Seed':'SeedL'})

df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2016,137,3107,61,3196,59,Z12,Z05
1,2016,137,3113,74,3308,52,Z02,Z15
2,2016,137,3120,68,3385,57,X09,X08
3,2016,137,3124,89,3225,59,X01,X16
4,2016,137,3177,97,3241,67,X06,X11


In [61]:
def treat_seed(seed):
    return int(re.sub("[^0-9]", "", seed))

df['SeedW'] = df['SeedW'].apply(treat_seed)
df['SeedL'] = df['SeedL'].apply(treat_seed)

df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2016,137,3107,61,3196,59,12,5
1,2016,137,3113,74,3308,52,2,15
2,2016,137,3120,68,3385,57,9,8
3,2016,137,3124,89,3225,59,1,16
4,2016,137,3177,97,3241,67,6,11


In [62]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsW',
    'NumLosses': 'NumLossesW',
    'GapWins': 'GapWinsW',
    'GapLosses': 'GapLossesW',
    'WinRatio': 'WinRatioW',
    'GapAvg': 'GapAvgW',
}).drop(columns='TeamID', axis=1)

In [63]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsL',
    'NumLosses': 'NumLossesL',
    'GapWins': 'GapWinsL',
    'GapLosses': 'GapLossesL',
    'WinRatio': 'WinRatioL',
    'GapAvg': 'GapAvgL',
}).drop(columns='TeamID', axis=1)

In [64]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinRatioW,GapAvgW,WinRatioL,GapAvgL
0,2016,137,3107,61,3196,59,12,5,0.870968,18.677419,0.733333,8.7
1,2016,137,3113,74,3308,52,2,15,0.806452,9.258065,0.857143,8.964286
2,2016,137,3120,68,3385,57,9,8,0.612903,3.032258,0.71875,6.8125
3,2016,137,3124,89,3225,59,1,16,0.970588,24.617647,0.709677,10.612903
4,2016,137,3177,97,3241,67,6,11,0.757576,16.272727,0.84375,15.40625


In [65]:
def add_loosing_matches(df):
    win_rename = {
        'WTeamID': 'TeamIdA',
        'WScore': 'ScoreA',
        'LTeamID': 'TeamIdB',
        'LScore': 'ScoreB'
    }
    win_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('W')})
    win_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('L')})

    lose_rename = {
        "WTeamID": "TeamIdB", 
        "WScore" : "ScoreB", 
        "LTeamID" : "TeamIdA",
        "LScore": "ScoreA",
    }
    lose_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('W')})
    lose_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('L')})

    win_df = df.copy()
    lose_df = df.copy()
    
    win_df = win_df.rename(columns=win_rename)
    lose_df = lose_df.rename(columns=lose_rename)
    
    return pd.concat([win_df, lose_df], axis=0, sort=False)

df = add_loosing_matches(df)
df.head()

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,WinRatioA,GapAvgA,WinRatioB,GapAvgB
0,2016,137,3107,61,3196,59,12,5,0.870968,18.677419,0.733333,8.7
1,2016,137,3113,74,3308,52,2,15,0.806452,9.258065,0.857143,8.964286
2,2016,137,3120,68,3385,57,9,8,0.612903,3.032258,0.71875,6.8125
3,2016,137,3124,89,3225,59,1,16,0.970588,24.617647,0.709677,10.612903
4,2016,137,3177,97,3241,67,6,11,0.757576,16.272727,0.84375,15.40625


In [66]:
# 相違点
# 各機能についてチーム間の差を計算する
# これは、チームAがチームBよりどれだけ優れているか(または劣っているのか)を更に評価するのに役立つ

cols_to_diff = [
    'Seed', 'WinRatio', 'GapAvg'
]

for col in cols_to_diff:
    df[col + 'Diff'] = df[col + 'A'] - df[col + 'B']

In [67]:
# テストデータの準備
df_test = pd.read_csv(DATA_PATH + "SampleSubmissionStage1.csv")

df_test['Season'] = df_test['ID'].apply(lambda x: int(x.split('_')[0]))
df_test['TeamIdA'] = df_test['ID'].apply(lambda x: int(x.split('_')[1]))
df_test['TeamIdB'] = df_test['ID'].apply(lambda x: int(x.split('_')[2]))

df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2021_1101_1102,0.5,2021,1101,1102
1,2021_1101_1103,0.5,2021,1101,1103
2,2021_1101_1104,0.5,2021,1101,1104
3,2021_1101_1105,0.5,2021,1101,1105
4,2021_1101_1106,0.5,2021,1101,1106


In [81]:
# テストデータの準備(stage2)
df_test_s2 = pd.read_csv(DATA_PATH + "SampleSubmissionStage2.csv")

df_test_s2['Season'] = df_test_s2['ID'].apply(lambda x: int(x.split('_')[0]))
df_test_s2['TeamIdA'] = df_test_s2['ID'].apply(lambda x: int(x.split('_')[1]))
df_test_s2['TeamIdB'] = df_test_s2['ID'].apply(lambda x: int(x.split('_')[2]))

df_test_s2.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2025_1101_1102,0.5,2025,1101,1102
1,2025_1101_1103,0.5,2025,1101,1103
2,2025_1101_1104,0.5,2025,1101,1104
3,2025_1101_1105,0.5,2025,1101,1105
4,2025_1101_1106,0.5,2025,1101,1106


In [68]:
df_test = pd.merge(
    df_test,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedA'}).fillna('W01')

df_test = pd.merge(
    df_test, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedB'}).fillna('W01')

df_test['SeedA'] = df_test['SeedA'].apply(treat_seed)
df_test['SeedB'] = df_test['SeedB'].apply(treat_seed)

In [82]:
df_test_s2 = pd.merge(
    df_test_s2,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedA'}).fillna('W01')

df_test_s2 = pd.merge(
    df_test_s2, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedB'}).fillna('W01')

df_test_s2['SeedA'] = df_test_s2['SeedA'].apply(treat_seed)
df_test_s2['SeedB'] = df_test_s2['SeedB'].apply(treat_seed)

In [69]:
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsA',
    'NumLosses': 'NumLossesA',
    'GapWins': 'GapWinsA',
    'GapLosses': 'GapLossesA',
    'WinRatio': 'WinRatioA',
    'GapAvg': 'GapAvgA',
}).drop(columns='TeamID', axis=1)

df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsB',
    'NumLosses': 'NumLossesB',
    'GapWins': 'GapWinsB',
    'GapLosses': 'GapLossesB',
    'WinRatio': 'WinRatioB',
    'GapAvg': 'GapAvgB',
}).drop(columns='TeamID', axis=1)

In [83]:
df_test_s2 = pd.merge(
    df_test_s2,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsA',
    'NumLosses': 'NumLossesA',
    'GapWins': 'GapWinsA',
    'GapLosses': 'GapLossesA',
    'WinRatio': 'WinRatioA',
    'GapAvg': 'GapAvgA',
}).drop(columns='TeamID', axis=1)

df_test_s2 = pd.merge(
    df_test_s2,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsB',
    'NumLosses': 'NumLossesB',
    'GapWins': 'GapWinsB',
    'GapLosses': 'GapLossesB',
    'WinRatio': 'WinRatioB',
    'GapAvg': 'GapAvgB',
}).drop(columns='TeamID', axis=1)

In [70]:
for col in cols_to_diff:
    df_test[col + 'Diff'] = df_test[col + 'A'] - df_test[col + 'B']

In [84]:
for col in cols_to_diff:
    df_test_s2[col + 'Diff'] = df_test_s2[col + 'A'] - df_test_s2[col + 'B']

In [71]:
df['ScoreDiff'] = df['ScoreA'] - df['ScoreB']
df['WinA'] = (df['ScoreDiff'] > 0).astype(int)

In [72]:
#モデリング
features = ['SeedA', 'SeedB', 'WinRatioA', 'GapAvgA', 'WinRatioB', 'GapAvgB', 'SeedDiff', 'WinRatioDiff', 'GapAvgDiff']

def rescale(features, df_train, df_val, df_test=None):
    min_ = df_train[features].min()
    max_ = df_train[features].max()
    
    df_train[features] = (df_train[features] - min_) / (max_ - min_ ) 
    df_val[features] = (df_val[features] - min_) / (max_ - min_ ) 
    
    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / (max_ - min_ ) 
        
    return df_train, df_val, df_test



In [73]:
# cross validation
# 過去３シーズンのnシーズンを検証します。

from catboost import CatBoostRegressor
import optuna
from sklearn.metrics import brier_score_loss

def kfold(df, model, df_test_=None, plot=False, verbose=0, mode="reg"):
    seasons = df['Season'].unique()
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "WinA"
    
    for season in seasons[1:]:
        if verbose:
            print(f'\nValidating on season {season}')
        
        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        df_test = df_test_.copy()
        df_train.fillna(0, inplace=True)  #### add this code for fixing a bug
        df_val.fillna(0, inplace=True)    #### add this code for fixing a bug
        df_test.fillna(0, inplace=True)   #### add this code for fixing a bug
        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)
        
        if mode == "reg":
            model = ElasticNet(alpha=1, l1_ratio=0.5)
        else:
            model = model

        model.fit(df_train[features], df_train[target])
        
        if mode == "reg":
            pred = model.predict(df_val[features])
        else:
            pred = model.predict(df_val[features])
        
        if df_test is not None:
            if mode == "reg":
                pred_test = model.predict(df_test[features])
                
#                 print( pred_test.max() - pred_test.min())
                pred_test = (pred_test - pred_test.min()) / (pred_test.max() - pred_test.min())
            else:
                pred_test = model.predict(df_test[features])
                
            pred_tests.append(pred_test)
            
        if plot:
            plt.figure(figsize=(15, 6))
            plt.subplot(1, 2, 1)
            plt.scatter(pred, df_val['ScoreDiff'].values, s=5)
            plt.title('Prediction vs Score Diff')
            plt.grid(True)
            plt.subplot(1, 2, 2)
            sns.histplot(pred, bins=20)
            plt.title('Predictions probability repartition')
            plt.show()
        
        pred = (pred - pred.min()) / (pred.max() - pred.min())
        pred = np.clip(pred, 0, 1)

        score = ((df_val['WinA'].values - pred) ** 2).mean()
        cvs.append(score)

        if verbose:
            print(f'\t -> Scored {score:.3f}')
        
    print(f'\n Local CV is {np.mean(cvs):.3f}')
    
    return pred, df_val['WinA'].values

def objective(trial):
    cat_params = dict(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        depth=trial.suggest_int("depth", 3, 12),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 2.5),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        task_type='CPU',
        early_stopping_rounds=200,
        verbose=False
    )
    
    model = CatBoostRegressor(**cat_params)

    
    y_pred , y_val = kfold(df, model, df_test, plot=False, verbose=1, mode="cls")
    score = brier_score_loss(y_val, y_pred)
    
    return score



study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

[I 2025-03-06 19:46:00,929] A new study created in memory with name: no-name-3b0c0b5b-1576-42f8-ab46-73126a8addbf



Validating on season 2017
	 -> Scored 0.194

Validating on season 2018
	 -> Scored 0.225

Validating on season 2019
	 -> Scored 0.156

Validating on season 2021
	 -> Scored 0.195

Validating on season 2022
	 -> Scored 0.192

Validating on season 2023
	 -> Scored 0.192

Validating on season 2024


[I 2025-03-06 19:47:28,837] Trial 0 finished with value: 0.15792680205414228 and parameters: {'iterations': 615, 'learning_rate': 0.0006225217882044358, 'depth': 11, 'l2_leaf_reg': 0.0017041342410494256, 'bagging_temperature': 0.6692158753461799, 'random_strength': 1.0116188915021114e-07}. Best is trial 0 with value: 0.15792680205414228.


	 -> Scored 0.158

 Local CV is 0.187

Validating on season 2017
	 -> Scored 0.198

Validating on season 2018
	 -> Scored 0.224

Validating on season 2019
	 -> Scored 0.165

Validating on season 2021
	 -> Scored 0.190

Validating on season 2022
	 -> Scored 0.198

Validating on season 2023
	 -> Scored 0.199

Validating on season 2024


[I 2025-03-06 19:47:41,424] Trial 1 finished with value: 0.15822119467829693 and parameters: {'iterations': 728, 'learning_rate': 0.011504783036411457, 'depth': 8, 'l2_leaf_reg': 0.0001747519372342445, 'bagging_temperature': 2.0079480117647472, 'random_strength': 0.02455817867233486}. Best is trial 0 with value: 0.15792680205414228.


	 -> Scored 0.158

 Local CV is 0.190

Validating on season 2017
	 -> Scored 0.182

Validating on season 2018
	 -> Scored 0.206

Validating on season 2019
	 -> Scored 0.152

Validating on season 2021
	 -> Scored 0.198

Validating on season 2022
	 -> Scored 0.196

Validating on season 2023
	 -> Scored 0.187

Validating on season 2024


[I 2025-03-06 19:47:48,780] Trial 2 finished with value: 0.15962256312405257 and parameters: {'iterations': 953, 'learning_rate': 0.0001046830467603985, 'depth': 6, 'l2_leaf_reg': 3.010696916010318e-07, 'bagging_temperature': 0.16371444383682682, 'random_strength': 0.0015491854121210192}. Best is trial 0 with value: 0.15792680205414228.


	 -> Scored 0.160

 Local CV is 0.183

Validating on season 2017
	 -> Scored 0.184

Validating on season 2018
	 -> Scored 0.213

Validating on season 2019
	 -> Scored 0.157

Validating on season 2021
	 -> Scored 0.187

Validating on season 2022
	 -> Scored 0.193

Validating on season 2023
	 -> Scored 0.192

Validating on season 2024


[I 2025-03-06 19:47:59,591] Trial 3 finished with value: 0.15622429996411574 and parameters: {'iterations': 611, 'learning_rate': 0.012821729086031637, 'depth': 8, 'l2_leaf_reg': 26.53469531642888, 'bagging_temperature': 2.3876670124946524, 'random_strength': 1.9609089528199352e-07}. Best is trial 3 with value: 0.15622429996411574.


	 -> Scored 0.156

 Local CV is 0.183

Validating on season 2017
	 -> Scored 0.178

Validating on season 2018
	 -> Scored 0.215

Validating on season 2019
	 -> Scored 0.164

Validating on season 2021
	 -> Scored 0.187

Validating on season 2022
	 -> Scored 0.192

Validating on season 2023
	 -> Scored 0.194

Validating on season 2024


[I 2025-03-06 19:48:39,388] Trial 4 finished with value: 0.15542456744323466 and parameters: {'iterations': 525, 'learning_rate': 0.00984502939636966, 'depth': 10, 'l2_leaf_reg': 0.0798854706139778, 'bagging_temperature': 2.475599644729008, 'random_strength': 1.2022912205499667}. Best is trial 4 with value: 0.15542456744323466.


	 -> Scored 0.155

 Local CV is 0.184


In [85]:
best_params = study.best_params

final_model = CatBoostRegressor(**best_params)
final_model

def kfold(df, model, df_test_=None, plot=False, verbose=0, mode="reg"):
    seasons = df['Season'].unique()
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "WinA"
    
    for season in seasons[1:]:
        if verbose:
            print(f'\nValidating on season {season}')
        
        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        df_test = df_test_.copy()
        df_train.fillna(0, inplace=True)  #### add this code for fixing a bug
        df_val.fillna(0, inplace=True)    #### add this code for fixing a bug
        df_test.fillna(0, inplace=True)   #### add this code for fixing a bug
        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)
        
        if mode == "reg":
#             model = LinearRegression()
            model = ElasticNet(alpha=1, l1_ratio=0.5)
        else:
            model = model

        model.fit(df_train[features], df_train[target])
        
        if mode == "reg":
            pred = model.predict(df_val[features])
        else:
            pred = model.predict(df_val[features])
        
        if df_test is not None:
            if mode == "reg":
                pred_test = model.predict(df_test[features])
                
#                 print( pred_test.max() - pred_test.min())
                pred_test = (pred_test - pred_test.min()) / (pred_test.max() - pred_test.min())
            else:
                pred_test = model.predict(df_test[features])
                
            pred_tests.append(pred_test)
            
        if plot:
            plt.figure(figsize=(15, 6))
            plt.subplot(1, 2, 1)
            plt.scatter(pred, df_val['ScoreDiff'].values, s=5)
            plt.title('Prediction vs Score Diff')
            plt.grid(True)
            plt.subplot(1, 2, 2)
            sns.histplot(pred, bins=20)
            plt.title('Predictions probability repartition')
            plt.show()
        
        pred = (pred - pred.min()) / (pred.max() - pred.min())
        pred = np.clip(pred, 0, 1)

        score = ((df_val['WinA'].values - pred) ** 2).mean()
        cvs.append(score)

        if verbose:
            print(f'\t -> Scored {score:.3f}')
        
    print(f'\n Local CV is {np.mean(cvs):.3f}')
    
    return pred_tests

pred_tests = kfold(df, final_model, df_test, plot=False, verbose=1, mode="cls")
pred_tests_s2 = kfold(df, final_model, df_test_s2, plot=False, verbose=1, mode="cls")


Validating on season 2017
0:	learn: 0.4975666	total: 9.29ms	remaining: 4.87s
1:	learn: 0.4954350	total: 17.8ms	remaining: 4.65s
2:	learn: 0.4937790	total: 22.6ms	remaining: 3.93s
3:	learn: 0.4915688	total: 31.1ms	remaining: 4.04s
4:	learn: 0.4903047	total: 32ms	remaining: 3.32s
5:	learn: 0.4878538	total: 40.7ms	remaining: 3.52s
6:	learn: 0.4858390	total: 49.5ms	remaining: 3.66s
7:	learn: 0.4842507	total: 58.2ms	remaining: 3.76s
8:	learn: 0.4819339	total: 66.7ms	remaining: 3.82s
9:	learn: 0.4798157	total: 75.2ms	remaining: 3.87s
10:	learn: 0.4777290	total: 84.5ms	remaining: 3.95s
11:	learn: 0.4760248	total: 93.4ms	remaining: 3.99s
12:	learn: 0.4741677	total: 102ms	remaining: 4.03s
13:	learn: 0.4721719	total: 112ms	remaining: 4.07s
14:	learn: 0.4702042	total: 121ms	remaining: 4.1s
15:	learn: 0.4683985	total: 131ms	remaining: 4.16s
16:	learn: 0.4665927	total: 140ms	remaining: 4.2s
17:	learn: 0.4649827	total: 149ms	remaining: 4.18s
18:	learn: 0.4631340	total: 158ms	remaining: 4.2s
19:	lea

In [None]:
#1から0でクリップしたほうがいいかもしれない

pred_test = np.mean(pred_tests, 0)
df_test['pred'] = pred_test

final_sub = df_test[['ID', 'pred']].copy()
final_sub.to_csv('submissionStage1.csv', index=False)

display(final_sub)

pred_test_s2 = np.mean(pred_tests_s2, 0)
df_test_s2['pred'] = pred_test_s2

final_sub_s2 = df_test_s2[['ID', 'pred']].copy()
final_sub_s2.to_csv('submissionStage2.csv', index=False)

display(final_sub_s2)

Unnamed: 0,ID,pred
0,2021_1101_1102,0.319430
1,2021_1101_1103,0.253045
2,2021_1101_1104,0.282571
3,2021_1101_1105,0.320934
4,2021_1101_1106,0.319781
...,...,...
507103,2024_3475_3477,0.595369
507104,2024_3475_3478,0.588731
507105,2024_3476_3477,0.470971
507106,2024_3476_3478,0.463063


Unnamed: 0,ID,pred
0,2025_1101_1102,0.505870
1,2025_1101_1103,0.410217
2,2025_1101_1104,0.318974
3,2025_1101_1105,0.524680
4,2025_1101_1106,0.495735
...,...,...
131402,2025_3477_3479,0.479208
131403,2025_3477_3480,0.498398
131404,2025_3478_3479,0.457897
131405,2025_3478_3480,0.472242
