<a href="https://colab.research.google.com/github/nagasora/NeurIPS---Open-Polymer-Prediction-2025/blob/main/NeurIPS_%7C_Baseline_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [4]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

neurips_open_polymer_prediction_2025_path = kagglehub.competition_download('neurips-open-polymer-prediction-2025')
minatoyukinaxlisa_tc_smiles_path = kagglehub.dataset_download('minatoyukinaxlisa/tc-smiles')
dmitryuarov_smiles_extra_data_path = kagglehub.dataset_download('dmitryuarov/smiles-extra-data')
dmitryuarov_rdkit_2025_3_3_path = kagglehub.notebook_output_download('dmitryuarov/rdkit-2025-3-3')

print('Data source import complete.')


Data source import complete.


In [5]:
print(f'neurips_open_polymer_prediction_2025_path: {neurips_open_polymer_prediction_2025_path}')
print(f'minatoyukinaxlisa_tc_smiles_path: {minatoyukinaxlisa_tc_smiles_path}')
print(f'dmitryuarov_smiles_extra_data_path: {dmitryuarov_smiles_extra_data_path}')
print(f'dmitryuarov_rdkit_2025_3_3_path: {dmitryuarov_rdkit_2025_3_3_path}')

neurips_open_polymer_prediction_2025_path: /root/.cache/kagglehub/competitions/neurips-open-polymer-prediction-2025
minatoyukinaxlisa_tc_smiles_path: /kaggle/input/tc-smiles
dmitryuarov_smiles_extra_data_path: /kaggle/input/smiles-extra-data
dmitryuarov_rdkit_2025_3_3_path: /root/.cache/kagglehub/notebooks/dmitryuarov/rdkit-2025-3-3/output/versions/4


In [6]:
!pip install optuna



In [7]:
!pip install xgboost==1.7.6




In [8]:
!pip install rdkit
!pip install networkx



In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import polars as pl
from typing import Callable, Dict, List, Tuple
import gc
import pickle

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import KFold

import networkx as nx
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops

In [10]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5
    PATH = '/root/.cache/kagglehub/competitions/neurips-open-polymer-prediction-2025/'

train = pd.read_csv(CFG.PATH + 'train.csv')
test = pd.read_csv(CFG.PATH + 'test.csv')

def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))

In [11]:
# https://www.kaggle.com/datasets/minatoyukinaxlisa/tc-smiles
data_tc = pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
data_tc = data_tc.rename(columns={'TC_mean': 'Tc'})

# https://springernature.figshare.com/articles/dataset/dataset_with_glass_transition_temperature/24219958?file=42507037
data_tg2 = pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv', usecols=['SMILES', 'Tg (C)'])
data_tg2 = data_tg2.rename(columns={'Tg (C)': 'Tg'})

# https://www.sciencedirect.com/science/article/pii/S2590159123000377#ec0005
data_tg3 = pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
data_tg3 = data_tg3.rename(columns={'Tg [K]': 'Tg'})
data_tg3['Tg'] = data_tg3['Tg'] - 273.15

# https://github.com/Duke-MatSci/ChemProps
data_dnst = pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')
data_dnst = data_dnst.rename(columns={'density(g/cm3)': 'Density'})[['SMILES', 'Density']]
data_dnst['SMILES'] = data_dnst['SMILES'].apply(lambda s: make_smile_canonical(s))
data_dnst = data_dnst[(data_dnst['SMILES'].notnull())&(data_dnst['Density'].notnull())&(data_dnst['Density'] != 'nylon')]
data_dnst['Density'] = data_dnst['Density'].astype('float64')
data_dnst['Density'] -= 0.118

def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])

    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]

    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, data_tc, 'Tc')
train = add_extra_data(train, data_tg2, 'Tg')
train = add_extra_data(train, data_tg3, 'Tg')
train = add_extra_data(train, data_dnst, 'Density')

print('\n'*3, '--- SMILES for training ---', )
for t in CFG.TARGETS:
    print(f'"{t}": {len(train[train[t].notnull()])}')

[00:31:27] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[00:31:27] SMILES Parse Error: check for mistakes around position 12:
[00:31:27] *O[Si](*)([R])[R]
[00:31:27] ~~~~~~~~~~~^
[00:31:27] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[00:31:27] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[00:31:27] SMILES Parse Error: check for mistakes around position 28:
[00:31:27] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[00:31:27] ~~~~~~~~~~~~~~~~~~~~^
[00:31:27] SMILES Parse Error: extra open parentheses while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[00:31:27] SMILES Parse Error: check for mistakes around position 20:
[00:31:27] *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)
[00:31:27] ~~~~~~~~~~~~~~~~~~~~^
[00:31:27] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2cc


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524



 --- SMILES for training ---
"Tg": 1161
"FFV": 7030
"Tc": 866
"Density": 1247
"Rg": 614


In [12]:
# 既存のライブラリインポートに加えて、以下を想定
from rdkit.Chem import MACCSkeys, Fragments, Descriptors3D
from rdkit.Chem import rdMolDescriptors
from tqdm.auto import tqdm # 進捗表示をリッチにする

# -----------------------------------------------------------------------------
# 1. 特徴量生成クラスの実装
# -----------------------------------------------------------------------------
class FeatureGenerator:
    """
    SMILESから多角的な特徴量を生成するクラス。
    ノートブックのロジックと我々の追加機能を統合。
    """
    def __init__(self, n_bits: int = 2048, radius: int = 2):
        """
        Args:
            n_bits (int): フィンガープリントのビット数。
            radius (int): Morgan Fingerprintの半径。
        """
        self.n_bits = n_bits
        self.radius = radius

        # ノートブックで定義された不要なカラムリスト
        self.useless_cols = [
            'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
            'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
            'NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_barbitur',
            'fr_benzodiazepine', 'fr_dihydropyridine', 'fr_epoxide',
            'fr_isothiocyan', 'fr_lactam', 'fr_nitroso', 'fr_prisulfonamd',
            'fr_thiocyan', 'MaxEStateIndex', 'HeavyAtomMolWt', 'ExactMolWt',
            'NumValenceElectrons', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n',
            'Chi1v', 'Chi2n', 'Kappa1', 'LabuteASA', 'HeavyAtomCount', 'MolMR',
            'Chi3n', 'BertzCT', 'Chi2v', 'Chi4n', 'HallKierAlpha', 'Chi3v',
            'Chi4v', 'MinAbsPartialCharge', 'MinPartialCharge',
            'MaxAbsPartialCharge', 'FpDensityMorgan2', 'FpDensityMorgan3',
            'Phi', 'Kappa3', 'fr_nitrile', 'SlogP_VSA6',
            'NumAromaticCarbocycles', 'NumAromaticRings', 'fr_benzene',
            'VSA_EState6', 'NOCount', 'fr_C_O', 'fr_C_O_noCOO', 'NumHDonors',
            'fr_amide', 'fr_Nhpyrrole', 'fr_phenol', 'fr_phenol_noOrthoHbond',
            'fr_COO2', 'fr_halogen', 'fr_diazo', 'fr_nitro_arom',
            'fr_phos_ester'
        ]
        self.desc_names = [
            desc[0] for desc in Descriptors.descList
            if desc[0] not in self.useless_cols
        ]
        self.descriptor_3d_names = [
            'Asphericity', 'Eccentricity', 'InertialShapeFactor',
            'NPR1', 'NPR2', 'PMI1', 'PMI2', 'PMI3', 'RadiusOfGyration',
            'SpherocityIndex'
        ]

    def _smiles_to_mol(self, smiles: str) -> Chem.Mol | None:
        """SMILESをMolオブジェクトに変換（ノートブックの正準化は適用済みと仮定）"""
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                Chem.SanitizeMol(mol)
            return mol
        except (ValueError, TypeError):
            return None

    def _compute_descriptors(self, mol: Chem.Mol | None) -> List[float]:
        """RDKit基本記述子"""
        if mol is None: return [np.nan] * len(self.desc_names)
        return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in self.useless_cols]

    def _compute_graph_features(self, mol: Chem.Mol | None) -> Dict[str, float]:
        """グラフ特徴量"""
        if mol is None:
            return {'graph_diameter': np.nan, 'avg_shortest_path': np.nan, 'num_cycles': np.nan}
        adj = rdmolops.GetAdjacencyMatrix(mol)
        G = nx.from_numpy_array(adj)
        is_connected = nx.is_connected(G)
        return {
            'graph_diameter': nx.diameter(G) if is_connected else 0,
            'avg_shortest_path': nx.average_shortest_path_length(G) if is_connected else 0,
            'num_cycles': len(list(nx.cycle_basis(G)))
        }

    def _compute_morgan_fp(self, mol: Chem.Mol | None) -> np.ndarray:
        """Morgan Fingerprint"""
        if mol is None: return np.zeros(self.n_bits, dtype=int)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.n_bits)
        return np.array(list(fp.ToBitString())).astype(int)

    def _compute_maccs_fp(self, mol: Chem.Mol | None) -> np.ndarray:
        """MACCS Keys Fingerprint"""
        if mol is None: return np.zeros(167, dtype=int)
        fp = MACCSkeys.GenMACCSKeys(mol)
        return np.array(list(fp.ToBitString())).astype(int)

    def _compute_atom_pair_fp(self, mol: Chem.Mol | None) -> np.ndarray:
        """Atom-Pair Fingerprint"""
        if mol is None: return np.zeros(self.n_bits, dtype=int)
        fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=self.n_bits)
        return np.array(list(fp.ToBitString())).astype(int)

    # FeatureGeneratorクラス内に新しいメソッドを追加
    def _extract_monomer_smiles(self, polymer_smiles: str | None) -> str | None:
        """ポリマーSMILESからモノマーSMILESを抽出する単純な試み"""
        if polymer_smiles is None:
            return None
        # '*'を除去することでモノマー単位を近似的に表現する
        # Why not: 正規表現など複雑な方法を使わない。
        # このコンペのSMILESの多くは単純な形式であり、アスタリスク除去で十分機能する。
        # 複雑にしすぎると、予期せぬエッジケースで失敗するリスクがある。
        return polymer_smiles.replace('*', '')

    def _compute_monomer_features(self, polymer_smiles: str | None) -> List[float]:
        """
        モノマー単位のSMILESからRDKit基本記述子を計算する。
        """
        monomer_smiles = self._extract_monomer_smiles(polymer_smiles)
        if not monomer_smiles:
            return [np.nan] * len(self.desc_names)

        monomer_mol = self._smiles_to_mol(monomer_smiles)
        return self._compute_descriptors(monomer_mol) # 既存の記述子計算メソッドを再利用


    def generate_features(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
        """
        全てのFEパイプラインを実行するメインメソッド。

        Args:
            df (pd.DataFrame): 'SMILES'カラムを持つ入力データフレーム。
            is_train (bool): 学習データかどうか。進捗表示のラベルに使用。

        Returns:
            pd.DataFrame: 生成された全ての特徴量を含むデータフレーム。
        """
        label = "Train" if is_train else "Test"
        print(f"--- Generating features for {label} data ---")

        # tqdmを使って進捗を可視化
        mols = [self._smiles_to_mol(smi) for smi in tqdm(df['SMILES'], desc=f"  Parsing SMILES...")]

        # 各特徴量カテゴリーを計算
        descs = [self._compute_descriptors(mol) for mol in tqdm(mols, desc=f"  Calculating Descriptors...")]
        graphs = [self._compute_graph_features(mol) for mol in tqdm(mols, desc=f"  Calculating Graph Features...")]
        morgan_fps = [self._compute_morgan_fp(mol) for mol in tqdm(mols, desc=f"  Calculating Morgan FP...")]
        maccs_fps = [self._compute_maccs_fp(mol) for mol in tqdm(mols, desc=f"  Calculating MACCS FP...")]
        ap_fps = [self._compute_atom_pair_fp(mol) for mol in tqdm(mols, desc=f"  Calculating Atom-Pair FP...")]
        print("Calculating monomer features...")
        monomer_descs = [self._compute_monomer_features(smi) for smi in tqdm(df['SMILES'], desc=f"  Calculating Monomer Features...")]
        print("Done!")

        # データフレームに変換
        df_descs = pd.DataFrame(descs, columns=self.desc_names)
        df_graphs = pd.DataFrame(graphs)
        df_morgan = pd.DataFrame(morgan_fps).add_prefix('morgan_')
        df_maccs = pd.DataFrame(maccs_fps).add_prefix('maccs_')
        df_ap = pd.DataFrame(ap_fps).add_prefix('apair_')
        df_monomer = pd.DataFrame(monomer_descs, columns=[f"monomer_{name}" for name in self.desc_names])

        # 全てを結合
        result_df = pd.concat([df_descs, df_graphs, df_morgan, df_maccs, df_ap, df_monomer], axis=1)

        # 無限大をNaNに置換
        result_df = result_df.replace([-np.inf, np.inf], np.nan)

        # メモリ解放
        del mols, descs, graphs, morgan_fps, maccs_fps, ap_fps, monomer_descs
        gc.collect()

        return result_df

In [13]:
feature_generator = FeatureGenerator(n_bits=1024) # ビット数を少し減らして次元数を抑制

train_feats = feature_generator.generate_features(train, is_train=True)
test_feats = feature_generator.generate_features(test, is_train=False)

# train/testデータフレームに特徴量を結合
train = pd.concat([train, train_feats], axis=1)
test = pd.concat([test, test_feats], axis=1)

# メモリ解放
del train_feats, test_feats
gc.collect()

# カラム名が文字列であることを確認（LightGBMエラー対策）
train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train.columns]
test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test.columns]

# --- これ以降の、ターゲットごとの定数カラム除去やモデル学習のロジックはそのまま利用できます ---

--- Generating features for Train data ---


  Parsing SMILES...:   0%|          | 0/9261 [00:00<?, ?it/s]

  Calculating Descriptors...:   0%|          | 0/9261 [00:00<?, ?it/s]

  Calculating Graph Features...:   0%|          | 0/9261 [00:00<?, ?it/s]

  Calculating Morgan FP...:   0%|          | 0/9261 [00:00<?, ?it/s]



  Calculating MACCS FP...:   0%|          | 0/9261 [00:00<?, ?it/s]

  Calculating Atom-Pair FP...:   0%|          | 0/9261 [00:00<?, ?it/s]

Calculating monomer features...




  Calculating Monomer Features...:   0%|          | 0/9261 [00:00<?, ?it/s]

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[00:33:50] ~~~~~~~~~~~~~~~~~~~~^
[00:33:50] SMILES Parse Error: extra open parentheses while parsing: C(=O)c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc()cc5C4=O)cc3)(C(F)(F)F)C(F)(F)F)cc1)C2=O
[00:33:50] SMILES Parse Error: check for mistakes around position 23:
[00:33:50] =O)c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O
[00:33:50] ~~~~~~~~~~~~~~~~~~~~^
[00:33:50] SMILES Parse Error: extra open parentheses while parsing: C(=O)c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc()cc5C4=O)cc3)(C(F)(F)F)C(F)(F)F)cc1)C2=O
[00:33:50] SMILES Parse Error: check for mistakes around position 29:
[00:33:50] cc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc
[00:33:50] ~~~~~~~~~~~~~~~~~~~~^
[00:33:50] SMILES Parse Error: extra open parentheses while parsing: C(=O)c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc()cc5C4=O)cc3)(C(F)(F)F)C(F)(F)F)cc1)C2=O
[00:33:50] SMILES Parse Error: check for mistakes around position 31:
[00:33:50] 2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc()
[00:33

Done!
--- Generating features for Test data ---


  Parsing SMILES...:   0%|          | 0/3 [00:00<?, ?it/s]

  Calculating Descriptors...:   0%|          | 0/3 [00:00<?, ?it/s]

  Calculating Graph Features...:   0%|          | 0/3 [00:00<?, ?it/s]

  Calculating Morgan FP...:   0%|          | 0/3 [00:00<?, ?it/s]



  Calculating MACCS FP...:   0%|          | 0/3 [00:00<?, ?it/s]

  Calculating Atom-Pair FP...:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating monomer features...




  Calculating Monomer Features...:   0%|          | 0/3 [00:00<?, ?it/s]

Done!


[00:34:05] SMILES Parse Error: syntax error while parsing: Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1
[00:34:05] SMILES Parse Error: check for mistakes around position 35:
[00:34:05] 2ccc(Oc3ccc(C(c4ccc()cc4)(C(F)(F)F)C(F)(F
[00:34:05] ~~~~~~~~~~~~~~~~~~~~^
[00:34:05] SMILES Parse Error: extra open parentheses while parsing: Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1
[00:34:05] SMILES Parse Error: check for mistakes around position 7:
[00:34:05] Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C
[00:34:05] ~~~~~~^
[00:34:05] SMILES Parse Error: extra open parentheses while parsing: Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1
[00:34:05] SMILES Parse Error: check for mistakes around position 19:
[00:34:05] Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C
[00:34:05] ~~~~~~~~~~~~~~~~~~^
[00:34:05] SMILES Parse Error: extra open parentheses while parsing: Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc()cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1
[00

In [14]:
# Find constant columns for each target
all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

In [34]:
import optuna

# -----------------------------------------------------------------------------
# 1. XGBoost用ハイパーパラメータチューニングの実装
# -----------------------------------------------------------------------------
def mae(y_true, y_pred):
    return sum(abs(true - pred) for true, pred in zip(y_true, y_pred)) / len(y_true)

def objective_xgb(trial: optuna.trial.Trial, X: pd.DataFrame, y: pd.Series) -> float:
    params = {
        'objective': 'reg:absoluteerror',
        'eval_metric': 'mae',
        'verbosity': 0,
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'seed': CFG.SEED,
        'nthread': -1,
    }

    scores = []
    kf = KFold(n_splits=CFG.FOLDS, shuffle=True, random_state=CFG.SEED)

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # DMatrixに変換
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        # 学習
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=10000,
            evals=[(dval, "eval")],
            early_stopping_rounds=100,
            verbose_eval=False
        )

        # 予測
        val_preds = model.predict(dval)
        score = mae(y_val.to_numpy(), val_preds)
        scores.append(score)

    return np.mean(scores)


# 各ターゲットの最適パラメータを格納する辞書
best_params_per_target_xgb = {}

# チューニング実行
for target in CFG.TARGETS:
  if target == 'Tg':
    print(f"\n--- Tuning XGBoost for target: {target} ---")
    best_params = {'lambda': 3.3425781458720976, 'alpha': 0.002499666869870832, 'colsample_bytree': 0.8098224012315848, 'subsample': 0.6924582651213939, 'learning_rate': 0.058219959577282134, 'n_estimators': 8452, 'max_depth': 7, 'min_child_weight': 1}
    best_params_per_target_xgb[target] = best_params
    print(f"  Best params for {target}: {best_params}")

  else:
    print(f"\n--- Tuning XGBoost for target: {target} ---")
    train_part = train[train[target].notnull()].reset_index(drop=True)
    X = train_part[features[target]]
    y = train_part[target]

    # --- X の NaN・inf 除去 ---
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # --- y の NaN・inf 除去 + 型変換 ---
    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())  # NaN が残っていれば中央値で埋める
    y = pd.to_numeric(y, errors='coerce')  # 数値に強制変換
    if y.isnull().any():
        print("❌ y に NaN が残っています")
        print(y[y.isnull()])
        raise ValueError("ターゲットに NaN が残っています")


    study = optuna.create_study(direction='minimize', study_name=f'xgb_tuning_{target}')
    study.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=10) # Reduce n_trials for faster testing

    best_params = study.best_params
    best_params_per_target_xgb[target] = best_params
    print(f"  Best MAE for {target}: {study.best_value:.5f}")
    print(f"  Best params for {target}: {best_params}")

#Best MAE for Tg: 37.83110
  #Best params for Tg: {'lambda': 3.3425781458720976, 'alpha': 0.002499666869870832, 'colsample_bytree': 0.8098224012315848, 'subsample': 0.6924582651213939, 'learning_rate': 0.058219959577282134, 'n_estimators': 8452, 'max_depth': 7, 'min_child_weight': 1}


--- Tuning XGBoost for target: Tg ---
  Best params for Tg: {'lambda': 3.3425781458720976, 'alpha': 0.002499666869870832, 'colsample_bytree': 0.8098224012315848, 'subsample': 0.6924582651213939, 'learning_rate': 0.058219959577282134, 'n_estimators': 8452, 'max_depth': 7, 'min_child_weight': 1}

--- Tuning XGBoost for target: FFV ---


[I 2025-08-27 05:38:29,088] A new study created in memory with name: xgb_tuning_FFV
[W 2025-08-27 05:38:29,386] Trial 0 failed with parameters: {'lambda': 0.19552362800926842, 'alpha': 0.698253689146758, 'colsample_bytree': 0.7740746676103493, 'subsample': 0.9449300122908856, 'learning_rate': 0.004397890314285786, 'max_depth': 5, 'min_child_weight': 4} because of the following error: XGBoostError('[05:38:29] ../src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`\nStack trace:\n  [bt] (0) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x20b1e3) [0x7b0c4080b1e3]\n  [bt] (1) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x22ce51) [0x7b0c4082ce51]\n  [bt] (2) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x27e706) [0x7b0c4087e706]\n  [bt] (3) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x220215) [0x7b0c40820215]\n  [bt] (4) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libx

XGBoostError: [05:38:29] ../src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
Stack trace:
  [bt] (0) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x20b1e3) [0x7b0c4080b1e3]
  [bt] (1) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x22ce51) [0x7b0c4082ce51]
  [bt] (2) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x27e706) [0x7b0c4087e706]
  [bt] (3) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x220215) [0x7b0c40820215]
  [bt] (4) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x329) [0x7b0c4074aeb9]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7b0cced5ce2e]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7b0cced59493]
  [bt] (7) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x98c1) [0x7b0cced6b8c1]
  [bt] (8) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x8ffe) [0x7b0cced6affe]



In [None]:
import pickle

# -----------------------------------------------------------------------------
# 2. 最適パラメータを使用したXGBoostモデル学習
# -----------------------------------------------------------------------------

for target in CFG.TARGETS:
    print(f'\n\nTARGET {target} (XGBoost)')
    train_part = train[train[target].notnull()].reset_index(drop=True)

    # OOF予測とテスト予測を保存するための新しいカラムをtrain/testに追加
    train[f'{target}_pred_xgb'] = 0
    test[f'{target}_xgb'] = 0 # test[target]はLGBMで使っているので別名にする

    oof_xgb = np.zeros(len(train_part))
    scores_xgb = []

    # チューニングで見つけた最適パラメータを取得
    final_params_xgb = best_params_per_target_xgb[target]
    final_params_xgb.update({
        'objective': 'reg:absoluteerror',
        'eval_metric': 'mae',
        'verbosity': 0,
        'booster': 'gbtree',
        'n_estimators': 10000, # Keep high for early stopping
        'seed': CFG.SEED,
        'n_jobs': -1,
    })

    kf = KFold(n_splits=CFG.FOLDS, shuffle=True, random_state=CFG.SEED)
    for i, (trn_idx, val_idx) in enumerate(kf.split(train_part, train_part[target])):
        print(f"\n--- Fold {i+1} ---")

        x_trn = train_part.loc[trn_idx, features[target]]
        y_trn = train_part.loc[trn_idx, target]
        x_val = train_part.loc[val_idx, features[target]]
        y_val = train_part.loc[val_idx, target]

        model_xgb = xgb.XGBRegressor(**final_params_xgb)
        model_xgb.fit(
            x_trn, y_trn,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=300, # Use early_stopping_rounds for 1.7.6
            verbose=False,
        )

        with open(f'/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/basemodel_ver2/XGB_model/xgb_{target}_fold_{i}.pkl', 'wb') as f:
            pickle.dump(model_xgb, f)

        val_preds = model_xgb.predict(x_val)
        score = mae(y_val, val_preds)
        scores_xgb.append(score)
        print(f'MAE: {np.round(score, 5)}')

        oof_xgb[val_idx] = val_preds
        test[f'{target}_xgb'] += model_xgb.predict(test[features[target]]) / CFG.FOLDS

    train.loc[train[target].notnull(), f'{target}_pred_xgb'] = oof_xgb

    print(f'\nMean MAE: {np.round(np.mean(scores_xgb), 5)}')
    print(f'Std MAE: {np.round(np.std(scores_xgb), 5)}')
    print('-'*30)

In [None]:
import pickle

# Save oof_xgb to Google Drive
output_path = '/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/basemodel_ver2/XGB_model/oof_xgb.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(oof_xgb, f)

print(f"oof_xgb saved to {output_path}")