<a href="https://colab.research.google.com/github/nagasora/NeurIPS---Open-Polymer-Prediction-2025/blob/main/xgb_lgbm_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate with Kaggle before the download cell runs (required for the
# private data sources mentioned above).
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Each call returns the download location, which is kept in a variable.
# Only the competition path is used (printed) later in this notebook.
neurips_open_polymer_prediction_2025_path = kagglehub.competition_download('neurips-open-polymer-prediction-2025')
senkin13_rdkit_2025_3_3_cp311_path = kagglehub.dataset_download('senkin13/rdkit-2025-3-3-cp311')
minatoyukinaxlisa_tc_smiles_path = kagglehub.dataset_download('minatoyukinaxlisa/tc-smiles')

print('Data source import complete.')


In [None]:
# Show where the competition files were downloaded. CFG.data_dir is hard-coded
# further down, so compare it against this path if files are not found.
print(f'neurips_open_polymer_prediction_2025_path: {neurips_open_polymer_prediction_2025_path}')

In [None]:
# ===================================================================
# Install libraries (run this first on Kaggle/Colab)
# ===================================================================
# NOTE(review): lightgbm and joblib are imported below but not installed here;
# presumably they are preinstalled in the runtime — confirm.
!pip install rdkit --quiet
!pip install xgboost --quiet
!pip install optuna --quiet
!pip install tqdm --quiet

In [None]:
# ===================================================================
# ライブラリのインポート
# ===================================================================
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
import warnings
import logging
import os

from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
from tqdm.auto import tqdm
import optuna
import joblib


# ===================================================================
# Warning settings (logging itself is configured in the next cell)
# ===================================================================
# Suppress all warnings
warnings.filterwarnings("ignore")

In [None]:
# ===================================================================
# Logging setup
# ===================================================================
# Suppress all warnings
warnings.filterwarnings("ignore")

# Configure logging with a detailed record format. Records go to a stream
# handler and to 'polymer_prediction_xgb.log'; mode='w' truncates that file
# each time this cell runs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('polymer_prediction_xgb.log', mode='w')
    ]
)
logger = logging.getLogger(__name__)

# Reduce Optuna's verbose output to warnings and above
optuna.logging.set_verbosity(optuna.logging.WARNING)



In [None]:
# ===================================================================
# 3. Configuration class and basic settings
# ===================================================================
class CFG:
    """Notebook-wide settings: seed, fold count, data/output paths, targets."""

    # Reproducibility and cross-validation
    seed = 42
    n_splits = 5

    # Input locations; the supplementary CSVs live in a sub-folder of data_dir
    data_dir = '/root/.cache/kagglehub/competitions/neurips-open-polymer-prediction-2025/'
    supp_dir = os.path.join(data_dir, 'train_supplement/')

    # Output locations; one model folder per model type under output_dir
    output_dir = '/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/LGBM+XGB+GNN_CV/'
    xgb_model_dir = os.path.join(output_dir, 'models_xgb')
    lgbm_model_dir = os.path.join(output_dir, 'models_lgbm')

    # Target columns that are predicted
    targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Silence all Python warnings.
warnings.filterwarnings("ignore")
# NOTE(review): apparently meant to quiet LightGBM output; whether LightGBM
# reads this environment variable cannot be seen from this file — confirm.
os.environ['LIGHTGBM_VERBOSITY'] = '-1'
# Per the Python logging docs, basicConfig() does nothing when the root logger
# already has handlers, so if the logging-setup cell ran earlier its
# configuration (including the file handler) stays in effect.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def seed_everything(seed: int):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the file's import cell does not import ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this only affects Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
# Apply the configured seed once, when this cell runs.
seed_everything(CFG.seed)

def load_data(cfg: CFG) -> pd.DataFrame:
    """Load the main training set and merge in the supplementary datasets.

    The supplementary files each contribute one label column:
    ``dataset1.csv`` (``TC_mean``, renamed to ``Tc``), ``dataset3.csv``
    (``Tg``) and ``dataset4.csv`` (``FFV``). Rows sharing a SMILES string are
    merged into a single row that keeps, per column, the first non-missing
    value in the order main data, dataset1, dataset3, dataset4. A label from
    a supplementary file therefore fills a gap in the main data instead of
    being dropped together with its duplicate row.

    Args:
        cfg (CFG): Settings object providing ``data_dir`` and ``supp_dir``.

    Returns:
        pd.DataFrame: One row per unique SMILES, in order of first
        appearance, with a fresh 0..n-1 index. If any supplementary file is
        missing, the main training data is returned unchanged. If
        ``train.csv`` itself is missing, an empty DataFrame is returned.
    """
    logger.info("データの読み込みを開始します...")

    # 1. Main training data
    main_train_path = os.path.join(cfg.data_dir, 'train.csv')
    try:
        train_df = pd.read_csv(main_train_path)
    except FileNotFoundError:
        logger.error(f"メイン学習データが見つかりません: {main_train_path}")
        # Without the main data there is nothing to train on.
        return pd.DataFrame()
    logger.info(f"メイン学習データを読み込みました。サンプル数: {len(train_df)}")

    # 2. Supplementary data: (file name, columns to keep, column renames)
    supplements = (
        ('dataset1.csv', ['SMILES', 'TC_mean'], {'TC_mean': 'Tc'}),
        ('dataset3.csv', ['SMILES', 'Tg'], {}),
        ('dataset4.csv', ['SMILES', 'FFV'], {}),
    )
    try:
        supp_frames = [
            pd.read_csv(os.path.join(cfg.supp_dir, filename))[columns].rename(columns=renames)
            for filename, columns, renames in supplements
        ]
    except FileNotFoundError as e:
        # Best effort: a missing supplementary file falls back to the main data only.
        logger.warning(f"⚠️ 追加データが見つかりませんでした: {e}")
        logger.warning("メインデータのみで学習を続行します。")
        return train_df

    # 3. Stack all sources, then collapse duplicate SMILES. GroupBy.first()
    #    skips missing values, so each column takes its first available value
    #    across the duplicates. A plain drop_duplicates() would keep only the
    #    first row and lose labels that exist only in later rows.
    combined = pd.concat([train_df, *supp_frames], ignore_index=True)
    column_order = list(combined.columns)
    train_df = (
        combined
        .groupby('SMILES', sort=False, dropna=False, as_index=False)
        .first()
        .reindex(columns=column_order)
    )

    logger.info(f"✅ 追加データを統合しました。総学習サンプル数: {len(train_df)}")
    return train_df

In [None]:
# ===================================================================
# ▼▼▼ Modified section ▼▼▼
# 4. Tuned hyperparameter definitions
# ===================================================================
# Load the best XGBoost hyperparameters obtained from xgb_finetune.ipynb.
# The trainer below indexes this object by target name (XGB_BEST_PARAMS[target]).
# NOTE: joblib.load unpickles arbitrary objects — only load files you created yourself.
XGB_BEST_PARAMS = joblib.load("/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/LGBM+GNN/xgb_tuned/best_params_xgb.joblib")

# LightGBM parameters, also loaded from a saved file and indexed by target name in the trainer.
LGBM_PARAMS = joblib.load('/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/LGBM+GNN/lgbm_tuned/best_params.joblib')
# ▲▲▲ Modified section ▲▲▲

# ===================================================================
# 5. Data loading and feature extraction
# ===================================================================
# Feature-extraction functions follow.
# --- One feature-extraction function is defined per model ---

def _featurize_smiles(smiles_list: List[str], *, include_maccs: bool) -> np.ndarray:
    """Build the feature matrix shared by both model-specific extractors.

    Each row is the 1024-bit Morgan fingerprint (radius 2), optionally followed
    by the 167 MACCS keys, followed by every RDKit descriptor listed in
    ``Descriptors._descList``.

    Args:
        smiles_list: SMILES strings, one per output row.
        include_maccs: Whether to insert the MACCS keys after the Morgan bits.

    Returns:
        Float array of shape ``(len(smiles_list), num_features)``. Rows whose
        SMILES cannot be parsed or featurized are all zeros. NaN and infinite
        values are replaced by 0.
    """
    smiles_list = list(smiles_list)
    descriptor_names = [desc[0] for desc in Descriptors._descList]
    desc_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    morgan_gen = GetMorganGenerator(radius=2, fpSize=1024)

    num_features = 1024 + (167 if include_maccs else 0) + len(descriptor_names)
    # Pre-allocating keeps the result 2-D even for empty input and makes the
    # all-zero row the default for molecules that fail.
    feature_matrix = np.zeros((len(smiles_list), num_features), dtype=np.float64)
    failed = 0

    for row, smiles in enumerate(smiles_list):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                failed += 1
                continue

            parts = [np.array(morgan_gen.GetFingerprintAsNumPy(mol))]
            if include_maccs:
                parts.append(np.array(MACCSkeys.GenMACCSKeys(mol)))
            parts.append(np.array(desc_calculator.CalcDescriptors(mol)))
            feature_matrix[row] = np.concatenate(parts)
        except Exception:
            # Deliberate best effort: one bad molecule must not abort the run.
            # The row falls back to zeros and the failure is counted below.
            feature_matrix[row] = 0.0
            failed += 1

    if failed:
        logger.warning(
            f"Feature extraction failed for {failed} of {len(smiles_list)} SMILES; "
            "their feature rows are all zeros."
        )

    return np.nan_to_num(feature_matrix, nan=0.0, posinf=0.0, neginf=0.0)


def extract_lgbm_features(smiles_list: List[str]) -> np.ndarray:
    """
    Generate the same features that were used to train the LGBM model.
    (MorganFP + MACCS Keys + All Descriptors)

    Args:
        smiles_list: SMILES strings, one per output row.

    Returns:
        2-D float array with one row per SMILES; failed molecules yield zero rows.
    """
    logger.info("LGBM用の特徴量を抽出中...")
    return _featurize_smiles(smiles_list, include_maccs=True)


def extract_xgb_features(smiles_list: List[str]) -> np.ndarray:
    """
    Generate the same features that were used to train the XGBoost model.
    (MorganFP + All Descriptors)

    Args:
        smiles_list: SMILES strings, one per output row.

    Returns:
        2-D float array with one row per SMILES; failed molecules yield zero rows.
    """
    logger.info("XGBoost用の特徴量を抽出中...")
    return _featurize_smiles(smiles_list, include_maccs=False)


In [None]:
# ===================================================================
# 6. CV学習・推論クラス
# ===================================================================
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
class PolymerCVTrainer:
    """K-fold cross-validation trainer for one gradient-boosting model type.

    For every target column, a K-fold split (``CFG.n_splits`` folds) is run
    over the rows that have a label for that target. Each fold fits a
    ``StandardScaler`` and a LightGBM or XGBoost regressor, saves both with
    joblib, and contributes out-of-fold and test-set predictions.
    """

    def __init__(self, model_type: str, base_params: Dict[str, Any], model_save_dir: str, seed: int = 42):
        """Validate the model type and create the model output directory.

        Args:
            model_type: Either ``'lgbm'`` or ``'xgb'``.
            base_params: Optional mapping of target name to parameter dict.
                Targets missing from it fall back to the module-level
                ``LGBM_PARAMS`` / ``XGB_BEST_PARAMS``.
            model_save_dir: Directory for fitted models and scalers; created
                if it does not exist.
            seed: Seed for the fold split and for XGBoost's ``random_state``.

        Raises:
            ValueError: If ``model_type`` is not ``'lgbm'`` or ``'xgb'``.
        """
        if model_type not in ('lgbm', 'xgb'):
            raise ValueError("model_typeは 'lgbm' または 'xgb' である必要があります")
        self.model_type = model_type
        self.base_params = base_params
        self.model_save_dir = model_save_dir
        os.makedirs(self.model_save_dir, exist_ok=True)
        self.seed = seed

    def _params_for(self, target: str) -> Dict[str, Any]:
        """Return a fresh parameter dict for ``target``.

        The result is always a copy, so the fixed XGBoost settings applied
        here never leak into the shared tuned-parameter dictionaries.
        """
        if self.base_params and target in self.base_params:
            source = self.base_params
        elif self.model_type == 'xgb':
            source = XGB_BEST_PARAMS
        else:
            source = LGBM_PARAMS

        params = dict(source[target])
        if self.model_type == 'xgb':
            # Fixed settings layered on top of the tuned XGBoost parameters.
            params['random_state'] = self.seed
            params['objective'] = 'reg:absoluteerror'
            params['n_estimators'] = 2000
        return params

    def train_predict(self, train_df: pd.DataFrame, test_df: pd.DataFrame, targets: List[str], feature_extractor_func) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Run cross-validated training for every target.

        Args:
            train_df: Training data with a ``SMILES`` column and one column
                per target. Missing labels are skipped per target.
            test_df: Test data with a ``SMILES`` column.
            targets: Names of the target columns to model.
            feature_extractor_func: Callable mapping a list of SMILES strings
                to a 2-D feature array with one row per string.

        Returns:
            ``(oof_preds, final_test_preds)``: out-of-fold predictions indexed
            like ``train_df`` (NaN where a target is unlabelled), and test
            predictions averaged over the folds.
        """
        X = feature_extractor_func(train_df['SMILES'].tolist())
        X_test = feature_extractor_func(test_df['SMILES'].tolist())

        oof_preds = pd.DataFrame(index=train_df.index, columns=targets, dtype=np.float32)
        test_preds_all_folds = np.zeros((len(test_df), len(targets), CFG.n_splits))

        kf = KFold(n_splits=CFG.n_splits, shuffle=True, random_state=self.seed)

        for i, target in enumerate(targets):
            logger.info(f"===== Processing Target: {target} =====")

            # Only rows that have a label for this target take part in its CV.
            y = train_df[target]
            valid_indices = y.notna()
            X_target = X[valid_indices.to_numpy()]
            y_target = y[valid_indices]

            # Resolved once per target; the dict is a private copy.
            model_params = self._params_for(target)

            for fold, (train_idx, val_idx) in enumerate(kf.split(X_target, y_target)):
                logger.info(f"--- Fold {fold+1}/{CFG.n_splits} ---")
                X_train, X_val = X_target[train_idx], X_target[val_idx]
                y_train, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]

                # The scaler is fitted on the training fold only and reused
                # for the validation fold and the test set.
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)
                X_test_scaled = scaler.transform(X_test)

                if self.model_type == 'lgbm':
                    model = lgb.LGBMRegressor(**model_params)
                    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
                else:
                    model = xgb.XGBRegressor(**model_params, early_stopping_rounds=50)
                    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False)

                # Save the scaler together with the model so the same
                # transform can be applied when the model is reloaded.
                scaler_path = os.path.join(self.model_save_dir, f"{self.model_type}_{target}_fold_{fold}_scaler.joblib")
                joblib.dump(scaler, scaler_path)
                model_path = os.path.join(self.model_save_dir, f"{self.model_type}_{target}_fold_{fold}.joblib")
                joblib.dump(model, model_path)
                logger.info(f"Model saved to: {model_path}")

                val_preds = model.predict(X_val_scaled)
                oof_preds.loc[y_target.index[val_idx], target] = val_preds
                test_preds_all_folds[:, i, fold] = model.predict(X_test_scaled)

        # Average the per-fold test predictions.
        final_test_preds = pd.DataFrame(test_preds_all_folds.mean(axis=2), columns=targets)

        for target in targets:
            # Score on labelled rows only; every one of them has an OOF prediction.
            labelled = train_df[target].notna()
            score = mean_absolute_error(train_df.loc[labelled, target], oof_preds.loc[labelled, target])
            logger.info(f"Local CV Score for {target}: {score:.4f}")

        return oof_preds, final_test_preds


In [None]:
# Load the merged training data and the competition test set.
full_train_df = load_data(CFG)
test_df = pd.read_csv(os.path.join(CFG.data_dir, 'test.csv'))

# --- LGBM cross-validated training ---
logger.info("========== Starting LGBM CV Training ==========")
lgbm_trainer = PolymerCVTrainer(model_type='lgbm', base_params=LGBM_PARAMS, model_save_dir=CFG.lgbm_model_dir, seed=CFG.seed)
# (Comment out the next line if LGBM training takes too long; note that the
# result-saving cell below uses oof_preds_lgbm and test_preds_lgbm.)
oof_preds_lgbm, test_preds_lgbm = lgbm_trainer.train_predict(full_train_df, test_df, CFG.targets, extract_lgbm_features)

In [None]:
# --- XGBoost cross-validated training (using fine-tuned parameters) ---
logger.info("========== Starting XGBoost CV Training with Fine-Tuned Params ==========")
xgb_trainer = PolymerCVTrainer(model_type='xgb', base_params={}, model_save_dir=CFG.xgb_model_dir, seed=CFG.seed) # base_params may be empty: the trainer sets the XGBoost parameters internally
oof_preds_xgb, test_preds_xgb = xgb_trainer.train_predict(full_train_df, test_df, CFG.targets, extract_xgb_features)


In [None]:
# --- Save results ---
# All four files are written without the index column so the LGBM and XGB
# outputs share one layout (previously only the LGBM files carried an extra
# unnamed index column). Row order is preserved, so rows can still be matched
# by position.
oof_preds_lgbm.to_csv(os.path.join(CFG.output_dir, 'oof_preds_lgbm.csv'), index=False)
test_preds_lgbm.to_csv(os.path.join(CFG.output_dir, 'test_preds_lgbm.csv'), index=False)
oof_preds_xgb.to_csv(os.path.join(CFG.output_dir, 'oof_preds_xgb.csv'), index=False)
test_preds_xgb.to_csv(os.path.join(CFG.output_dir, 'test_preds_xgb.csv'), index=False)
logger.info("OOF予測とテスト予測をファイルに保存しました。")

In [None]:
# --- Build the submission file (XGB predictions only for now) ---
# Start from the test ids and attach one prediction column per target, in the
# order given by CFG.targets; columns are aligned on the row index.
submission_df = pd.DataFrame({'id': test_df['id']}).assign(
    **{target: test_preds_xgb[target] for target in CFG.targets}
)
submission_df.to_csv('submission.csv', index=False)
logger.info("✅ 提出ファイル 'submission.csv' を作成しました。")

In [None]:
# Preview the first rows of the submission as a quick sanity check.
submission_df.head()

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Read back the OOF prediction files from the directory the CV cells saved
# them to (CFG.output_dir), plus the ground-truth labels from train.csv.
oof_lgbm = pd.read_csv(os.path.join(CFG.output_dir, 'oof_preds_lgbm.csv'))
oof_xgb = pd.read_csv(os.path.join(CFG.output_dir, 'oof_preds_xgb.csv'))
train_df = pd.read_csv(os.path.join(CFG.data_dir, 'train.csv'))  # ground-truth labels

# Collect, once per target, the labels and both models' OOF predictions.
# A row is used only when the label and both predictions are present, so the
# three arrays always have the same length.
# NOTE(review): rows are matched by index, i.e. OOF row i is assumed to be
# train.csv row i — confirm this holds for the merged training frame.
blend_inputs = []
for target in CFG.targets:
    labelled = train_df.index[train_df[target].notna()]
    y_true = train_df.loc[labelled, target]
    pred_lgbm = oof_lgbm.loc[labelled, target]
    pred_xgb = oof_xgb.loc[labelled, target]
    usable = (pred_lgbm.notna() & pred_xgb.notna()).to_numpy()
    blend_inputs.append(
        (y_true.to_numpy()[usable], pred_lgbm.to_numpy()[usable], pred_xgb.to_numpy()[usable])
    )

best_score = float('inf')
best_weight = 0

# Search the LGBM weight in steps of 0.01; the XGB weight is the complement.
for w in np.arange(0, 1.01, 0.01):
    # MAE of the weighted blend per target, averaged with equal target weights.
    scores = [
        mean_absolute_error(y_true, w * pred_lgbm + (1 - w) * pred_xgb)
        for y_true, pred_lgbm, pred_xgb in blend_inputs
    ]
    avg_score = np.mean(scores)

    if avg_score < best_score:
        best_score = avg_score
        best_weight = w

print(f"最適なLGBMの重み: {best_weight:.2f}")
print(f"最適なXGBの重み: {1-best_weight:.2f}")
print(f"予測されるベストスコア: {best_score:.5f}")