<a href="https://colab.research.google.com/github/nagasora/NeurIPS---Open-Polymer-Prediction-2025/blob/main/GNN_CV_ipynb_%E3%81%AE%E3%82%B3%E3%83%94%E3%83%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: some Kaggle data sources are private.
# Run this cell first so that the kagglehub download calls made later in
# the notebook can access your Kaggle data sources.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to import your Kaggle data sources,
# then feel free to delete it.
# NOTE: this notebook environment differs from Kaggle's Python
# environment, so libraries used by the notebook may be missing.

# Each call returns a path, which is kept in a variable named after the source.
neurips_open_polymer_prediction_2025_path = kagglehub.competition_download('neurips-open-polymer-prediction-2025')
senkin13_rdkit_2025_3_3_cp311_path = kagglehub.dataset_download('senkin13/rdkit-2025-3-3-cp311')
minatoyukinaxlisa_tc_smiles_path = kagglehub.dataset_download('minatoyukinaxlisa/tc-smiles')

print('Data source import complete.')


In [None]:
# Show where the competition data was downloaded to.
# NOTE(review): CFG.data_dir below is hard-coded; confirm it matches this path.
print(f'neurips_open_polymer_prediction_2025_path: {neurips_open_polymer_prediction_2025_path}')

In [None]:
# ===================================================================
# 1. Install libraries
# ===================================================================
# IPython shell magics: these lines only work inside a notebook kernel.
!pip install rdkit --quiet
!pip install torch --quiet
!pip install torch_geometric --quiet

In [None]:
# ===================================================================
# 2. 必要なライブラリのインポート
# ===================================================================
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
import warnings
import logging
import os
import random
import joblib
import warnings
warnings.filterwarnings('ignore')


from rdkit import Chem
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import torch.nn.functional as F # Import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as PyGDataLoader
from torch_geometric.nn import GINConv, global_add_pool

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

In [None]:
# ===================================================================
# 3. 設定クラスと基本設定
# ===================================================================
class CFG:
    """Central configuration for data paths, model size and training.

    All values are class attributes, so the class itself is passed around
    and read as ``CFG.<name>``; it is never instantiated in this notebook.
    """

    # --- Reproducibility / cross-validation ---
    seed = 42
    n_splits = 5
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Paths ---
    data_dir = '/root/.cache/kagglehub/competitions/neurips-open-polymer-prediction-2025/'
    # Supplementary CSVs live in a sub-folder of the competition data.
    supp_dir = os.path.join(data_dir, 'train_supplement/')
    output_dir = '/content/drive/MyDrive/kaggle notebook/NeurIPS - Open Polymer Prediction 2025/LGBM+XGB+GNN_CV/'
    lgbm_model_dir = os.path.join(output_dir, 'models_lgbm')
    xgb_model_dir = os.path.join(output_dir, 'models_xgb')

    # --- Prediction targets (declared once; column order of every label array) ---
    targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

    # --- Model hyperparameters ---
    node_feature_dim = 40  # width of the node-feature matrix fed to the first GNN layer
    hidden_dim = 256
    output_dim = len(targets)  # one regression output per target, kept in sync with `targets`
    num_gnn_layers = 4
    dropout_rate = 0.2

    # --- Training hyperparameters ---
    epochs = 100  # upper bound; early stopping may end training sooner
    batch_size = 64
    learning_rate = 1e-4
    weight_decay = 1e-5
    patience = 10  # epochs without improvement before early stopping

def seed_everything(seed: int) -> None:
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and asks cuDNN for deterministic algorithms.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning so repeated runs select the same kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Seed all random number generators with the configured seed.
seed_everything(CFG.seed)

# Install an "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

In [None]:

def load_data(cfg: "CFG") -> pd.DataFrame:
    """Load the main training table and merge the supplementary datasets.

    The supplementary files contribute labels for a single target each
    (``dataset1`` -> Tc, ``dataset3`` -> Tg, ``dataset4`` -> FFV). Rows that
    share a SMILES string are collapsed into one row that keeps, per column,
    the first non-missing value. A supplementary label therefore fills a gap
    in the main table instead of being discarded as a duplicate. Values
    already present in the main table take precedence.

    Args:
        cfg: Configuration object providing ``data_dir`` and ``supp_dir``.

    Returns:
        The merged training frame with one row per SMILES. If any
        supplementary file is missing, the main table is returned unchanged.
        If the main file is missing, an empty DataFrame is returned.
    """
    print("データの読み込みを開始します...")

    # 1. Main training data.
    main_train_path = os.path.join(cfg.data_dir, 'train.csv')
    try:
        train_df = pd.read_csv(main_train_path)
    except FileNotFoundError:
        print(f"メイン学習データが見つかりません: {main_train_path}")
        # Without the main table there is nothing to train on.
        return pd.DataFrame()
    print(f"メイン学習データを読み込みました。サンプル数: {len(train_df)}")

    # 2. Supplementary data (all-or-nothing: one missing file skips the merge).
    try:
        # dataset1: Tc labels, stored under the column name TC_mean.
        supp1_path = os.path.join(cfg.supp_dir, 'dataset1.csv')
        supp1 = pd.read_csv(supp1_path)[['SMILES', 'TC_mean']].rename(columns={'TC_mean': 'Tc'})

        # dataset3: Tg labels.
        supp3_path = os.path.join(cfg.supp_dir, 'dataset3.csv')
        supp3 = pd.read_csv(supp3_path)[['SMILES', 'Tg']]

        # dataset4: FFV labels.
        supp4_path = os.path.join(cfg.supp_dir, 'dataset4.csv')
        supp4 = pd.read_csv(supp4_path)[['SMILES', 'FFV']]
    except FileNotFoundError as e:
        print(f"⚠️ 追加データが見つかりませんでした: {e}")
        print("メインデータのみで学習を続行します。")
        return train_df

    # 3. Stack everything, main table first so its values win on conflicts.
    combined = pd.concat([train_df, supp1, supp3, supp4], ignore_index=True)
    column_order = list(combined.columns)

    # GroupBy.first() returns the first non-null entry of each column, which
    # merges the labels of repeated SMILES instead of dropping later rows.
    # sort=False keeps the order of first appearance; rows whose SMILES is
    # missing are dropped by groupby.
    train_df = (
        combined.groupby('SMILES', sort=False, as_index=False)
        .first()[column_order]
        .reset_index(drop=True)
    )

    print(f"✅ 追加データを統合しました。総学習サンプル数: {len(train_df)}")
    return train_df

In [None]:
# ===================================================================
# SMILESからグラフへの変換
# ===================================================================
def smiles_to_graph(smiles: str) -> Data:
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Each atom becomes a node described by seven numeric features (atomic
    number, degree, formal charge, radical electrons, hybridization code,
    aromatic flag, total hydrogens), zero-padded to
    ``CFG.node_feature_dim`` columns. Each bond becomes two directed edges.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A ``Data`` object with ``x`` (node features) and ``edge_index``
        of shape (2, num_edges), or ``None`` when RDKit cannot parse the
        string or the parsed molecule has no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms has no feature matrix to build.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Node (atom) features; everything is cast to a plain float so the tensor
    # constructor never sees RDKit enum objects or Python bools.
    atom_features = [
        [
            float(atom.GetAtomicNum()),
            float(atom.GetDegree()),
            float(atom.GetFormalCharge()),
            float(atom.GetNumRadicalElectrons()),
            float(int(atom.GetHybridization())),
            float(atom.GetIsAromatic()),
            float(atom.GetTotalNumHs()),
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features, dtype=torch.float)

    # Edge (bond) list; both directions are stored to model an undirected graph.
    edge_indices = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_indices.extend([[i, j], [j, i]])

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    else:
        # No bonds (e.g. a single atom): keep the (2, 0) layout rather than
        # the 1-D empty tensor that transposing an empty list would give.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Zero-pad the feature columns up to the width the model expects.
    if x.shape[1] < CFG.node_feature_dim:
        padding = torch.zeros(x.shape[0], CFG.node_feature_dim - x.shape[1])
        x = torch.cat([x, padding], dim=1)

    return Data(x=x, edge_index=edge_index)


# ===================================================================
# GNNモデルの定義
# ===================================================================
class PolymerGNN(nn.Module):
    """GIN-based graph neural network for polymer property regression.

    A stack of ``GINConv`` layers (each followed by ReLU and batch
    normalisation) produces node embeddings. These are sum-pooled per graph
    and passed through an MLP head with ``CFG.output_dim`` outputs.
    """

    def __init__(self):
        super().__init__()
        hidden = CFG.hidden_dim

        # The first layer consumes the raw node features; every following
        # layer maps hidden -> hidden.
        layer_input_dims = [CFG.node_feature_dim] + [hidden] * (CFG.num_gnn_layers - 1)

        self.convs = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        for in_dim in layer_input_dims:
            update_mlp = nn.Sequential(
                nn.Linear(in_dim, hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
            )
            self.convs.append(GINConv(update_mlp))
            self.batch_norms.append(nn.BatchNorm1d(hidden))

        # Prediction head applied to the pooled graph embedding.
        self.mlp = nn.Sequential(
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Dropout(CFG.dropout_rate),
            nn.Linear(hidden // 2, CFG.output_dim),
        )

    def forward(self, data: Data) -> torch.Tensor:
        """Return one prediction row per graph in the batch ``data``."""
        node_repr = data.x
        for gin_layer, norm in zip(self.convs, self.batch_norms):
            node_repr = norm(F.relu(gin_layer(node_repr, data.edge_index)))

        # Sum node embeddings within each graph to get a graph-level vector.
        graph_repr = global_add_pool(node_repr, data.batch)
        return self.mlp(graph_repr)


# ===================================================================
# 4. Datasetと損失関数の再定義（スケーリング対応）
# ===================================================================
class PolymerDataset(Dataset):
    """Dataset yielding ``(graph, labels)`` pairs built from a SMILES frame.

    Args:
        df: Frame with a ``SMILES`` column and one column per entry of
            ``CFG.targets``.
        target_scalers: Optional mapping from target name to a fitted scaler.
            When given (and non-empty), labels are returned scaled. Missing
            labels stay NaN either way.
    """

    def __init__(self, df: pd.DataFrame, target_scalers: Dict[str, StandardScaler] = None):
        self.smiles = df['SMILES'].values
        # Unscaled labels, kept so callers can read the original values.
        self.labels_original = df[CFG.targets].values
        self.target_scalers = target_scalers

    def __len__(self) -> int:
        return len(self.smiles)

    def __getitem__(self, idx: int) -> Tuple:
        graph = smiles_to_graph(self.smiles[idx])
        raw_labels = self.labels_original[idx]

        # No scalers configured: hand back the labels as they are.
        if not self.target_scalers:
            return graph, torch.tensor(raw_labels, dtype=torch.float)

        # Scale each observed label with its own scaler; NaN marks "missing".
        scaled = np.full_like(raw_labels, np.nan, dtype=np.float32)
        for col, target_name in enumerate(CFG.targets):
            value = raw_labels[col]
            if np.isnan(value):
                continue
            scaler = self.target_scalers[target_name]
            scaled[col] = scaler.transform(value.reshape(1, 1))[0, 0]
        return graph, torch.tensor(scaled, dtype=torch.float)

# ===================================================================
# 損失関数と学習・評価ループ
# ===================================================================
class MaskedMAELoss(nn.Module):
    """Mean absolute error that ignores NaN entries in the target tensor."""

    def __init__(self):
        super().__init__()
        self.mae = nn.L1Loss(reduction='none')

    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
        """Return the MAE over the positions where ``y_true`` is not NaN.

        Args:
            y_pred: Predictions, same shape as ``y_true``.
            y_true: Targets; NaN marks a missing label.

        Returns:
            Scalar loss. If no target is observed the result is 0, still
            attached to ``y_pred``'s graph, so ``backward()`` gives zero
            gradients instead of NaN.
        """
        mask = ~torch.isnan(y_true)
        if not mask.any():
            # The mean of an empty selection would be NaN and poison the weights.
            return y_pred.sum() * 0.0
        return self.mae(y_pred[mask], y_true[mask]).mean()

def train_fn(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one epoch and return the mean batch loss.

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Yields ``(graph_batch, labels)`` pairs.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for graph_batch, targets in tqdm(dataloader, desc="Training"):
        graph_batch = graph_batch.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(graph_batch), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

def _inverse_scale_columns(scaled: np.ndarray, target_scalers) -> np.ndarray:
    """Undo per-target scaling column by column, leaving NaN entries as NaN."""
    restored = np.full(scaled.shape, np.nan, dtype=np.float64)
    for col, target in enumerate(CFG.targets):
        present = ~np.isnan(scaled[:, col])
        if present.any():
            restored[present, col] = target_scalers[target].inverse_transform(
                scaled[present, col].reshape(-1, 1)
            ).ravel()
    return restored


def eval_fn(model, dataloader, criterion, device, target_scalers):
    """Evaluate ``model`` and report per-target MAE in original units.

    The model predicts scaled targets. Predictions and the labels that came
    with each batch are both mapped back through ``target_scalers`` before
    the MAE is computed. Taking the labels from the batch guarantees that
    every prediction is compared with the label of the same sample. The
    earlier code indexed the dataset's label array by each graph's node
    count, which paired predictions with unrelated rows.

    Args:
        model: Network to evaluate; switched to eval mode.
        dataloader: Yields ``(graph_batch, scaled_labels)`` pairs.
        criterion: Loss called as ``criterion(scaled_preds, scaled_labels)``.
        device: Device the batches are moved to.
        target_scalers: Mapping from target name to the fitted scaler.

    Returns:
        Tuple ``(mean_loss, scores)``: the mean batch loss on the scaled
        values, and a dict mapping each entry of ``CFG.targets`` to its MAE
        over the samples that have a label for that target.
    """
    model.eval()
    total_loss = 0
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for data, scaled_labels in dataloader:
            data, scaled_labels = data.to(device), scaled_labels.to(device)

            # The model works entirely in the scaled label space.
            scaled_preds = model(data)
            loss = criterion(scaled_preds, scaled_labels)
            total_loss += loss.item()

            # Back to original units. The labels are float32 after the
            # round trip, so they match the raw values only up to
            # float32 precision.
            pred_chunks.append(
                _inverse_scale_columns(scaled_preds.cpu().numpy(), target_scalers)
            )
            label_chunks.append(
                _inverse_scale_columns(scaled_labels.cpu().numpy(), target_scalers)
            )

    all_preds_inversed = np.concatenate(pred_chunks)
    all_labels_original = np.concatenate(label_chunks)

    # MAE per target, restricted to the samples that carry that label.
    final_scores = {}
    for i, target in enumerate(CFG.targets):
        mask = ~np.isnan(all_labels_original[:, i])
        final_scores[target] = mean_absolute_error(
            all_labels_original[mask, i], all_preds_inversed[mask, i]
        )

    return total_loss / len(dataloader), final_scores

In [None]:
# Full training table, including the supplementary data.
train_val_df = load_data(CFG)

kf = KFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
# Float frame pre-filled with NaN; each fold writes its validation rows below.
oof_preds = pd.DataFrame(np.nan, index=train_val_df.index, columns=CFG.targets, dtype=float)

# Create the output directory if it doesn't exist
os.makedirs(CFG.output_dir, exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_df)):
    print(f"========== Fold {fold+1}/{CFG.n_splits} ==========")
    train_fold_df = train_val_df.iloc[train_idx]
    valid_fold_df = train_val_df.iloc[val_idx]

    # --- Fit one scaler per target on the training fold only, then save ---
    target_scalers = {}
    for target in CFG.targets:
        scaler = StandardScaler()
        valid_targets = train_fold_df[target].dropna().values.reshape(-1, 1)
        scaler.fit(valid_targets)
        target_scalers[target] = scaler

    scaler_path = os.path.join(CFG.output_dir, f"gnn_target_scalers_fold_{fold}.joblib")
    joblib.dump(target_scalers, scaler_path)
    print(f"Target scalers for fold {fold} saved to {scaler_path}")

    # --- Datasets and loaders ---
    train_dataset = PolymerDataset(train_fold_df, target_scalers)
    valid_dataset = PolymerDataset(valid_fold_df, target_scalers)
    train_loader = PyGDataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
    # shuffle=False: validation batches follow the row order of valid_fold_df.
    valid_loader = PyGDataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False)

    # --- Model, loss, optimiser ---
    model = PolymerGNN().to(CFG.device)
    criterion = MaskedMAELoss()
    optimizer = optim.Adam(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)

    # --- Training loop with early stopping ---
    best_val_score = float('inf')
    patience_counter = 0
    best_saved = False  # guards against loading a stale checkpoint from an earlier run
    model_path = os.path.join(CFG.output_dir, f"best_gnn_model_fold_{fold}.pth")

    for epoch in range(CFG.epochs):
        train_loss = train_fn(model, train_loader, optimizer, criterion, CFG.device)
        val_loss, val_scores = eval_fn(model, valid_loader, criterion, CFG.device, target_scalers)

        avg_val_score = np.mean(list(val_scores.values()))
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Valid MAE: {avg_val_score:.4f}")

        if avg_val_score < best_val_score:
            best_val_score = avg_val_score
            torch.save(model.state_dict(), model_path)
            best_saved = True
            print(f"⭐️ Best model saved with score: {best_val_score:.4f}")
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= CFG.patience:
            print("Early stopping triggered.")
            break

    # --- Out-of-fold predictions from the best checkpoint of this fold ---
    if best_saved:
        model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model.eval()
    fold_scaled_preds = []
    with torch.no_grad():
        for data, _ in valid_loader:
            fold_scaled_preds.append(model(data.to(CFG.device)).cpu().numpy())
    fold_scaled_preds = np.concatenate(fold_scaled_preds)

    # Undo the target scaling and store the rows at their original positions.
    for i, target in enumerate(CFG.targets):
        oof_preds.iloc[val_idx, i] = target_scalers[target].inverse_transform(
            fold_scaled_preds[:, [i]]
        ).ravel()

print("GNNモデルのCV学習が完了しました。🎉")

In [None]:
# ===================================================================
# Save OOF predictions and test predictions
# ===================================================================

# Save the out-of-fold predictions.
oof_preds_path = os.path.join(CFG.output_dir, "gnn_oof_predictions.csv")
oof_preds.to_csv(oof_preds_path, index=False)
print(f"✅ OOF predictions saved to: {oof_preds_path}")

# Save the test predictions. This notebook does not create `test_preds_df`
# (test loading and inference have to be added separately), so export only
# when an earlier cell has defined it instead of failing with NameError.
test_preds_path = os.path.join(CFG.output_dir, "gnn_test_predictions.csv")
if 'test_preds_df' in globals():
    test_preds_df.to_csv(test_preds_path, index=False)
    print(f"✅ Test predictions saved to: {test_preds_path}")
else:
    print("⚠️ test_preds_df is not defined; skipping test prediction export.")

In [None]:
import json

# (Runs after the CV training loop has finished.)

print("Calculating final OOF CV scores...")

# Final CV score per target.
final_cv_scores = {}

for target in CFG.targets:
    y_true = train_val_df[target]
    # Coerce to float so an object-dtype OOF frame still works.
    y_pred = pd.to_numeric(oof_preds[target], errors='coerce')

    # Score only rows that have both a label and a prediction. Filtering the
    # two series independently can leave them with different lengths.
    valid = y_true.notna() & y_pred.notna()
    if not valid.any():
        print(f"⚠️ No OOF predictions available for {target}; skipping.")
        continue

    score = float(mean_absolute_error(y_true[valid], y_pred[valid]))
    final_cv_scores[target] = score
    print(f"Final OOF Score for {target}: {score:.5f}")

if not final_cv_scores:
    raise RuntimeError("No OOF predictions found; run the CV training cell first.")

# Average over the targets that could be scored.
average_score = float(np.mean(list(final_cv_scores.values())))
final_cv_scores['average_cv_score'] = average_score
print(f"Average OOF CV Score: {average_score:.5f}")

# Write the scores to a JSON file.
scores_path = os.path.join(CFG.output_dir, "gnn_cv_scores.json")
with open(scores_path, 'w') as f:
    json.dump(final_cv_scores, f, indent=4)

print(f"✅ Final CV scores saved to: {scores_path}")