 # NeurIPS 2025 – Open Polymer Prediction

 ### ChemBERTa baseline (multi-task, masked wMAE)

 ## 0  Environment & installs

 On Kaggle you normally pre-install `transformers`, but

 include the `pip` line for local / VS Code runs.

In [1]:
!pip install -q transformers

 ## 1  Imports & config

In [2]:
import os, json, random, math, re, gc, warnings, pathlib
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    AdamW,
    get_cosine_schedule_with_warmup,
)

warnings.filterwarnings("ignore")

SEED          = 42
MODEL_NAME    = "seyonec/ChemBERTa-zinc-base-v1"
BATCH_SIZE    = 16          # fits comfortably on Kaggle T4/V100
MAX_LEN       = 128
EPOCHS        = 4
LR            = 2e-5
WARMUP_RATIO  = 0.1
OUTPUT_DIR    = Path("./checkpoints")
OUTPUT_DIR.mkdir(exist_ok=True)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Device: cpu


 ## 2  Utility – weighted-MAE helper

 Compute the task weights \\(w_i\\) **once** from the available

 (public) training labels.

 Strictly the organisers use the **test-set** ranges, but using

 train-set statistics is the standard offline proxy.

In [3]:
TARGETS = ["Tg", "FFV", "Tc", "Density", "Rg"]

def compute_task_weights(df: pd.DataFrame, eps: float = 1e-8) -> torch.Tensor:
    ranges = df[TARGETS].max() - df[TARGETS].min()
    n_i    = df[TARGETS].notna().sum()
    K      = len(TARGETS)
    root_inv_n = np.sqrt(1.0 / (n_i + eps))
    weights = (1.0 / (ranges + eps)) * (K * root_inv_n / root_inv_n.sum())
    return torch.tensor(weights.values, dtype=torch.float32)


 ## 3  Load & clean data

 *Remove only rows with completely missing SMILES; keep all NaNs in targets.*

In [4]:
DATA_DIR = Path("/Users/maxhart/Documents/AI_AND_ML/NeuralIPS-Polymer/neurips-open-polymer-prediction-2025")  # adjust locally
train_csv = DATA_DIR / "train.csv"
test_csv  = DATA_DIR / "test.csv"

df_train = pd.read_csv(train_csv)
df_test  = pd.read_csv(test_csv)

# basic SMILES clean-up – ChemBERTa cannot handle '*'
def clean_smiles(s: str) -> str:
    s = re.sub(r"N\*", "N", s)
    s = re.sub(r"O\*", "O", s)
    s = s.replace("*", "")
    return s

df_train = df_train[df_train["SMILES"].notna()].copy()
df_train["SMILES"] = df_train["SMILES"].str.strip().apply(clean_smiles)
df_train[TARGETS]  = df_train[TARGETS].apply(pd.to_numeric, errors="coerce")

df_test["SMILES"]  = df_test["SMILES"].str.strip().apply(clean_smiles)

print(f"Train shape  : {df_train.shape}")
print(f"Test  shape  : {df_test.shape}")


Train shape  : (7973, 7)
Test  shape  : (3, 2)


 ## 4  Tokeniser (ChemBERTa)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def encode_smiles(smiles_list):
    return tokenizer(
        smiles_list,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )

# pre-encode full datasets – keeps dataloader lightweight
train_enc = encode_smiles(df_train["SMILES"].tolist())
test_enc  = encode_smiles(df_test["SMILES"].tolist())


 ## 5  Dataset & DataLoader with masked labels

In [6]:
class PolymerDataset(Dataset):
    def __init__(self, encodings, target_df=None):
        self.encodings = encodings
        if target_df is not None:
            self.targets = torch.tensor(target_df[TARGETS].values, dtype=torch.float32)
        else:
            self.targets = None

    def __len__(self):
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        if self.targets is not None:
            item["labels"] = self.targets[idx]
        return item

full_ds = PolymerDataset(train_enc, df_train)
test_ds = PolymerDataset(test_enc)

# simple random 90 / 10 split for validation
val_fraction = 0.1
val_size     = int(len(full_ds) * val_fraction)
train_size   = len(full_ds) - val_size
train_ds, val_ds = torch.utils.data.random_split(full_ds, [train_size, val_size],
                                                 generator=torch.Generator().manual_seed(SEED))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)


 ## 6  Model – ChemBERTa backbone + 5-head regressor

In [7]:
class ChemBERTaRegressor(nn.Module):
    def __init__(self, model_name: str, n_targets: int = 5, hidden: int = 128):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.reg  = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_targets),
        )

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        return self.reg(x)

model   = ChemBERTaRegressor(MODEL_NAME, n_targets=len(TARGETS)).to(device)
weights = compute_task_weights(df_train).to(device)
print("Task weights w_i:", weights.cpu().numpy().round(4))


Task weights w_i: [2.1000e-03 6.2390e-01 2.2200e+00 1.0641e+00 4.6600e-02]


 ## 7  Masked-wMAE loss + metric helpers

In [8]:
def wmae_loss(outputs, targets, w):
    mask = ~torch.isnan(targets)
    abs_diff = torch.abs(outputs - targets)
    weighted = abs_diff * w
    return weighted[mask].mean()

@torch.no_grad()
def evaluate(loader):
    model.eval()
    tot, cnt = 0.0, 0
    for batch in loader:
        outs = model(batch["input_ids"].to(device),
                     batch["attention_mask"].to(device))
        loss = wmae_loss(outs, batch["labels"].to(device), weights)
        tot += loss.item() * len(batch["input_ids"])
        cnt += len(batch["input_ids"])
    return tot / cnt


 ## 8  Training loop

In [9]:
optim   = AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
steps   = len(train_loader) * EPOCHS
sched   = get_cosine_schedule_with_warmup(
    optim,
    num_warmup_steps=int(steps * WARMUP_RATIO),
    num_training_steps=steps,
)

best_val = math.inf
for epoch in range(1, EPOCHS + 1):
    model.train()
    running = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}", leave=False):
        optim.zero_grad()
        outs = model(batch["input_ids"].to(device),
                     batch["attention_mask"].to(device))
        loss = wmae_loss(outs, batch["labels"].to(device), weights)
        loss.backward()
        optim.step()
        sched.step()
        running += loss.item()
    val_wmae = evaluate(val_loader)
    print(f"Epoch {epoch:02d} | train wMAE ≈ {running/len(train_loader):.4f} |"
          f" val wMAE = {val_wmae:.4f}")
    if val_wmae < best_val:
        best_val = val_wmae
        torch.save(model.state_dict(), OUTPUT_DIR / "best.pt")


Epoch 1:   0%|          | 0/448 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                                          

Epoch 01 | train wMAE ≈ 0.1304 | val wMAE = 0.0865


                                                          

Epoch 02 | train wMAE ≈ 0.0853 | val wMAE = 0.0774


                                                          

Epoch 03 | train wMAE ≈ 0.0777 | val wMAE = 0.0752


                                                          

Epoch 04 | train wMAE ≈ 0.0753 | val wMAE = 0.0734


 ## 9  Inference on test set & submission

In [10]:
model.load_state_dict(torch.load(OUTPUT_DIR / "best.pt"))
model.eval()

preds = []
with torch.no_grad():
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)
    for batch in tqdm(test_loader, desc="Infer"):
        out = model(batch["input_ids"].to(device),
                    batch["attention_mask"].to(device))
        preds.append(out.cpu().numpy())
preds = np.vstack(preds)

sub = pd.DataFrame(preds, columns=TARGETS)
sub.insert(0, "id", df_test["id"])
sub.to_csv("submission.csv", index=False)
sub.head()


Infer: 100%|██████████| 1/1 [00:00<00:00, 24.27it/s]


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,0.077876,0.377792,0.219466,1.216206,1.405315
1,1422188626,-0.005921,0.382061,0.244338,1.182076,1.396989
2,2032016830,0.056237,0.37709,0.291676,1.115605,1.303566
