In [1]:
!pip install optuna



In [2]:
import os, random, gc
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig
from tqdm import tqdm

import optuna

import warnings
warnings.filterwarnings('ignore')

In [3]:
COMP_PATH = Path("/kaggle/input/deep-learning-for-computer-vision-and-nlp-2026-01")
FOLDS_PATH = Path("/kaggle/input/petfinder-train-folds/train_folds.csv") 

train_csv = COMP_PATH / "train.csv"
test_csv = COMP_PATH / "test.csv"

print("train exists:", train_csv.exists())
print("test exists :", test_csv.exists())
print("folds exists:", FOLDS_PATH.exists())

train exists: True
test exists : True
folds exists: True


In [4]:
train_df = pd.read_csv(train_csv)
test_df  = pd.read_csv(test_csv)
folds_df = pd.read_csv(FOLDS_PATH)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("Folds shape:", folds_df.shape)

Train shape: (6431, 3)
Test shape : (1891, 2)
Folds shape: (6431, 3)


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6431 entries, 0 to 6430
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   PetID          6431 non-null   object
 1   Description    6426 non-null   object
 2   AdoptionSpeed  6431 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 150.9+ KB


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1891 entries, 0 to 1890
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PetID        1891 non-null   object
 1   Description  1890 non-null   object
dtypes: object(2)
memory usage: 29.7+ KB


In [7]:
train_df["Description"] = train_df["Description"].fillna("")
test_df["Description"] = test_df["Description"].fillna("")

In [8]:
print(train_df["AdoptionSpeed"].value_counts().sort_index())

AdoptionSpeed
1    1197
2    1773
3    1328
4    2133
Name: count, dtype: int64


In [None]:
print("train dataset:")
print(train_df.head(), "\n")
print("test dataset:")
print(test_df.head(), "\n")
print("folds dataset:")
print(folds_df.head())

train dataset:
       PetID                                        Description  AdoptionSpeed
0  d3b4f29f8  Mayleen and Flo are two lovely adorable sister...              2
1  e9dc82251  A total of 5 beautiful Tabbys available for ad...              2
2  8111f6d4a  Two-and-a-half month old girl. Very manja and ...              2
3  693a90fda  Neil is a healthy and active ~2-month-old fema...              2
4  9d08c85ef  Gray kitten available for adoption in sungai p...              2 

test dataset:
       PetID                                        Description
0  6697a7f62  This cute little puppy is looking for a loving...
1  23b64fe21  These 3 puppies was rescued from a mechanic sh...
2  41e824cbe  Ara needs a forever home! Believe me, he's a r...
3  6c3d7237b  i rescue this homeless dog 2 years ago but my ...
4  97b0b5d92  We found him at a shopping mall at a very clea... 

folds dataset:
       PetID  AdoptionSpeed  fold
0  d3b4f29f8              2     3
1  e9dc82251              

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
model_name = "bert-base-cased"
num_classes = 4

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

E0000 00:00:1769278382.170015      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769278382.228060      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769278382.690905      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769278382.690946      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769278382.690948      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769278382.690950      55 computation_placer.cc:177] computation placer already registered. Please check linka

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [12]:
text = train_df.loc[0, "Description"]  
inputs = tokenizer(text)
tokenizer.decode(inputs["input_ids"])

'[CLS] Mayleen and Flo are two lovely adorable sisters. They are very friendly and affectionate, but wary of strangers and make good watchdogs. Mayleen has golden hues on her face, making her a husky look - alike. Flo has a darker face with brown feet, and is the more outgoing and dominat of the two. Looking for good homes. Adopters must vaccinate and spay them. [SEP]'

In [13]:
inputs = tokenizer( 
    text, 
    max_length=200, 
    truncation=True,
    return_overflowing_tokens=True, 
) 
  
for ids in inputs["input_ids"]: 
    print(tokenizer.decode(ids)); print()

[CLS] Mayleen and Flo are two lovely adorable sisters. They are very friendly and affectionate, but wary of strangers and make good watchdogs. Mayleen has golden hues on her face, making her a husky look - alike. Flo has a darker face with brown feet, and is the more outgoing and dominat of the two. Looking for good homes. Adopters must vaccinate and spay them. [SEP]



In [14]:
inputs = tokenizer( 
    text, 
    max_length=200, 
    truncation=True, 
    return_overflowing_tokens=True, 
    return_offsets_mapping=True, 
 ) 
inputs.keys()
print(inputs.keys())
print("num chunks:", len(inputs["input_ids"]))

KeysView({'input_ids': [[101, 1318, 21180, 1105, 143, 2858, 1132, 1160, 9020, 27627, 5919, 119, 1220, 1132, 1304, 4931, 1105, 12721, 2193, 117, 1133, 16970, 1104, 15712, 1105, 1294, 1363, 2824, 14082, 1116, 119, 1318, 21180, 1144, 5404, 177, 10589, 1113, 1123, 1339, 117, 1543, 1123, 170, 24418, 1440, 118, 11609, 119, 143, 2858, 1144, 170, 9934, 1339, 1114, 3058, 1623, 117, 1105, 1110, 1103, 1167, 25194, 1105, 1202, 14503, 1204, 1104, 1103, 1160, 119, 8540, 1111, 1363, 4481, 119, 24930, 4184, 5759, 1538, 191, 7409, 16430, 2193, 1105, 22620, 1183, 1172, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [15]:
lengths = train_df["Description"].fillna("").astype(str).apply(
    lambda x: len(tokenizer(x, add_special_tokens=True, truncation=False)["input_ids"])
)

print("count:", lengths.shape[0])
print("min  :", lengths.min())
print("p50  :", int(lengths.quantile(0.50)))
print("p90  :", int(lengths.quantile(0.90)))
print("p95  :", int(lengths.quantile(0.95)))
print("p99  :", int(lengths.quantile(0.99)))
print("max  :", lengths.max())

Token indices sequence length is longer than the specified maximum sequence length for this model (1173 > 512). Running this sequence through the model will result in indexing errors


count: 6431
min  : 2
p50  : 66
p90  : 199
p95  : 287
p99  : 557
max  : 1487


In [16]:
class PetFinderDataset(Dataset):
    def __init__(self, texts, labels=None, max_length=287):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
       
    def __len__(self):
        return len(self.texts)
   
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
       
        item = {key: val.squeeze(0) for key, val in inputs.items()}
       
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
       
        return item

In [17]:
batch_size = 16
max_length = 288
n_folds = 5
num_workers = 0 
epochs = 4
lr = 2e-5
weight_decay=0.01
pin_memory = False

In [18]:
master_df = train_df.merge(
    folds_df[["PetID", "fold"]],
    on="PetID",
    how="left"
)
print("shape:", master_df.shape)
print("missing fold:", master_df["fold"].isna().sum())

shape: (6431, 4)
missing fold: 0


In [19]:
master_df["label"] = master_df["AdoptionSpeed"] - 1  # Convert to 0-3
print(master_df["AdoptionSpeed"].min(), master_df["AdoptionSpeed"].max())
print(master_df["label"].min(), master_df["label"].max())

1 4
0 3


In [20]:
for fold in range(n_folds):
    print("№ Fold:", fold)

    train = master_df[master_df["fold"] != fold]
    val = master_df[master_df["fold"] == fold]

    train_texts = train["Description"].values
    train_labels = train["label"].values

    val_texts = val["Description"].values
    val_labels = val["label"].values

    train_dataset = PetFinderDataset(
        texts=train_texts,
        labels=train_labels,
        max_length=max_length
    )

    val_dataset = PetFinderDataset(
        texts=val_texts,
        labels=val_labels,
        max_length=max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    print("Train batches:", len(train_loader))
    print(" Val batches:", len(val_loader), "\n")

№ Fold: 0
Train batches: 322
 Val batches: 81 

№ Fold: 1
Train batches: 322
 Val batches: 81 

№ Fold: 2
Train batches: 322
 Val batches: 81 

№ Fold: 3
Train batches: 322
 Val batches: 81 

№ Fold: 4
Train batches: 322
 Val batches: 81 



In [21]:
batch = next(iter(train_loader))

for k, v in batch.items():
    print(k, v.shape, v.dtype)

input_ids torch.Size([16, 288]) torch.int64
token_type_ids torch.Size([16, 288]) torch.int64
attention_mask torch.Size([16, 288]) torch.int64
labels torch.Size([16]) torch.int64


In [22]:
print("labels min/max:", batch["labels"].min().item(), batch["labels"].max().item())
print("unique (first 50):", torch.unique(batch["labels"])[:50])

labels min/max: 0 3
unique (first 50): tensor([0, 1, 2, 3])


In [23]:
model.train()
optimizer = AdamW(model.parameters(), lr=lr)

batch = next(iter(train_loader))
outputs = model(
    input_ids=batch["input_ids"].to(device),
    attention_mask=batch["attention_mask"].to(device),
    labels=batch["labels"].to(device),
)
loss = outputs.loss

optimizer.zero_grad()
loss.backward()
optimizer.step()

print("train step loss:", loss.item())

train step loss: 1.3669534921646118


In [24]:
model.eval()

with torch.no_grad():
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )

print("loss:", outputs.loss)
print("logits shape:", outputs.logits.shape)


loss: tensor(1.2724, device='cuda:0')
logits shape: torch.Size([16, 4])


In [25]:
def train_one_epoch(model, train_loader, optimizer, device, scheduler=None):
    pbar = tqdm(train_loader, desc="Training")
    model.train()
    total_loss = 0.0

    for batch in pbar:
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        if scheduler:
            scheduler.step()
        optimizer.step()
        total_loss += loss.item()
        avg_so_far = total_loss / (pbar.n + 1)
        pbar.set_postfix(loss=f"{loss.item():.4f}", avg=f"{avg_so_far:.4f}")
    avg_loss = total_loss / len(train_loader)
    return avg_loss

In [26]:
@torch.no_grad()
def validate_one_epoch(model, val_loader, device):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    pbar = tqdm(val_loader, desc="Validating")

    with torch.no_grad():
        for batch in pbar:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )

            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()   # 0..3
            labels = batch["labels"].cpu().numpy()              # 0..3

            all_preds.extend(preds)
            all_labels.extend(labels)

            avg_so_far = total_loss / (pbar.n + 1)
            pbar.set_postfix(avg_loss=f"{avg_so_far:.4f}")

    avg_loss = total_loss / len(val_loader)

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    qwk = cohen_kappa_score(all_labels + 1, all_preds + 1, weights="quadratic")

    return avg_loss, qwk

In [None]:
WEIGHTS_DIR = Path("/kaggle/working/artifacts")
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

for fold in range(n_folds):
    print(f"\n===== FOLD {fold} =====")

    train = master_df[master_df["fold"] != fold].reset_index(drop=True)
    val   = master_df[master_df["fold"] == fold].reset_index(drop=True)

    train_texts  = train["Description"].values
    train_labels = train["label"].values

    val_texts  = val["Description"].values
    val_labels = val["label"].values

    train_dataset = PetFinderDataset(train_texts, train_labels, max_length=max_length)
    val_dataset   = PetFinderDataset(val_texts,   val_labels,   max_length=max_length)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    print("Train batches:", len(train_loader))
    print(" Val batches:", len(val_loader))
    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=4).to(device)
    optimizer = AdamW(model.parameters(), lr=lr,weight_decay=weight_decay)

    best_qwk = -1e9
    best_path = f"{WEIGHTS_DIR}/nlp_{model_name}_fold{fold}.pth"

    # 6) epochs loop
    for epoch in range(epochs): #, epochs + 1
        print(f"\n--- Epoch {epoch}/{epochs} ---")

        train_loss = train_one_epoch(model, train_loader, optimizer, device)  
        val_loss, val_qwk = validate_one_epoch(model, val_loader, device)    

        print(f"train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f} | val_QWK: {val_qwk:.4f}")

        # 7) save best weights
        if val_qwk > best_qwk:
            best_qwk = val_qwk
            torch.save(model.state_dict(), best_path)
            print(f"✅ Saved best: {best_path} | best_QWK={best_qwk:.4f}")

    print(f"\nFOLD {fold} DONE | best_QWK={best_qwk:.4f}")


===== FOLD 0 =====
Train batches: 322
 Val batches: 81


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 0/6 ---


Training: 100%|██████████| 322/322 [02:29<00:00,  2.16it/s, avg=1.3548, loss=1.1107]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.84it/s, avg_loss=1.3244]


train_loss: 1.3548 | val_loss: 1.3244 | val_QWK: 0.0784
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold0.pth | best_QWK=0.0784

--- Epoch 1/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3065, loss=1.3584]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.3091]


train_loss: 1.3065 | val_loss: 1.3091 | val_QWK: 0.2506
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold0.pth | best_QWK=0.2506

--- Epoch 2/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.13it/s, avg=1.2053, loss=1.3368]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.3276]


train_loss: 1.2053 | val_loss: 1.3276 | val_QWK: 0.2663
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold0.pth | best_QWK=0.2663

--- Epoch 3/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.0063, loss=0.9027]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.5226]


train_loss: 1.0063 | val_loss: 1.5226 | val_QWK: 0.2572

--- Epoch 4/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.7202, loss=0.6113]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.6085]


train_loss: 0.7202 | val_loss: 1.6085 | val_QWK: 0.2893
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold0.pth | best_QWK=0.2893

--- Epoch 5/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.4398, loss=0.4131]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=2.1877]


train_loss: 0.4398 | val_loss: 2.1877 | val_QWK: 0.2478

FOLD 0 DONE | best_QWK=0.2893

===== FOLD 1 =====
Train batches: 322
 Val batches: 81


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 0/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3519, loss=1.3951]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.84it/s, avg_loss=1.2973]


train_loss: 1.3519 | val_loss: 1.2973 | val_QWK: 0.1381
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold1.pth | best_QWK=0.1381

--- Epoch 1/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.2733, loss=1.2343]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.2821]


train_loss: 1.2733 | val_loss: 1.2821 | val_QWK: 0.2009
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold1.pth | best_QWK=0.2009

--- Epoch 2/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.1354, loss=1.1807]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.3134]


train_loss: 1.1354 | val_loss: 1.3134 | val_QWK: 0.2804
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold1.pth | best_QWK=0.2804

--- Epoch 3/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.8508, loss=1.2345]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.84it/s, avg_loss=1.4734]


train_loss: 0.8508 | val_loss: 1.4734 | val_QWK: 0.2917
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold1.pth | best_QWK=0.2917

--- Epoch 4/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.5571, loss=0.6466]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.84it/s, avg_loss=1.9340]


train_loss: 0.5571 | val_loss: 1.9340 | val_QWK: 0.2707

--- Epoch 5/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.3499, loss=0.4673]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=2.1277]


train_loss: 0.3499 | val_loss: 2.1277 | val_QWK: 0.2492

FOLD 1 DONE | best_QWK=0.2917

===== FOLD 2 =====
Train batches: 322
 Val batches: 81


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 0/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3484, loss=1.0790]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.3221]


train_loss: 1.3484 | val_loss: 1.3221 | val_QWK: 0.1599
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold2.pth | best_QWK=0.1599

--- Epoch 1/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.2481, loss=1.3283]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.2843]


train_loss: 1.2481 | val_loss: 1.2843 | val_QWK: 0.2456
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold2.pth | best_QWK=0.2456

--- Epoch 2/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.0809, loss=0.9560]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.3101]


train_loss: 1.0809 | val_loss: 1.3101 | val_QWK: 0.2972
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold2.pth | best_QWK=0.2972

--- Epoch 3/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.8084, loss=0.9655]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.4732]


train_loss: 0.8084 | val_loss: 1.4732 | val_QWK: 0.2813

--- Epoch 4/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.5361, loss=0.1739]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.8062]


train_loss: 0.5361 | val_loss: 1.8062 | val_QWK: 0.2858

--- Epoch 5/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.3165, loss=0.1095]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=2.2364]


train_loss: 0.3165 | val_loss: 2.2364 | val_QWK: 0.2260

FOLD 2 DONE | best_QWK=0.2972

===== FOLD 3 =====
Train batches: 322
 Val batches: 81


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 0/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3625, loss=1.3971]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.3264]


train_loss: 1.3625 | val_loss: 1.3264 | val_QWK: 0.1442
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold3.pth | best_QWK=0.1442

--- Epoch 1/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3011, loss=1.4656]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.2976]


train_loss: 1.3011 | val_loss: 1.2976 | val_QWK: 0.1894
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold3.pth | best_QWK=0.1894

--- Epoch 2/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.2107, loss=1.5260]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.2840]


train_loss: 1.2107 | val_loss: 1.2840 | val_QWK: 0.2603
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold3.pth | best_QWK=0.2603

--- Epoch 3/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.0292, loss=1.0734]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.3210]


train_loss: 1.0292 | val_loss: 1.3210 | val_QWK: 0.2752
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold3.pth | best_QWK=0.2752

--- Epoch 4/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.7461, loss=0.7172]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.8774]


train_loss: 0.7461 | val_loss: 1.8774 | val_QWK: 0.2347

--- Epoch 5/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.5045, loss=0.6461]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.7750]


train_loss: 0.5045 | val_loss: 1.7750 | val_QWK: 0.2369

FOLD 3 DONE | best_QWK=0.2752

===== FOLD 4 =====
Train batches: 322
 Val batches: 81


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 0/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.3531, loss=1.4997]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.3193]


train_loss: 1.3531 | val_loss: 1.3193 | val_QWK: 0.1560
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.1560

--- Epoch 1/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=1.2680, loss=0.9825]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.83it/s, avg_loss=1.3046]


train_loss: 1.2680 | val_loss: 1.3046 | val_QWK: 0.2267
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.2267

--- Epoch 2/6 ---


Training: 100%|██████████| 322/322 [02:32<00:00,  2.12it/s, avg=1.1406, loss=0.8980]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.3214]


train_loss: 1.1406 | val_loss: 1.3214 | val_QWK: 0.2569
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.2569

--- Epoch 3/6 ---


Training: 100%|██████████| 322/322 [02:32<00:00,  2.12it/s, avg=0.8938, loss=1.1866]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.4228]


train_loss: 0.8938 | val_loss: 1.4228 | val_QWK: 0.2865
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.2865

--- Epoch 4/6 ---


Training: 100%|██████████| 322/322 [02:32<00:00,  2.12it/s, avg=0.6183, loss=0.1730]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.81it/s, avg_loss=1.7500]


train_loss: 0.6183 | val_loss: 1.7500 | val_QWK: 0.2933
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.2933

--- Epoch 5/6 ---


Training: 100%|██████████| 322/322 [02:31<00:00,  2.12it/s, avg=0.3879, loss=0.3165]
Validating: 100%|██████████| 81/81 [00:11<00:00,  6.82it/s, avg_loss=1.9869]


train_loss: 0.3879 | val_loss: 1.9869 | val_QWK: 0.3050
✅ Saved best: /kaggle/working/artifacts/nlp_bert-base-cased_fold4.pth | best_QWK=0.3050

FOLD 4 DONE | best_QWK=0.3050


In [31]:
ART = Path("/kaggle/working/artifacts")
ART.mkdir(parents=True, exist_ok=True)

In [32]:
oof = np.zeros((len(master_df), 4), dtype=np.float32)

for fold in range(n_folds):
    val = master_df[master_df.fold == fold].reset_index()
    val_dataset = PetFinderDataset(val["Description"].values, val["label"].values, max_length=max_length)
    val_loader  = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4).to(device)
    model.load_state_dict(torch.load(ART / f"nlp_{model_name}_fold{fold}.pth", map_location=device))
    model.eval()

    probs = []
    with torch.no_grad():
        for b in val_loader:
            out = model(input_ids=b["input_ids"].to(device), attention_mask=b["attention_mask"].to(device))
            probs.append(torch.softmax(out.logits, 1).cpu().numpy())
    oof[val["index"].values] = np.vstack(probs)

oof_df = pd.DataFrame({"PetID": master_df["PetID"].values})
for i in range(4): oof_df[f"nlp_proba_{i+1}"] = oof[:, i]
oof_df["nlp_pred"] = np.argmax(oof, 1) + 1
oof_df.to_csv(ART / "nlp_oof.csv", index=False)
print("saved:", ART / "nlp_oof.csv")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly i

saved: /kaggle/working/artifacts/nlp_oof.csv


In [34]:
test_dataset = PetFinderDataset(test_df["Description"].fillna("").values, labels=None, max_length=max_length)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

test_probs = np.zeros((len(test_df), 4), dtype=np.float32)

for fold in range(n_folds):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4).to(device)
    model.load_state_dict(torch.load(ART / f"nlp_{model_name}_fold{fold}.pth", map_location=device))
    model.eval()

    probs = []
    with torch.no_grad():
        for b in test_loader:
            out = model(input_ids=b["input_ids"].to(device), attention_mask=b["attention_mask"].to(device))
            probs.append(torch.softmax(out.logits, 1).cpu().numpy())
    test_probs += np.vstack(probs)

test_probs /= n_folds

test_out = pd.DataFrame({"PetID": test_df["PetID"].values})
for i in range(4): test_out[f"nlp_proba_{i+1}"] = test_probs[:, i]
test_out["nlp_pred"] = np.argmax(test_probs, 1) + 1
test_out.to_csv(ART / "nlp_test.csv", index=False)
print("saved:", ART / "nlp_test.csv")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly i

saved: /kaggle/working/artifacts/nlp_test.csv
