In [None]:
pip install -q transformers datasets sentencepiece accelerate bitsandbytes peft scikit-learn

In [None]:
import pandas as pd, torch, numpy as np
from transformers import (AutoTokenizer, AutoModel,
                          TrainingArguments, Trainer,
                          DataCollatorWithPadding)
from transformers.modeling_outputs import SequenceClassifierOutput
import sklearn.metrics as skm

In [None]:
# --- Experiment configuration -------------------------------------------------
# Reproducibility: used as `random_state` when the splits are subsampled.
SEED = 42

# Data: cap on training rows (validation/test are capped at one fifth of this).
MAX_SAMPLES = 100_000
# Big-5 trait columns: Openness, Conscientiousness, Extraversion,
# Agreeableness, Neuroticism.
TRAITS = list("OCEAN")
# Name of the raw text column.
RAW_COL = "text"

# Tokenisation / batching knobs.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook — confirm whether they are meant to be wired in.
MAX_LEN = 256
BATCH_SIZE = 15

In [1]:
def _load_split(path, limit):
    """Read one parquet split of the Pandora Big-5 dataset, keeping at most `limit` rows."""
    df = pd.read_parquet('hf://datasets/jingjietan/pandora-big5/' + path)
    if len(df) > limit:
        # Seeded subsample so repeated runs pick the same rows.
        df = df.sample(limit, random_state=SEED)
    return df


def load_pandora_binary_data():
    """Load the Pandora Big-5 splits and binarize every trait column.

    Each split is read from the `jingjietan/pandora-big5` dataset repository
    and subsampled (seeded with `SEED`) to at most `MAX_SAMPLES` rows for
    train and `MAX_SAMPLES // 5` rows for validation and test.

    For every trait in `TRAITS` the median of the *training* split is used as
    the cut-off for all three splits, so no validation/test statistics leak
    into the labels. A value equal to the median counts as positive (`>=`).

    Progress, thresholds and the resulting class balance are printed.

    Returns:
        tuple: `(train_df, val_df, test_df)` DataFrames whose trait columns
        hold 0/1 integers.
    """
    print("Loading Pandora dataset...")

    # NOTE(review): only shard 00001 of the two training shards is read —
    # confirm that skipping shard 00000 is intentional.
    splits = {
        'train': 'data/train-00001-of-00002.parquet',
        'validation': 'data/validation-00000-of-00001.parquet',
        'test': 'data/test-00000-of-00001.parquet'
    }

    train_df = _load_split(splits['train'], MAX_SAMPLES)
    val_df = _load_split(splits['validation'], MAX_SAMPLES // 5)
    test_df = _load_split(splits['test'], MAX_SAMPLES // 5)

    print(f"Dataset sizes: Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

    thresholds = {}
    for trait in TRAITS:
        # The cut-off is taken from the training split before it is overwritten.
        thresholds[trait] = train_df[trait].median()
        for frame in (train_df, val_df, test_df):
            frame[trait] = (frame[trait] >= thresholds[trait]).astype(int)
        print(f"Median threshold for {trait}: {thresholds[trait]:.2f}")

    print("Class distribution:")
    for trait in TRAITS:
        # Mean of a 0/1 column is the share of positives.
        train_pos = train_df[trait].mean() * 100
        val_pos = val_df[trait].mean() * 100
        test_pos = test_df[trait].mean() * 100
        print(f"{trait}: Train {train_pos:.1f}% positive, Val {val_pos:.1f}% positive, Test {test_pos:.1f}% positive")

    return train_df, val_df, test_df

In [None]:
# Read the three splits and binarize the trait columns against the training
# medians (prints sizes, thresholds and class balance as it goes).
train_df, val_df, test_df = load_pandora_binary_data()

Loading Pandora dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset sizes: Train: 10000, Val: 2000, Test: 2000
Median threshold for O: 38.00
Median threshold for C: 68.00
Median threshold for E: 31.00
Median threshold for A: 33.00
Median threshold for N: 50.00

Class distribution:
O: Train 51.1% positive, Val 51.8% positive, Test 51.6% positive
C: Train 50.3% positive, Val 50.1% positive, Test 51.7% positive
E: Train 50.2% positive, Val 50.7% positive, Test 51.3% positive
A: Train 50.1% positive, Val 51.2% positive, Test 52.0% positive
N: Train 52.3% positive, Val 52.5% positive, Test 53.9% positive


In [None]:
class PersonalityDataset(torch.utils.data.Dataset):
    """Eagerly tokenized multi-label dataset for the trait classifier.

    The whole text column is tokenized once in the constructor (padded to
    `max_len`, truncated if longer), so `__getitem__` is a cheap tensor slice.

    Args:
        df: DataFrame holding the text column and one column per label.
        tok: Tokenizer-like callable; invoked with the list of texts plus
            `padding`, `truncation`, `max_length` and `return_tensors`
            keyword arguments and expected to return a mapping of tensors.
        max_len: Fixed sequence length every example is padded/truncated to.
        label_cols: Label column names, in output order. Defaults to the
            notebook-level `TRAITS` constant.
        text_col: Name of the text column. Defaults to the notebook-level
            `RAW_COL` constant.
    """

    def __init__(self, df, tok, max_len=512, label_cols=None, text_col=None):
        # Fall back to the config-cell constants instead of a lowercase
        # global that only exists once a later cell has been run.
        if label_cols is None:
            label_cols = TRAITS
        if text_col is None:
            text_col = RAW_COL

        self.enc = tok(
            list(df[text_col]),
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        # Float targets, as required by BCE-with-logits style losses.
        self.labels = torch.tensor(
            df[list(label_cols)].to_numpy(dtype=np.float32),
            dtype=torch.float32
        )

    def __len__(self):
        """Number of examples (rows of the label matrix)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row `idx` plus its `labels` vector."""
        item = {k: v[idx] for k, v in self.enc.items()}
        item["labels"] = self.labels[idx]
        return item

In [None]:
class BertBig5(torch.nn.Module):
    """Pretrained encoder with a two-layer multi-label classification head.

    The representation of the first token of `last_hidden_state` is fed
    through Dropout -> Linear -> ReLU -> Dropout -> Linear to produce one
    logit per label. Training uses `BCEWithLogitsLoss` with per-label
    positive weights.

    Args:
        mdl: Model name or path passed to `AutoModel.from_pretrained`.
        pos_w: Per-label positive-class weights (tensor or array-like of
            length `num_labels`), or None for an unweighted loss.
        num_labels: Number of output logits (default 5, one per trait).
        dropout: Dropout probability used twice in the head.
    """

    def __init__(self, mdl, pos_w, num_labels=5, dropout=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(mdl)
        h = self.backbone.config.hidden_size
        self.cls = torch.nn.Sequential(
            torch.nn.Dropout(dropout),
            torch.nn.Linear(h, h),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(h, num_labels)
        )
        if pos_w is not None:
            # Weights computed from numpy/pandas counts arrive as float64;
            # keep them in float32 like the logits and targets.
            pos_w = torch.as_tensor(pos_w, dtype=torch.float32)
        self.crit = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return a `SequenceClassifierOutput` with logits and, if `labels` is given, the loss."""
        # Keyword arguments so the call does not depend on the backbone's
        # positional parameter order.
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        x = encoded.last_hidden_state[:, 0]
        logits = self.cls(x)
        loss = self.crit(logits, labels) if labels is not None else None
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [None]:
def compute_metrics(eval_pred):
    """Multi-label metrics with per-label decision thresholds.

    On the first call, one threshold per label is picked from 17 evenly
    spaced values in [0.1, 0.9] by maximising that label's F1 on the data
    passed in. The thresholds are cached on the function object
    (`compute_metrics.thr`) and reused by every later call, whatever split
    or model state it evaluates. Delete the attribute (or re-run the
    defining cell) to re-tune, e.g. before training a new model.

    Args:
        eval_pred: `(logits, labels)` pair of arrays shaped
            `(n_samples, n_labels)`.

    Returns:
        dict with
          * `f1_macro`, `f1_micro`: F1 over all labels;
          * `accuracy`: exact-match accuracy (a sample counts only when
            every label is right, hence much lower than per-label accuracy);
          * `label_accuracy`: fraction of individual label decisions that
            are correct.
    """
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    labels = np.asarray(labels)
    n_labels = probs.shape[1]

    cached = getattr(compute_metrics, "thr", None)
    # Re-tune when nothing is cached or the cache was built for a different
    # number of labels.
    if cached is None or np.shape(cached) != (n_labels,):
        grid = np.linspace(0.1, 0.9, 17)
        thr = []
        for j in range(n_labels):
            # max over (f1, t) tuples: best F1 wins, ties go to the larger t.
            best = max(
                (skm.f1_score(labels[:, j], probs[:, j] > t, zero_division=0), t)
                for t in grid
            )[1]
            thr.append(best)
        compute_metrics.thr = np.array(thr)

    pred = (probs > compute_metrics.thr).astype(int)
    # zero_division=0 yields the same 0.0 score as the default for labels
    # with no positive predictions, without emitting a warning each time.
    return {"f1_macro": skm.f1_score(labels, pred, average="macro", zero_division=0),
            "f1_micro": skm.f1_score(labels, pred, average="micro", zero_division=0),
            "accuracy":  skm.accuracy_score(labels, pred),
            "label_accuracy": float((pred == labels).mean())}

In [None]:
traits = ["O", "C", "E", "A", "N"]

def enforce_numeric(df, cols=None):
    """Return a copy of `df` whose label columns are plain integers.

    Each selected column is parsed with `pd.to_numeric(errors="coerce")`;
    values that cannot be parsed (and missing values) become 0, and the
    result is cast to `int` (fractional parts are truncated).

    The input frame is left untouched, so re-running the calling cell is
    safe and earlier displays of the frame stay accurate.

    Args:
        df: DataFrame containing the label columns.
        cols: Columns to coerce; defaults to the module-level `traits` list.

    Returns:
        A new DataFrame with the selected columns coerced to int.
    """
    cols = traits if cols is None else list(cols)
    out = df.copy()
    for t in cols:
        out[t] = (
            pd.to_numeric(out[t], errors="coerce")
              .fillna(0)
              .astype(int)
        )
    return out

# Coerce the trait columns of every split to integers (train, val, test in order).
train_df, val_df, test_df = map(enforce_numeric, (train_df, val_df, test_df))

In [None]:
mdl_name   = "bert-base-uncased"
tok        = AutoTokenizer.from_pretrained(mdl_name)
traits     = ["O","C","E","A","N"]

# NOTE(review): the datasets use the class default max_len=512 while the
# config cell defines MAX_LEN = 256 — confirm which length is intended.
train_ds = PersonalityDataset(train_df, tok)
val_ds   = PersonalityDataset(val_df, tok)
test_ds  = PersonalityDataset(test_df, tok)

# Per-trait positive-class weight = (#negatives / #positives) on the training split.
pos_counts = train_df[traits].sum()
neg_counts = len(train_df) - pos_counts
# clip(lower=1) avoids an infinite weight if a trait has no positive example;
# float32 keeps the weights in the same dtype as the logits and targets.
pos_w = torch.tensor((neg_counts / pos_counts.clip(lower=1)).to_numpy(),
                     dtype=torch.float32)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation macro-F1 when training finishes.
args = TrainingArguments(
    output_dir="./big5",
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,   # 16 x 2 = 32 examples per optimizer step per device
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # --- evaluation / checkpointing ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
)

trainer = Trainer(
    model=BertBig5(mdl_name, pos_w),
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune with the arguments configured above (5 epochs, evaluation and a
# checkpoint after each one). The trailing semicolon keeps the call's return
# value out of the cell output.
trainer.train();      # ~40 min on a T4

[34m[1mwandb[0m: Currently logged in as: [33mposchyokotov[0m ([33mposchyokotov-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy
1,0.6794,0.676662,0.678958,0.678946,0.069
2,0.6636,0.672133,0.666522,0.666959,0.0645
3,0.6297,0.681125,0.640565,0.641946,0.061
4,0.5626,0.710965,0.623693,0.624509,0.069


In [None]:
# Score the held-out test split with `compute_metrics`. The per-label
# thresholds cached by its first invocation are reused here rather than being
# re-tuned on the test labels.
print(trainer.evaluate(test_ds))

{'eval_loss': 0.6764523983001709, 'eval_f1_macro': 0.6865878514338156, 'eval_f1_micro': 0.6865651798178555, 'eval_accuracy': 0.0785, 'eval_runtime': 14.2461, 'eval_samples_per_second': 140.389, 'eval_steps_per_second': 17.549, 'epoch': 4.9856}


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# Run the model over the test split and keep its raw predictions; the next
# cell applies a sigmoid to them, i.e. they are treated as logits.
predictions = trainer.predict(test_ds)
preds = predictions.predictions

In [None]:
# One global decision threshold for every trait (unlike the per-label
# thresholds used inside `compute_metrics`).
threshold = 0.45

# Logits -> probabilities -> hard 0/1 predictions.
probs = torch.sigmoid(torch.tensor(preds))
pred_labels = (probs > threshold).int().numpy()

# Ground-truth matrix, one column per trait in O, C, E, A, N order.
true_labels = test_df[['O', 'C', 'E', 'A', 'N']].to_numpy()

# On a 2-D label matrix `accuracy_score` is exact-match accuracy: a row only
# counts when all five traits are predicted correctly.
acc = accuracy_score(true_labels, pred_labels)
f1_macro = f1_score(true_labels, pred_labels, average='macro')
f1_micro = f1_score(true_labels, pred_labels, average='micro')

print(f"Accuracy:   {acc:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")
print(f"F1 (micro): {f1_micro:.4f}")

Accuracy:   0.0885
F1 (macro): 0.6644
F1 (micro): 0.6660


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

traits = ["O", "C", "E", "A", "N"]

# Accuracy of each trait on its own: column j of both label matrices
# corresponds to traits[j].
trait_acc = {}
for j, t in enumerate(traits):
    trait_acc[t] = accuracy_score(true_labels[:, j], pred_labels[:, j])

print("\nPer-trait accuracy")
for t, a in trait_acc.items():
    print(f"  {t}: {a:.3f}")

# average=None returns one precision / recall / F1 value per label column;
# the fourth return value (support) is not needed here.
prec, rec, f1, _ = precision_recall_fscore_support(
    true_labels,
    pred_labels,
    average=None,
)

print("Per-trait precision / recall / F1")
for j, t in enumerate(traits):
    print(f"  {t}: P={prec[j]:.3f}  R={rec[j]:.3f}  F1={f1[j]:.3f}")



Per-trait accuracy
  O: 0.525
  C: 0.526
  E: 0.523
  A: 0.532
  N: 0.568

Per-trait precision / recall / F1
  O: P=0.524  R=0.870  F1=0.654
  C: P=0.523  R=0.940  F1=0.672
  E: P=0.518  R=0.983  F1=0.679
  A: P=0.527  R=0.979  F1=0.685
  N: P=0.584  R=0.687  F1=0.632
