In [4]:
pip install -q transformers datasets sentencepiece accelerate bitsandbytes peft scikit-learn

[0m

In [5]:
import pandas as pd, torch, numpy as np
from transformers import (AutoTokenizer, AutoModel,
                          TrainingArguments, Trainer,
                          DataCollatorWithPadding)
from transformers.modeling_outputs import SequenceClassifierOutput
import sklearn.metrics as skm

In [6]:
# Relative paths of the training files inside each Hugging Face dataset repo.
spl_pand = {'train': 'Personality Datasets - Reddit/train_set.csv'}
spl_ess  = {'train': 'data/train-00000-of-00001.parquet'}

# Column names shared by both sources once the trait names are shortened.
ocean_cols = ['O', 'C', 'E', 'A', 'N']

# Reddit-based set: read the CSV, shorten the trait column names to their
# initials, then turn every trait score into a 0/1 label (1 when score > 50).
df_pand = pd.read_csv(
    "hf://datasets/Fatima0923/Automated-Personality-Prediction/" + spl_pand["train"]
)
df_pand = df_pand.rename(columns={
    'openness': 'O',
    'conscientiousness': 'C',
    'extraversion': 'E',
    'agreeableness': 'A',
    'neuroticism': 'N',
})
df_pand = df_pand.assign(**{t: (df_pand[t] > 50).astype(int) for t in ocean_cols})

# Essays set: read the parquet file and discard the two columns not used here.
df_ess = pd.read_parquet(
    "hf://datasets/jingjietan/essays-big5/" + spl_ess["train"]
).drop(columns=['__index_level_0__', 'ptype'])

# Stack both sources on the text + trait columns with a fresh 0..n-1 index.
df_all = pd.concat(
    [frame[['text'] + ocean_cols] for frame in (df_pand, df_ess)],
    ignore_index=True,
)

from sklearn.model_selection import train_test_split

# Hold out 20% for testing, then 20% of the remainder for validation. Both
# splits use the same seed and are stratified on the five trait columns.
train_df, test_df = train_test_split(
    df_all, test_size=0.2, random_state=42, stratify=df_all[ocean_cols]
)
train_df, val_df = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df[ocean_cols]
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
class PersonalityDataset(torch.utils.data.Dataset):
    """Eagerly tokenised texts paired with float multi-label targets.

    All texts are tokenised once in the constructor and padded/truncated to
    ``max_len`` tokens, so every item has the same sequence length.

    Args:
        df: DataFrame with a ``"text"`` column and one column per label.
        tok: Callable tokenizer; it is invoked with the list of texts and the
            keyword arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` and must return a mapping of tensors whose
            first dimension indexes the samples.
        max_len: Sequence length every text is padded or truncated to.
        label_cols: Names of the label columns, in output order. Defaults to
            the five trait initials so the class no longer depends on a
            module-level ``traits`` variable being defined before use.
    """

    def __init__(self, df, tok, max_len=512, label_cols=("O", "C", "E", "A", "N")):
        self.enc = tok(
            list(df["text"]),
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        # Float targets are what BCE-style losses expect.
        self.labels = torch.tensor(
            df[list(label_cols)].to_numpy(dtype=np.float32),
            dtype=torch.float32,
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its ``"labels"`` row."""
        item = {k: v[idx] for k, v in self.enc.items()}
        item["labels"] = self.labels[idx]
        return item

In [7]:
class BertBig5(torch.nn.Module):
    """Pretrained encoder with a two-layer head producing five logits.

    The representation of the first token of the encoder's last hidden state
    is fed through Dropout -> Linear -> ReLU -> Dropout -> Linear(5). When
    labels are supplied, a ``BCEWithLogitsLoss`` weighted by ``pos_w`` is
    returned alongside the logits.

    Args:
        mdl: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
        pos_w: Per-output positive-class weights for the loss, or ``None``
            for an unweighted loss.
    """

    def __init__(self, mdl, pos_w):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(mdl)
        h = self.backbone.config.hidden_size
        self.cls = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(h, h),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(h, 5)
        )
        # Normalise the weights to float32 so a float64 input (e.g. one built
        # from NumPy values) cannot promote the loss to double precision.
        if pos_w is not None:
            pos_w = torch.as_tensor(pos_w, dtype=torch.float32)
        self.crit = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return a ``SequenceClassifierOutput`` with logits and optional loss.

        ``loss`` is ``None`` when ``labels`` is not given.
        """
        # Keyword arguments: the second positional parameter of an AutoModel
        # backbone is not guaranteed to be the attention mask.
        hidden = self.backbone(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        x = hidden[:, 0]  # first-token representation
        logits = self.cls(x)
        # BCE needs floating-point targets; .float() is a no-op for float32.
        loss = self.crit(logits, labels.float()) if labels is not None else None
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [8]:
def compute_metrics(eval_pred):
    """Turn (logits, labels) into F1 and accuracy scores for five outputs.

    Logits are passed through a sigmoid. On the FIRST call only, a decision
    threshold is picked for each of the five outputs by scanning 17 evenly
    spaced candidates in [0.1, 0.9] and keeping the one with the best F1
    (ties go to the larger threshold). The thresholds are stored on the
    function object as ``compute_metrics.thr`` and reused unchanged by every
    later call, whatever data that call receives.

    Args:
        eval_pred: Pair ``(logits, labels)`` of 2-D arrays with five columns.

    Returns:
        dict with keys ``"f1_macro"``, ``"f1_micro"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    probs = torch.tensor(logits).sigmoid().numpy()

    if not hasattr(compute_metrics, "thr"):
        grid = np.linspace(0.1, 0.9, 17)
        chosen = []
        for col in range(5):
            # (score, threshold) pairs; max() compares score first, then threshold.
            scored = [
                (skm.f1_score(labels[:, col], (probs[:, col] > cut)), cut)
                for cut in grid
            ]
            chosen.append(max(scored)[1])
        compute_metrics.thr = np.array(chosen)

    pred = (probs > compute_metrics.thr).astype(int)

    results = {}
    for avg in ("macro", "micro"):
        results["f1_" + avg] = skm.f1_score(labels, pred, average=avg)
    results["accuracy"] = skm.accuracy_score(labels, pred)
    return results

In [14]:
# Big Five trait initials (O, C, E, A, N), in the column order used for labels.
traits = list("OCEAN")

def enforce_numeric(df, cols=("O", "C", "E", "A", "N")):
    """Return a copy of ``df`` whose label columns are plain integers.

    Each column in ``cols`` is parsed with ``pd.to_numeric``; values that
    cannot be parsed, and missing values, become 0 before the cast to int.
    Note that this silently maps unknown labels to the negative class.

    The input frame is left untouched: the work happens on a copy, which
    avoids mutating the caller's data and chained-assignment warnings when
    ``df`` is itself a slice of another frame.

    Args:
        df: DataFrame containing every column named in ``cols``.
        cols: Label columns to convert. Defaults to the five trait initials.

    Returns:
        A new DataFrame with the converted columns.
    """
    out = df.copy()
    for t in cols:
        out[t] = (
            pd.to_numeric(out[t], errors="coerce")
              .fillna(0)
              .astype(int)
        )
    return out

# Coerce the trait columns of all three splits to integers in one pass.
train_df, val_df, test_df = map(enforce_numeric, (train_df, val_df, test_df))

In [15]:
# Backbone checkpoint, its tokenizer, and the label column order.
mdl_name = "bert-base-uncased"
tok = AutoTokenizer.from_pretrained(mdl_name)
traits = ["O", "C", "E", "A", "N"]

# Tokenise each split once, up front.
train_ds, val_ds, test_ds = (
    PersonalityDataset(split, tok) for split in (train_df, val_df, test_df)
)

# Per-trait ratio of negative to positive training examples, used as the
# positive-class weight of the loss.
n_train = len(train_df)
pos_w = torch.tensor(
    [(n_train - train_df[t].sum()) / train_df[t].sum() for t in traits]
)

In [17]:
# Training configuration: evaluate and checkpoint after every epoch and, at
# the end, reload the checkpoint with the best validation macro-F1.
args = TrainingArguments(
    output_dir="./big5",
    # evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    # 16 samples per device step, gradients accumulated over 2 steps
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    logging_steps=50,
)

# Train on the training split and validate on the validation split.
trainer = Trainer(
    model=BertBig5(mdl_name, pos_w),
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [18]:
# Fine-tune the model with the configuration above. The trailing semicolon
# keeps the value returned by train() out of the cell output.
trainer.train();      # ~40 min on a T4

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mposchyokotov[0m ([33mposchyokotov-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy
1,0.7985,0.794822,0.571765,0.6142,0.043972
2,0.762,0.782641,0.573908,0.607728,0.058156
3,0.6947,0.786406,0.586807,0.626796,0.102128
4,0.6022,0.836316,0.584651,0.629509,0.123404


In [19]:
# Score the held-out test split with the trainer's compute_metrics and print
# the resulting metrics dict.
test_metrics = trainer.evaluate(test_ds)
print(test_metrics)

{'eval_loss': 0.7836877107620239, 'eval_f1_macro': 0.5882148560839964, 'eval_f1_micro': 0.6285685361409192, 'eval_accuracy': 0.1024113475177305, 'eval_runtime': 27.6836, 'eval_samples_per_second': 127.332, 'eval_steps_per_second': 15.93, 'epoch': 4.987234042553191}


In [23]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# Run inference on the test split. `predictions.predictions` is passed through
# a sigmoid in the following cell, i.e. it is treated as raw logits.
predictions = trainer.predict(test_ds)
preds = predictions.predictions

Raw predictions shape: (3525, 5)
Final preds shape: (3525, 5)


In [31]:
# Re-score the test predictions with one fixed cut-off for every trait
# (instead of the per-trait thresholds stored by compute_metrics).
threshold = 0.3
probs = torch.sigmoid(torch.tensor(preds))
pred_labels = (probs > threshold).int().numpy()

# Ground-truth matrix with one column per trait, in O, C, E, A, N order.
true_labels = np.stack(
    [test_df[col] for col in ['O', 'C', 'E', 'A', 'N']],
    axis=1
)

# On 2-D multilabel input, accuracy_score counts a row as correct only when
# all five labels match, which is why it is far lower than the F1 values.
acc = accuracy_score(true_labels, pred_labels)
f1_macro, f1_micro = (
    f1_score(true_labels, pred_labels, average=mode) for mode in ('macro', 'micro')
)

print(f"Accuracy:   {acc:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")
print(f"F1 (micro): {f1_micro:.4f}")

true_labels shape: (3525, 5) dtype: int64
Unique in true_labels: [0 1]
Accuracy:   0.0604
F1 (macro): 0.5845
F1 (micro): 0.6133
