In [None]:
!pip install transformers datasets sentencepiece nltk

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Relative paths of the three CSV splits inside the Hugging Face dataset repo.
splits = {
    'train': 'Personality Datasets - Reddit/train_set.csv',
    'validation': 'Personality Datasets - Reddit/val_set.csv',
    'test': 'Personality Datasets - Reddit/eval_set.csv',
}

# Only the 'train' split is loaded here; the notebook builds its own
# train/val/test partition later on.
reddit_train_url = "hf://datasets/Fatima0923/Automated-Personality-Prediction/" + splits["train"]
df_pandora = pd.read_csv(reddit_train_url)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Relative paths of the parquet splits of the essays dataset.
# NOTE: this rebinds `splits`, replacing the mapping defined for the Reddit data.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# Only the 'train' split is loaded here.
essays_train_url = "hf://datasets/jingjietan/essays-big5/" + splits["train"]
df_essays = pd.read_parquet(essays_train_url)

In [None]:
# Shorten the trait column names of the Reddit frame to the single-letter
# codes (O, C, E, A, N) that every later cell indexes by. `rename` ignores
# keys that are absent, so running this cell again is harmless.
df_pandora = df_pandora.rename(
    columns={
        'openness': 'O',
        'conscientiousness': 'C',
        'extraversion': 'E',
        'agreeableness': 'A',
        'neuroticism': 'N',
    }
)

# Drop the two essay columns that no later cell reads. errors='ignore' keeps
# the cell idempotent: without it a second run raises KeyError because the
# columns are already gone.
df_essays = df_essays.drop(columns=['__index_level_0__', 'ptype'], errors='ignore')
df_essays.head()

Unnamed: 0,O,C,E,A,N,text
0,1,0,0,1,1,it is wednesday. I can't wait until friday bec...
1,1,1,1,0,1,"wow, I want to go talk to the socialist organi..."
2,1,0,1,1,0,"I wish polygamy was still legal. Well, not pol..."
3,1,0,1,0,0,"Well, lets see . . . I guess the foremost thin..."
4,0,1,0,1,1,College? I wonder how it will be? I just ...


In [None]:
# Cut-off for turning the Reddit trait scores into binary labels:
# strictly greater than the cut-off -> 1, otherwise (including NaN) -> 0.
# NOTE(review): 50 presumably assumes scores on a 0-100 scale -- confirm
# against the dataset card.
threshold = 50
label_columns = ['O', 'C', 'E', 'A', 'N']

# Work on copies instead of overwriting df_pandora / df_essays. Binarising in
# place made the cell non-idempotent: on a second run the already-binary 0/1
# values were compared against 50 again and every Reddit label became 0.
pandora_labeled = df_pandora[['text'] + label_columns].copy()
essays_labeled = df_essays[['text'] + label_columns].copy()

for trait in label_columns:
    pandora_labeled[trait] = (pandora_labeled[trait].astype(float) > threshold).astype(int)
    # The essay labels are cast to plain integers so both sources share one dtype.
    essays_labeled[trait] = essays_labeled[trait].astype(int)

# Stack both sources into a single frame with a fresh 0..n-1 index.
df_combined = pd.concat([pandora_labeled, essays_labeled]).reset_index(drop=True)



In [None]:
import numpy as np

def _as_int(value):
    """Return `value` untouched when it is a Python int, otherwise `int(value)`."""
    return value if isinstance(value, int) else int(value)


# For every trait: coerce the column to integers if it contains anything other
# than 0/1, then print the relative frequency of each label.
for column in ("O", "C", "E", "A", "N"):
    observed = df_combined[column].unique()
    only_binary = np.isin(observed, [0, 1]).all()

    if not only_binary:
        df_combined[column] = df_combined[column].apply(_as_int)

    print(df_combined[column].value_counts(normalize=True))
    print("-" * 30)

=== Checking label distribution ===
O unique labels: [1 0]
O
1    0.714326
0    0.285674
Name: proportion, dtype: float64
------------------------------
C unique labels: [0 1]
C
0    0.779745
1    0.220255
Name: proportion, dtype: float64
------------------------------
E unique labels: [0 1]
E
0    0.667801
1    0.332199
Name: proportion, dtype: float64
------------------------------
A unique labels: [0 1]
A
0    0.703262
1    0.296738
Name: proportion, dtype: float64
------------------------------
N unique labels: [1 0]
N
0    0.542695
1    0.457305
Name: proportion, dtype: float64
------------------------------


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage random split with a fixed seed: hold out 20% of all rows as the
# test set, then hold out 20% of the remainder as the validation set.
train_val_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

In [None]:
def chunk_text(text, tokenizer, max_length=512):
    """Group the sentences of `text` into chunks that fit a token budget.

    Sentences come from NLTK's `sent_tokenize` and are packed greedily, in
    order: a sentence joins the current chunk while the running token count
    stays within `max_length - 2`; otherwise the current chunk is closed and
    the sentence opens a new one. Sentences are never split, so a single
    sentence longer than the budget becomes its own chunk and can still exceed
    the budget (the caller is expected to truncate when encoding).

    Args:
        text: Raw document. Anything that is not a non-blank string (None,
            NaN, "") yields an empty list instead of raising.
        tokenizer: Object exposing `tokenize(str)`; used only to count tokens.
        max_length: Sequence-length limit the chunks are sized for.

    Returns:
        list[str]: Non-empty chunk strings; sentences within a chunk are
        joined with a single space.
    """
    import nltk

    if not isinstance(text, str) or not text.strip():
        return []

    try:
        sentences = nltk.sent_tokenize(text)
    except LookupError:
        # The sentence-splitter data is not installed. Fetch it once and retry,
        # rather than calling nltk.download() on every invocation (this
        # function runs once per dataset item).
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        sentences = nltk.sent_tokenize(text)

    # Two positions are held back -- presumably for the special tokens the
    # tokenizer adds around each encoded chunk.
    budget = max_length - 2

    chunks = []
    current_chunk = []
    current_length = 0

    for sent in sentences:
        n_tokens = len(tokenizer.tokenize(sent))
        if n_tokens == 0:
            continue
        if current_length + n_tokens <= budget:
            current_chunk.append(sent)
            current_length += n_tokens
        else:
            # Close the running chunk only if it holds something: when the very
            # first sentence is over budget there is nothing to flush, and
            # emitting "" would produce an all-padding chunk downstream.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sent]
            current_length = n_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
class PersonalityDataset(Dataset):
    """Dataset that serves one document as tokenized chunks plus five trait labels.

    Each item is a dict with:
        'tokenized_chunks': list of tokenizer outputs, one per text chunk
            produced by `chunk_text` (may be empty),
        'labels': float tensor holding the O, C, E, A, N values of the row.

    Chunking and tokenization happen on every access; nothing is cached.
    """

    # Label columns, in the order they appear in the label tensor.
    _LABEL_COLUMNS = ('O', 'C', 'E', 'A', 'N')

    def __init__(self, df, tokenizer, max_length=512):
        """Store the tokenizer settings and a copy of `df` with its index reset.

        Args:
            df: Frame with a 'text' column and the columns O, C, E, A, N.
            tokenizer: Callable tokenizer, also handed to `chunk_text`.
            max_length: Length every chunk is padded / truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Number of rows (documents) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the chunked, tokenized text and the label tensor of row `idx`."""
        record = self.df.iloc[idx]

        label_values = [record[name] for name in self._LABEL_COLUMNS]
        labels = torch.tensor(label_values, dtype=torch.float)

        # Split the document first, then encode every piece to a fixed length.
        pieces = chunk_text(record['text'], self.tokenizer, self.max_length)
        tokenized_chunks = [
            self.tokenizer(
                piece,
                max_length=self.max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            for piece in pieces
        ]

        return {'tokenized_chunks': tokenized_chunks, 'labels': labels}

In [None]:
traits = ["O", "C", "E", "A", "N"]

# Per-trait ratio of negative to positive examples in the training split.
# A trait without any positive example gets a neutral weight of 1.0 instead
# of a division by zero.
n_train_rows = len(train_df)
positive_counts = [train_df[name].sum() for name in traits]
pos_weights = [
    (n_train_rows - n_pos) / n_pos if n_pos != 0 else 1.0
    for n_pos in positive_counts
]

pos_weights_tensor = torch.tensor(pos_weights, dtype=torch.float)
print("pos_weights:", pos_weights_tensor)

pos_weights: tensor([0.4098, 3.4940, 1.9833, 2.3772, 1.1498])


In [None]:
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BertForBig5Classification(nn.Module):
    """Pretrained encoder with five independent binary heads (O, C, E, A, N).

    A document arrives as a list of tokenized chunks. Every chunk is encoded
    separately, the first-token embedding of each chunk is taken, the chunk
    embeddings are pooled (mean or element-wise max) into one document vector,
    and five linear heads turn that vector into one logit per trait.
    """

    def __init__(self, model_name="distilbert-base-uncased", num_labels=5, pos_weight=None, pooling='mean'):
        """Load the encoder and create the heads and the loss.

        Args:
            model_name: Checkpoint name passed to `AutoModel.from_pretrained`.
            num_labels: Accepted for interface compatibility only; the model
                always builds exactly five heads.
            pos_weight: Optional tensor forwarded to `BCEWithLogitsLoss`.
            pooling: 'mean' averages chunk embeddings; any other value selects
                the element-wise maximum.

        Raises:
            ValueError: If the encoder config exposes neither `hidden_size`
                nor `dim`.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        config = self.encoder.config

        # Configs name the embedding width differently, so probe both spellings.
        if hasattr(config, "hidden_size"):
            self.hidden_size = config.hidden_size
        elif hasattr(config, "dim"):
            self.hidden_size = config.dim
        else:
            raise ValueError("Cannot find hidden size in config.")

        # Same for the dropout rate; fall back to 0.1 when neither name exists.
        if hasattr(config, "hidden_dropout_prob"):
            dropout_prob = config.hidden_dropout_prob
        elif hasattr(config, "dropout"):
            dropout_prob = config.dropout
        else:
            dropout_prob = 0.1

        self.dropout = nn.Dropout(dropout_prob)

        # One single-logit head per trait. The attribute names are kept as-is
        # because they are part of the state_dict keys of saved checkpoints.
        self.o_head = nn.Linear(self.hidden_size, 1)
        self.c_head = nn.Linear(self.hidden_size, 1)
        self.e_head = nn.Linear(self.hidden_size, 1)
        self.a_head = nn.Linear(self.hidden_size, 1)
        self.n_head = nn.Linear(self.hidden_size, 1)

        if pos_weight is not None:
            self.loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()

        self.pooling = pooling

    def forward(self, tokenized_chunks, labels=None):
        """Compute trait logits (and the loss when labels are given).

        Args:
            tokenized_chunks: One entry per document; each entry is a list of
                mappings with 'input_ids', 'attention_mask' and optionally
                'token_type_ids'. The code squeezes the leading dimension of
                each chunk's embedding, i.e. it expects one sequence per chunk.
            labels: Optional tensor with one row of five targets per document.
                Any numeric dtype is accepted; it is cast to the logits' dtype.

        Returns:
            SequenceClassifierOutput with `logits` (one row of five values per
            document) and `loss` (None when `labels` is None).
        """
        device = next(self.parameters()).device
        heads = (self.o_head, self.c_head, self.e_head, self.a_head, self.n_head)

        all_logits = []
        for sample_chunks in tokenized_chunks:
            chunk_embs = []

            for chunk_dict in sample_chunks:
                inputs = {
                    "input_ids": chunk_dict["input_ids"].to(device),
                    "attention_mask": chunk_dict["attention_mask"].to(device),
                }
                if "token_type_ids" in chunk_dict and chunk_dict["token_type_ids"] is not None:
                    inputs["token_type_ids"] = chunk_dict["token_type_ids"].to(device)

                outputs = self.encoder(**inputs)
                cls_emb = outputs.last_hidden_state[:, 0, :]  # (1, hidden_size)
                chunk_embs.append(cls_emb.squeeze(0))         # (hidden_size,)

            if len(chunk_embs) == 0:
                # A document without chunks is represented by a zero vector in
                # the heads' dtype, so the linear layers accept it even when
                # the model is not running in float32.
                chunk_embs = [
                    torch.zeros(self.hidden_size, device=device, dtype=self.o_head.weight.dtype)
                ]

            chunk_embs_tensor = torch.stack(chunk_embs, dim=0)  # (num_chunks, hidden_size)
            if self.pooling == 'mean':
                pooled_emb = chunk_embs_tensor.mean(dim=0)
            else:
                pooled_emb = chunk_embs_tensor.max(dim=0)[0]

            pooled_emb = self.dropout(pooled_emb)

            # Concatenate the five single logits in O, C, E, A, N order.
            logits = torch.cat([head(pooled_emb) for head in heads], dim=-1)
            all_logits.append(logits)

        all_logits = torch.stack(all_logits, dim=0)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss requires floating-point targets; casting here
            # keeps integer label tensors from raising a dtype error.
            labels = labels.to(device=device, dtype=all_logits.dtype)
            loss = self.loss_fct(all_logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=all_logits
        )

In [None]:
def collate_fn(batch):
    """Merge dataset items into a batch while keeping chunk lists ragged.

    Args:
        batch: Sequence of dicts with the keys 'tokenized_chunks' and 'labels'.

    Returns:
        dict with
            'tokenized_chunks': list holding each item's chunk list unchanged,
            'labels': the items' label tensors stacked along a new first axis.
    """
    chunk_lists = []
    label_rows = []
    for item in batch:
        chunk_lists.append(item['tokenized_chunks'])
        label_rows.append(item['labels'])

    return {
        'tokenized_chunks': chunk_lists,
        'labels': torch.stack(label_rows, dim=0),
    }

In [None]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap each split in the chunking dataset; all three share the tokenizer and
# the 512-token chunk length.
train_dataset, val_dataset, test_dataset = [
    PersonalityDataset(split_df, tokenizer, max_length=512)
    for split_df in (train_df, val_df, test_df)
]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Build the classifier with the per-trait class-imbalance weights that were
# computed from the training split (`pos_weights_tensor`). They were previously
# calculated but never handed to the model (pos_weight=None), so the loss
# ignored the label imbalance.
model = BertForBig5Classification(
    model_name=model_name,
    num_labels=5,
    pos_weight=pos_weights_tensor,
    pooling='mean'
)

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when
# training finishes (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir="./big5-bert-chunking",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    warmup_ratio=0.1
)

# `collate_fn` keeps the per-document chunk lists ragged, which is the input
# format the model's forward() iterates over.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
import nltk
# Fetch the 'punkt_tab' NLTK resource before training starts.
# NOTE(review): presumably required by `nltk.sent_tokenize` (used in
# `chunk_text`) on the installed NLTK version, in addition to the 'punkt'
# package downloaded in the setup cell -- confirm and consider moving this
# call into the setup cell.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Fine-tune `model` using the settings in `training_args`; the returned
# training summary is displayed as the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mposchyokotov[0m ([33mposchyokotov-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.7527,0.772914
2,0.7098,0.768906


Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors


TrainOutput(global_step=2820, training_loss=0.7532917034541462, metrics={'train_runtime': 768.6462, 'train_samples_per_second': 29.35, 'train_steps_per_second': 3.669, 'total_flos': 0.0, 'train_loss': 0.7532917034541462, 'epoch': 2.0})

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# Run the trained model over the test split and keep its raw outputs; later
# cells pass `preds` through a sigmoid, i.e. they are treated as logits.
# NOTE: needs `trainer` from the training cell in the same kernel session --
# the NameError recorded below this cell came from running it without one.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions

NameError: name 'trainer' is not defined

In [None]:
# Convert the raw model outputs into independent per-trait probabilities.
probs = torch.tensor(preds).sigmoid()

# A single decision cut-off shared by all five traits.
threshold = 0.45
pred_labels = (probs > threshold).to(torch.int).numpy()

# Ground-truth matrix with one column per trait, in O, C, E, A, N order.
true_labels = np.stack(
    [test_df[name] for name in ('O', 'C', 'E', 'A', 'N')],
    axis=1,
)

# With a 2-D label matrix, accuracy_score counts a row as correct only when
# all five traits match (subset accuracy), hence the low value compared to F1.
acc = accuracy_score(true_labels, pred_labels)
f1_macro = f1_score(true_labels, pred_labels, average='macro')
f1_micro = f1_score(true_labels, pred_labels, average='micro')

print(f"Accuracy:   {acc:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")
print(f"F1 (micro): {f1_micro:.4f}")

true_labels shape: (3525, 5) dtype: int64
Unique in true_labels: [0 1]
Accuracy:   0.2009
F1 (macro): 0.5170
F1 (micro): 0.6175


In [None]:
# Exploratory sweep: for every trait, try 101 evenly spaced cut-offs in [0, 1]
# and report the first one that reaches the highest F1.
# NOTE(review): `probs` and `true_labels` come from the test split, so the
# F1 values printed here are optimistic.
thresholds = np.linspace(0.0, 1.0, 101)

for col, trait in enumerate(['O', 'C', 'E', 'A', 'N']):
    # Defaults that are reported when no cut-off achieves an F1 above zero.
    best_thr, best_f1 = 0.5, 0
    for candidate in thresholds:
        candidate_preds = (probs[:, col] > candidate).int()
        candidate_f1 = f1_score(true_labels[:, col], candidate_preds)
        if candidate_f1 > best_f1:
            best_thr, best_f1 = candidate, candidate_f1
    print(f"Best threshold for {trait} = {best_thr}, F1={best_f1:.4f}")


Best threshold for O = 0.49, F1=0.8441
Best threshold for C = 0.19, F1=0.4377
Best threshold for E = 0.31, F1=0.5178
Best threshold for A = 0.17, F1=0.4837
Best threshold for N = 0.32, F1=0.6326


In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

traits = ['O','C','E','A','N']


def find_best_threshold(y_true, scores, grid):
    """Return `(threshold, f1)` for the first grid value with the highest F1.

    Args:
        y_true: 1-D array of 0/1 labels.
        scores: 1-D numpy array of probabilities, same length as `y_true`.
        grid: Iterable of candidate thresholds; a sample is predicted positive
            when its score is strictly greater than the threshold.

    Returns:
        The best threshold and its F1. Falls back to `(0.5, 0.0)` when no
        candidate reaches an F1 above zero.
    """
    best_thr, best_f1 = 0.5, 0.0
    for thr in grid:
        # zero_division=0 scores "no positives predicted" as 0 without a warning.
        f1_trait = f1_score(y_true, (scores > thr).astype(int), zero_division=0)
        if f1_trait > best_f1:
            best_f1, best_thr = f1_trait, thr
    return best_thr, best_f1


# Test-split probabilities and ground truth (columns in O, C, E, A, N order).
probs = torch.sigmoid(torch.tensor(preds))
test_probs = probs.numpy()
true_labels = np.stack([test_df[name] for name in traits], axis=1)

# Thresholds are selected on the VALIDATION split. Selecting them on the test
# split and then reporting metrics on that same split leaks test information
# and inflates the final scores.
val_logits = trainer.predict(val_dataset).predictions
val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
val_labels = np.stack([val_df[name] for name in traits], axis=1)

thresholds = np.linspace(0.0, 1.0, 101)
best_thresholds = []

print("Thresholds tuned on the validation split:")
for i, trait in enumerate(traits):
    best_thr, best_f1 = find_best_threshold(val_labels[:, i], val_probs[:, i], thresholds)
    best_thresholds.append(best_thr)
    print(f"Best threshold for {trait} = {best_thr:.3f}, F1={best_f1:.4f}")

# Apply each trait's threshold to its own column. Comparing numpy arrays (not
# torch tensors) avoids the warning raised when a tensor is assigned into a
# numpy array.
pred_labels = (test_probs > np.asarray(best_thresholds)).astype(true_labels.dtype)

# accuracy_score on a 2-D label matrix is subset accuracy: a row counts only
# when all five traits are predicted correctly.
acc = accuracy_score(true_labels, pred_labels)
f1_macro = f1_score(true_labels, pred_labels, average='macro')
f1_micro = f1_score(true_labels, pred_labels, average='micro')

print(f"Accuracy:   {acc:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")
print(f"F1 (micro): {f1_micro:.4f}")


true_labels shape: (3525, 5) dtype: int64
Unique in true_labels: [0 1]
Best threshold for O = 0.460, F1=0.8433
Best threshold for C = 0.320, F1=0.3949
Best threshold for E = 0.290, F1=0.4989
Best threshold for A = 0.260, F1=0.4641
Best threshold for N = 0.260, F1=0.6218

Final Results using trait-specific thresholds:
Accuracy:   0.0372
F1 (macro): 0.5646
F1 (micro): 0.5975


  pred_labels[:, i] = (probs[:, i] > thr).int()
