In [13]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from random import SystemRandom

import torch
import evaluate
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

In [14]:
# Locate the project root: the nearest directory (the working directory itself
# or one of its ancestors) whose name ends with the project suffix.
# The search is bounded: walking `.parent` in an open-ended loop never
# terminates once the filesystem root is reached, because the root is its own
# parent and its name is empty.
project_suffix = "xlm-roberta-base-cls-depression"
start_dir = Path().resolve()
current_dir = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if candidate.name.endswith(project_suffix)
    ),
    None,
)
if current_dir is None:
    raise FileNotFoundError(
        f"No directory ending with '{project_suffix}' found at or above {start_dir}"
    )

# Make the project root the working directory for the rest of the notebook.
os.chdir(current_dir)

# Input CSVs and the directory that receives checkpoints and the final model.
input_train_data = current_dir / "data/clean/train.csv"
input_val_data = current_dir / "data/clean/val.csv"
output_model_dir = current_dir / "data/models/xlm-roberta-base-cls-depression"

os.makedirs(output_model_dir, exist_ok=True)

In [15]:
# Tokenizer for the FacebookAI/xlm-roberta-base checkpoint; it is passed to
# CustomDataset to encode the 'text' column of each split.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

class CustomDataset(Dataset):
    """In-memory, pre-tokenized dataset for sequence classification.

    The whole ``text`` column is tokenized once at construction time and kept
    as three aligned tensors (``input_ids``, ``attention_mask``, ``labels``),
    so item access is plain tensor indexing.
    """

    def __init__(self, dataframe, tokenizer):
        """Tokenize ``dataframe['text']`` and store ``dataframe['label']``.

        Args:
            dataframe: Object exposing ``'text'`` and ``'label'`` columns whose
                values support ``.tolist()`` (e.g. a pandas DataFrame).
            tokenizer: Callable tokenizer returning a mapping with
                ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        # Every sequence is padded/truncated to exactly 512 tokens so the
        # encoded batch is a single rectangular tensor.
        encoded = tokenizer(
            dataframe['text'].tolist(),
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']
        self.labels = torch.tensor(dataframe['label'].tolist())

    def __len__(self):
        """Return the number of examples (one label per example)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict with the keys the Trainer expects."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

    def select(self, indices):
        """Create a new dataset containing only the rows at ``indices``.

        Args:
            indices: Anything accepted by tensor indexing along the first
                dimension (e.g. a list of ints).

        Returns:
            A new ``CustomDataset`` whose tensors are the selected rows, in the
            order given by ``indices``. The source dataset is left untouched.
        """
        # Bypass __init__ so the already-encoded tensors are reused instead of
        # re-running the tokenizer.
        new_dataset = CustomDataset.__new__(CustomDataset)
        new_dataset.input_ids = self.input_ids[indices]
        new_dataset.attention_mask = self.attention_mask[indices]
        new_dataset.labels = self.labels[indices]
        return new_dataset

    def shuffle(self, seed=None):
        """Return a new dataset with the rows in random order.

        Args:
            seed: When given, the permutation is produced by a dedicated
                ``random.Random(seed)`` generator, so the same seed always
                yields the same order. When ``None``, the OS entropy source
                (``SystemRandom``) is used and the order is not reproducible.

        Returns:
            A new shuffled ``CustomDataset``; the source dataset is unchanged.
        """
        # Local import: SystemRandom cannot be seeded, so a seeded generator
        # is required to honour ``seed``.
        from random import Random

        indices = list(range(len(self)))
        rng = SystemRandom() if seed is None else Random(seed)
        rng.shuffle(indices)
        return self.select(indices)

In [16]:
# Read the pipe-delimited splits (training first, then validation).
training_df, validation_df = (
    pd.read_csv(split_path, sep='|', encoding='utf-8')
    for split_path in (input_train_data, input_val_data)
)

# Tokenize each split up front into an in-memory dataset.
train_dataset = CustomDataset(training_df, tokenizer)
eval_dataset = CustomDataset(validation_df, tokenizer)

# Reduced subsets: the first 700 / 300 rows of a shuffled copy of each split.
# Note that the Trainer configured later in this notebook is given the full
# datasets, not these subsets.
small_train_dataset = (
    train_dataset
    .shuffle(seed=42)
    .select(range(700))
)
small_eval_dataset = (
    eval_dataset
    .shuffle(seed=42)
    .select(range(300))
)

In [17]:
# XLM-RoBERTa base encoder with a 2-label classification head; the head's
# weights are not part of the checkpoint and are newly initialised on load.
model = AutoModelForSequenceClassification.from_pretrained(
    "FacebookAI/xlm-roberta-base",
    torch_dtype="auto",
    num_labels=2,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class for each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
        (the latter three computed with ``average='binary'``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {
        'accuracy': metric.compute(
            predictions=predicted_classes, references=labels
        )['accuracy'],
    }

    # The remaining metrics share the same call signature, so compute them in
    # a single pass over (name, scorer) pairs.
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, scorer in binary_scorers:
        scores[metric_name] = scorer(labels, predicted_classes, average='binary')

    return scores

training_args = TrainingArguments(
    # Where checkpoints are written (existing contents may be overwritten).
    output_dir=output_model_dir,
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, keep the checkpoint with the
    # highest precision and reload it when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    save_safetensors=True,
    load_best_model_at_end=True,
    metric_for_best_model="precision",
    greater_is_better=True,
    # No external reporting or uploads.
    report_to='none',
    push_to_hub=False,
)

# Stop once 5 consecutive evaluations fail to improve the tracked metric by
# more than 0.001.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=5,
)

# The Trainer is given the full training and validation datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [19]:
# Fine-tune the model. Evaluation runs once per epoch (eval_strategy="epoch"),
# and the early-stopping callback may end the run before num_train_epochs.
trainer.train()

  0%|          | 0/5350 [00:00<?, ?it/s]

{'loss': 0.3042, 'grad_norm': 7.505672931671143, 'learning_rate': 1.8504672897196264e-05, 'epoch': 0.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.0612601600587368, 'eval_accuracy': 0.9821099979554283, 'eval_precision': 0.976094470046083, 'eval_recall': 0.9735708129847744, 'eval_f1': 0.9748310081979001, 'eval_runtime': 12.1476, 'eval_samples_per_second': 805.263, 'eval_steps_per_second': 12.595, 'epoch': 1.0}
{'loss': 0.0695, 'grad_norm': 8.783393859863281, 'learning_rate': 1.8089304257528557e-05, 'epoch': 1.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.058806486427783966, 'eval_accuracy': 0.9831322837865467, 'eval_precision': 0.9626116071428571, 'eval_recall': 0.9910945130709566, 'eval_f1': 0.9766454352441614, 'eval_runtime': 12.2174, 'eval_samples_per_second': 800.665, 'eval_steps_per_second': 12.523, 'epoch': 2.0}
{'loss': 0.047, 'grad_norm': 5.981130599975586, 'learning_rate': 1.6012461059190032e-05, 'epoch': 2.8}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.05304295942187309, 'eval_accuracy': 0.9871191985279084, 'eval_precision': 0.9716052853528254, 'eval_recall': 0.9928181557023844, 'eval_f1': 0.9820971867007673, 'eval_runtime': 12.0536, 'eval_samples_per_second': 811.542, 'eval_steps_per_second': 12.693, 'epoch': 3.0}
{'loss': 0.0286, 'grad_norm': 0.3641500473022461, 'learning_rate': 1.3935617860851506e-05, 'epoch': 3.74}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.045352041721343994, 'eval_accuracy': 0.9880392557759149, 'eval_precision': 0.9838895281933256, 'eval_recall': 0.9824762999138179, 'eval_f1': 0.9831824062095731, 'eval_runtime': 12.6587, 'eval_samples_per_second': 772.75, 'eval_steps_per_second': 12.087, 'epoch': 4.0}
{'loss': 0.0186, 'grad_norm': 0.01878722943365574, 'learning_rate': 1.1862928348909659e-05, 'epoch': 4.67}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.05643901228904724, 'eval_accuracy': 0.9900838274381517, 'eval_precision': 0.9823261117445838, 'eval_recall': 0.9899454179833381, 'eval_f1': 0.9861210473601374, 'eval_runtime': 11.8694, 'eval_samples_per_second': 824.136, 'eval_steps_per_second': 12.89, 'epoch': 5.0}
{'loss': 0.0123, 'grad_norm': 0.038373254239559174, 'learning_rate': 9.786085150571133e-06, 'epoch': 5.61}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.07234998047351837, 'eval_accuracy': 0.9874258842772439, 'eval_precision': 0.9729577464788732, 'eval_recall': 0.9922436081585752, 'eval_f1': 0.9825060446593656, 'eval_runtime': 11.8694, 'eval_samples_per_second': 824.133, 'eval_steps_per_second': 12.89, 'epoch': 6.0}
{'loss': 0.0088, 'grad_norm': 0.007156335283070803, 'learning_rate': 7.709241952232607e-06, 'epoch': 6.54}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.07995575666427612, 'eval_accuracy': 0.9885503986914741, 'eval_precision': 0.9844693701466781, 'eval_recall': 0.9833381212295318, 'eval_f1': 0.9839034205231388, 'eval_runtime': 12.2146, 'eval_samples_per_second': 800.842, 'eval_steps_per_second': 12.526, 'epoch': 7.0}
{'loss': 0.0073, 'grad_norm': 0.001571647939272225, 'learning_rate': 5.632398753894082e-06, 'epoch': 7.48}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.08823443949222565, 'eval_accuracy': 0.9881414843590268, 'eval_precision': 0.9740772048464356, 'eval_recall': 0.993105429474289, 'eval_f1': 0.9834992887624466, 'eval_runtime': 11.2859, 'eval_samples_per_second': 866.746, 'eval_steps_per_second': 13.557, 'epoch': 8.0}
{'loss': 0.005, 'grad_norm': 38.83103942871094, 'learning_rate': 3.555555555555556e-06, 'epoch': 8.41}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 0.07840672135353088, 'eval_accuracy': 0.9889593130239215, 'eval_precision': 0.9781684150836405, 'eval_recall': 0.9910945130709566, 'eval_f1': 0.9845890410958904, 'eval_runtime': 10.8835, 'eval_samples_per_second': 898.794, 'eval_steps_per_second': 14.058, 'epoch': 9.0}
{'train_runtime': 1592.5889, 'train_samples_per_second': 214.971, 'train_steps_per_second': 3.359, 'train_loss': 0.05238378181635776, 'epoch': 9.0}


TrainOutput(global_step=4815, training_loss=0.05238378181635776, metrics={'train_runtime': 1592.5889, 'train_samples_per_second': 214.971, 'train_steps_per_second': 3.359, 'total_flos': 8.107083082174464e+16, 'train_loss': 0.05238378181635776, 'epoch': 9.0})

In [20]:
# Persist the fine-tuned model to the output directory.
trainer.save_model(output_model_dir)
# The Trainer was built without a tokenizer, so save_model() writes no
# tokenizer files. Save the tokenizer alongside the weights so the directory
# can be loaded on its own for inference. The trailing ';' suppresses the
# returned file-path tuple in the cell output.
tokenizer.save_pretrained(output_model_dir);