In [9]:
import numpy as np
import torch
import pandas as pd
from datasets import ClassLabel, Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# --- Run configuration -------------------------------------------------------
model_version = 'xlm-roberta-base'   # checkpoint name passed to from_pretrained
num_classes = 2                      # size of the classification head
test_size = 0.2                      # fraction of rows held out by the split
set_class_weights = True             # toggle for the class-weighted loss
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Data ---------------------------------------------------------------------
data = pd.read_csv('data_sample/train.csv')

# Stratified hold-out split on the 'label' column with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    stratify=data['label'],
    test_size=test_size,
    random_state=42,
)

# Re-assemble each split into a two-column ('text', 'label') frame and wrap it
# as a datasets.Dataset.
train = Dataset.from_pandas(pd.concat([X_train, y_train], axis=1))
test = Dataset.from_pandas(pd.concat([X_test, y_test], axis=1))

# Load model and tokenizer

In [11]:
if set_class_weights:
    # 'balanced' weights are computed from the training labels only, then moved
    # to the training device as float32 so they can be handed to the loss.
    class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
else:
    # None makes the custom loss fall back to unweighted cross-entropy.
    class_weights = None

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_version)

# The classification head is newly (randomly) initialised at load time, and the
# Trainer only seeds the RNG when it is constructed later. Seed torch right
# here so the head starts from the same weights on every run of this cell.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_version, num_labels=num_classes).to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenize dataset

In [12]:
def tokenize_batch(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens. The
    tokenizer output is returned unchanged.
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Tokenize each split in one pass: batch_size is set to the split's full length.
train_tokenized = train.map(tokenize_batch, batch_size=len(train), batched=True)
test_tokenized = test.map(tokenize_batch, batch_size=len(test), batched=True)

# Switch both splits to the 'torch' format, restricted to the listed columns.
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Train the model

In [14]:
def compute_metrics(pred):
    """Compute evaluation metrics from a prediction object.

    Reads ``pred.label_ids`` as the reference labels and takes the argmax over
    the last axis of ``pred.predictions`` as the predicted labels.

    Returns:
        dict with ``"f1"`` (weighted-average F1) and ``"accuracy"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "f1": f1_score(y_true, y_pred, average='weighted'),
        "accuracy": accuracy_score(y_true, y_pred),
    }


training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    gradient_accumulation_steps=2,   # accumulate 2 batches per optimizer step
    learning_rate=5e-5,
    weight_decay=0.01,               # strength of weight decay
    optim='adamw_torch',
    # Mixed precision is only valid on a GPU; the notebook falls back to CPU
    # when CUDA is missing, so enable fp16 only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',            # directory for storing logs
    # Evaluate every 10 optimizer steps and keep the checkpoint with the best F1.
    evaluation_strategy='steps',
    eval_steps=10,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    # NOTE(review): checkpoints are written every 50 steps only, so presumably
    # only those steps are candidates for "best model" -- confirm this is intended.
    save_strategy='steps',
    save_steps=50,
    save_total_limit=1
)


class CustomTrainer(Trainer):
    """Trainer that replaces the model's built-in loss with class-weighted cross-entropy.

    The weights are read from the notebook-level ``class_weights`` variable;
    when it is ``None`` the loss is plain, unweighted cross-entropy.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass on.
            inputs: Mapping of batch tensors; must contain ``"labels"`` alongside
                the model's forward arguments. It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so the override stays callable by
                Trainer versions that pass this keyword; it is not used, the
                loss is always the mean over the batch.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Forward pass without the labels: otherwise the model would also compute
        # its own unweighted loss, which is discarded here anyway.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        # Keep the weight tensor on the same device as the logits, wherever the
        # Trainer has placed the model.
        weight = class_weights.to(logits.device) if class_weights is not None else None
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Wire the model, arguments, tokenized splits and metric function into the
# class-weighted trainer. The held-out split doubles as the evaluation set.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)


In [15]:
# Run fine-tuning; the returned TrainOutput is shown as the cell's result.
trainer.train()

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6933073401451111, 'eval_f1': 0.5184592145015106, 'eval_accuracy': 0.655, 'eval_runtime': 0.9216, 'eval_samples_per_second': 217.02, 'eval_steps_per_second': 14.106, 'epoch': 0.4}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6911154389381409, 'eval_f1': 0.5184592145015106, 'eval_accuracy': 0.655, 'eval_runtime': 0.9245, 'eval_samples_per_second': 216.335, 'eval_steps_per_second': 14.062, 'epoch': 0.8}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6889051795005798, 'eval_f1': 0.3537420232241865, 'eval_accuracy': 0.42, 'eval_runtime': 0.9208, 'eval_samples_per_second': 217.191, 'eval_steps_per_second': 14.117, 'epoch': 1.2}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6089941263198853, 'eval_f1': 0.6778547451937748, 'eval_accuracy': 0.67, 'eval_runtime': 0.9274, 'eval_samples_per_second': 215.666, 'eval_steps_per_second': 14.018, 'epoch': 1.6}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6106681227684021, 'eval_f1': 0.7256923076923077, 'eval_accuracy': 0.73, 'eval_runtime': 0.9255, 'eval_samples_per_second': 216.093, 'eval_steps_per_second': 14.046, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.5805345177650452, 'eval_f1': 0.6776339285714286, 'eval_accuracy': 0.67, 'eval_runtime': 0.9215, 'eval_samples_per_second': 217.029, 'eval_steps_per_second': 14.107, 'epoch': 2.4}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6060137152671814, 'eval_f1': 0.7209230769230769, 'eval_accuracy': 0.72, 'eval_runtime': 0.9202, 'eval_samples_per_second': 217.334, 'eval_steps_per_second': 14.127, 'epoch': 2.8}
{'train_runtime': 44.6605, 'train_samples_per_second': 53.739, 'train_steps_per_second': 1.679, 'train_loss': 0.6070699055989583, 'epoch': 3.0}


TrainOutput(global_step=75, training_loss=0.6070699055989583, metrics={'train_runtime': 44.6605, 'train_samples_per_second': 53.739, 'train_steps_per_second': 1.679, 'train_loss': 0.6070699055989583, 'epoch': 3.0})