In [None]:
import sys
import os

# Make the directory above this notebook importable so that the `toolbox`
# package can be found by later cells.
module_path = os.path.abspath('..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

In [1]:
# Checkpoint identifier passed to `from_pretrained` when building the
# tokenizer and classifier for fine-tuning.
model_name = "FacebookAI/xlm-roberta-base"

In [2]:
def tokenize_data(example, tokenizer):
    """Run ``tokenizer`` over the ``'sentence'`` field of ``example``.

    The tokenizer is invoked with ``padding='max_length'`` and
    ``truncation=True``; its return value is handed back unchanged.
    """
    sentences = example['sentence']
    encoded = tokenizer(sentences, padding='max_length', truncation=True)
    return encoded

def get_tokenized_lang_dataset(tokenizer, dataset, lang):
    """Restrict ``dataset`` to one language and tokenize what is left.

    Rows are kept when their ``'lang'`` field equals ``lang``; the surviving
    rows are then passed through ``tokenize_data`` in batched mode.
    """
    def is_requested_language(row):
        return row['lang'] == lang

    def tokenize_batch(batch):
        return tokenize_data(batch, tokenizer)

    lang_only = dataset.filter(is_requested_language)
    return lang_only.map(tokenize_batch, batched=True)

# Turn the textual 'sentiment' field into integer class ids under 'labels'.
# Accepts both a batch (list of strings) and a single example (one string).
def transform_labels(examples):
    """Add a ``'labels'`` entry derived from ``examples['sentiment']``.

    Sentiment strings are matched case-insensitively:
    negative -> 0, neutral -> 1, positive -> 2. ``examples`` is updated in
    place and also returned. An unknown sentiment raises ``KeyError``.
    """
    label_map = {"negative": 0, "neutral": 1, "positive": 2}
    sentiment = examples['sentiment']

    if not isinstance(sentiment, list):
        # Single example: one string in, one integer out.
        examples['labels'] = label_map[sentiment.lower()]
        return examples

    encoded = []
    for text in sentiment:
        encoded.append(label_map[text.lower()])
    examples['labels'] = encoded
    return examples

In [3]:
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    """``Trainer`` subclass whose loss is a class-weighted cross-entropy."""

    def __init__(self, weight_tensor, *args, **kwargs):
        """Initialise the parent ``Trainer`` and then build the weighted loss.

        Args:
            weight_tensor: per-class weights given to ``torch.nn.CrossEntropyLoss``.
            *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        # self.model is read here, so this has to come after the parent constructor.
        class_weights = weight_tensor.to(self.model.device)
        self.loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy for one batch.

        ``"labels"`` is popped from ``inputs`` (so ``inputs`` is modified) and
        the rest is fed to ``model``. ``num_items_in_batch`` is accepted but
        not used.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        flat_logits = outputs.get("logits").view(-1, self.model.config.num_labels)
        loss = self.loss_fct(flat_logits, targets.view(-1))
        if return_outputs:
            return (loss, outputs)
        return loss




In [4]:
import numpy as np
import evaluate

# Accuracy metric loaded once and shared by every compute_metrics call.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``. The logits
            are assumed to be array-like with the class scores on axis 1 --
            TODO confirm for callers other than the trainers in this notebook.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    logits, labels = eval_pred
    # NumPy is imported in this cell; there is no need to go through a torch
    # tensor (and depend on torch being imported by another cell) for an arg-max.
    predictions = np.argmax(np.asarray(logits), axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
def get_hyperparameters(dataset, batch_size=32, number_of_epochs=4):
    """Derive logging and warmup step counts from the size of the train split.

    Args:
        dataset: mapping with a ``'train'`` entry that supports ``len()``.
        batch_size: number of examples per batch.
        number_of_epochs: number of passes over the train split.

    Returns:
        ``(batch_size, number_of_epochs, logging_steps, warmup_steps)``, in
        that order, so the tuple can be splatted into ``get_training_args``.
    """
    train_size = len(dataset['train'])

    # Number of full batches in one epoch, i.e. roughly one log entry per
    # epoch. Clamped to 1 so that a train split smaller than a single batch
    # does not produce logging_steps == 0.
    logging_steps = max(1, train_size // batch_size)

    # Count a trailing partial batch as a batch (integer ceiling division) so
    # the step total is a whole number rather than a fraction.
    batches_per_epoch = (train_size + batch_size - 1) // batch_size
    steps = batches_per_epoch * number_of_epochs
    # NOTE(review): this counts batches; get_training_args in this notebook
    # sets gradient_accumulation_steps=2, so the number of optimizer updates
    # is presumably smaller -- confirm the intended warmup length.
    warmup_steps = int(0.1 * steps)

    print(f"Train size {train_size}")
    print(f"Number of training steps: {steps}")
    print(f"Number of warmup steps: {warmup_steps}")
    print(f"Logging steps: {logging_steps}")
    print(f"Batch size: {batch_size}")
    print(f"Number of epochs: {number_of_epochs}")
    return batch_size, number_of_epochs, logging_steps, warmup_steps

In [None]:
from transformers import TrainingArguments
from toolbox.utils import get_output_dir

def get_training_args(model_name, batch_size, number_of_epochs, logging_steps, warmup_steps, lang):
    """Assemble the ``TrainingArguments`` for one language-specific run.

    The run is identified by ``f'{model_name}-{lang}'``, which is used both
    for the W&B run name and (via ``get_output_dir``) for the output
    directory. Evaluation happens every 500 steps, checkpoints are written
    every 1000 steps, and the best checkpoint is reloaded at the end.
    """
    run_id = f'{model_name}-{lang}'
    return TrainingArguments(
        # Where results go and how they are reported
        output_dir=get_output_dir(run_id),
        run_name=run_id,
        report_to="wandb",
        logging_steps=logging_steps,
        # Schedule
        num_train_epochs=number_of_epochs,
        warmup_steps=warmup_steps,
        learning_rate=2e-5,
        weight_decay=0.01,  # added weight decay
        # Batching and precision
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=2,  # accumulate gradients over 2 steps
        fp16=True,
        # Evaluation and checkpointing
        eval_strategy='steps',
        eval_steps=500,
        save_strategy='steps',
        save_steps=1000,
        load_best_model_at_end=True,
    )

In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import DataCollatorWithPadding

def fine_tune_language(dataset, lang):
    """Fine-tune a fresh ``model_name`` classifier on the ``lang`` rows of ``dataset``.

    Steps: load tokenizer and 3-label model, build the tokenized
    single-language dataset, train with ``CustomTrainer`` (weighted loss),
    then run one evaluation pass with a plain ``Trainer``.

    Returns:
        The model object that was trained.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    lang_dataset = get_tokenized_lang_dataset(tokenizer, dataset, lang)
    hyperparameters = get_hyperparameters(lang_dataset)
    training_args = get_training_args(model_name, *hyperparameters, lang=lang)

    # Everything the training and the evaluation trainer have in common.
    shared_trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=lang_dataset['train'].shuffle(seed=10),
        eval_dataset=lang_dataset['test'].shuffle(seed=10),
        compute_metrics=compute_metrics,
    )

    # Loss weights are indexed by class id (0, 1, 2).
    CustomTrainer(
        weight_tensor=torch.tensor([1.0, 2.0, 3.0]),
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        **shared_trainer_kwargs,
    ).train()

    # Evaluation uses a plain Trainer, i.e. without the CustomTrainer loss
    # override and without the padding collator; its result is not returned.
    Trainer(**shared_trainer_kwargs).evaluate()

    return model

In [8]:
import os
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

project_name = "xlm-roberta-base-finetuned-financial-phrasebank"
langs = ['en', 'de', 'es', 'fr']

# Reuse teacher checkpoints that already exist on disk. Any language whose
# directory is missing, or whose checkpoint fails to load, is queued in
# langs_to_fine_tune instead.
teacher_models = {}
langs_to_fine_tune = []
for lang in langs:
    model_dir = get_output_dir(f'{project_name}-{lang}')

    if not os.path.exists(model_dir):
        print(f"Model directory for {lang} does not exist: {model_dir}")
        langs_to_fine_tune.append(lang)
        continue

    try:
        teacher_models[lang] = AutoModelForSequenceClassification.from_pretrained(model_dir)
        teacher_models[lang].to(device)  # keep the teacher on the selected device
    except Exception as load_error:
        print(f"Error loading model for {lang}: {load_error}")
        langs_to_fine_tune.append(lang)
    

Using device: cuda


In [9]:
from datasets import load_dataset
# Load the multilingual phrasebank and attach integer 'labels' to every split.
ds = load_dataset("nojedag/financial_phrasebank_multilingual")
complete_dataset = ds.map(transform_labels, batched=True)

# Train (and persist) a teacher for every language queued in langs_to_fine_tune.
for lang in langs_to_fine_tune:
    print(f"Training model for {lang} language")
    fine_tuned_teacher = fine_tune_language(complete_dataset, lang)
    teacher_models[lang] = fine_tuned_teacher
    print(f"Model for {lang} language trained successfully")
    fine_tuned_teacher.save_pretrained(get_output_dir(f'{project_name}-{lang}'))


In [10]:
from torch.utils.data import DataLoader

# Tokenizer for the student. `num_labels` is a model setting, not a
# tokenizer one, so it is no longer passed here.
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/xlm-roberta-base')

# complete_dataset already carries the integer 'labels' column (it was
# produced with transform_labels), so only tokenization is needed; the
# previous extra label-mapping pass was overwritten immediately and did no work.
dataset = complete_dataset.map(lambda example: tokenize_data(example, tokenizer), batched=True)

train_dataset = dataset['train']
eval_dataset = dataset['test']

# Expose only the columns the distillation loop reads, as torch values.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "sentiment", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "sentiment", "labels"])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation metrics do not depend on batch order; keep it deterministic.
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

In [11]:
# The student starts from the pretrained checkpoint with a 3-label
# classification head and is moved to the selected device.
student_checkpoint = 'FacebookAI/xlm-roberta-base'
student_model = AutoModelForSequenceClassification.from_pretrained(student_checkpoint, num_labels=3)
student_model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [12]:
from torch import nn

# Loss modules and mixing constants used by the distillation training loop.
kl_loss_fn = nn.KLDivLoss(reduction="batchmean")
ce_loss_fn = nn.CrossEntropyLoss()
temperature = 2.0
alpha = 0.5

def distillation_loss(student_logits, teacher_logits, temperature):
    """Temperature-scaled KL divergence between teacher and student.

    Both logit tensors are divided by ``temperature``; the student side is
    turned into log-probabilities and the teacher side into probabilities
    (softmax over the last dimension). The ``batchmean`` KL divergence is then
    multiplied by ``temperature ** 2``.
    """
    softened_student = nn.functional.log_softmax(student_logits / temperature, dim=-1)
    softened_teacher = nn.functional.softmax(teacher_logits / temperature, dim=-1)
    divergence = kl_loss_fn(softened_student, softened_teacher)
    rescale = temperature ** 2
    return divergence * rescale

In [None]:
# Adam over all student parameters.
optimizer = torch.optim.Adam(student_model.parameters(), lr=5e-5)
from torch.utils.tensorboard import SummaryWriter
# Gradient scaler used together with the float16 autocast region of the training loop.
scaler = torch.amp.GradScaler()
num_epochs = 5  # Passes over train_loader. NOTE(review): the saved training log only shows epochs 0-2 -- confirm the intended value.
# Count of optimizer steps taken so far; used as the x-axis for TensorBoard scalars.
global_step = 0

In [None]:
student_model.train()
# Initialize TensorBoard writer for logging
writer = SummaryWriter("runs/student_model_logs")

# The teachers are only used for inference: put them in eval mode once here
# rather than once per teacher per batch.
for lang in langs:
    teacher_models[lang].eval()

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            # Teacher forward passes need no gradients.
            with torch.no_grad():
                teacher_logits_sum = None
                for lang in langs:
                    teacher_logits = teacher_models[lang](
                        input_ids=input_ids, attention_mask=attention_mask
                    ).logits
                    # Out-of-place add so no teacher output tensor is modified.
                    if teacher_logits_sum is None:
                        teacher_logits_sum = teacher_logits
                    else:
                        teacher_logits_sum = teacher_logits_sum + teacher_logits
                # Divide by the number of teachers that were actually summed
                # (one per entry of langs), not by the size of teacher_models.
                teacher_logits_avg = teacher_logits_sum / len(langs)

            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            student_logits = student_outputs.logits

            # Blend the hard-label loss with the distillation loss.
            loss_ce = ce_loss_fn(student_logits, labels)
            loss_kd = distillation_loss(student_logits, teacher_logits_avg, temperature)
            loss = alpha * loss_ce + (1 - alpha) * loss_kd

        # Scaled backward pass / optimizer step for mixed precision.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Log training loss (read the scalar once and reuse it below)
        loss_value = loss.item()
        writer.add_scalar("Training Loss", loss_value, global_step)
        global_step += 1

        if global_step % 100 == 0:
            print(f"Epoch: {epoch}, Step: {global_step}, Loss: {loss_value}")

Epoch: 0, Step: 100, Loss: 0.590625524520874
Epoch: 0, Step: 200, Loss: 0.419099897146225
Epoch: 0, Step: 300, Loss: 0.3767983615398407
Epoch: 0, Step: 400, Loss: 0.3011900782585144
Epoch: 0, Step: 500, Loss: 0.3580879867076874
Epoch: 1, Step: 600, Loss: 0.34319907426834106
Epoch: 1, Step: 700, Loss: 0.37530267238616943
Epoch: 1, Step: 800, Loss: 0.32280367612838745
Epoch: 1, Step: 900, Loss: 0.4464535117149353
Epoch: 1, Step: 1000, Loss: 0.26513671875
Epoch: 2, Step: 1100, Loss: 0.29456648230552673
Epoch: 2, Step: 1200, Loss: 0.35158029198646545
Epoch: 2, Step: 1300, Loss: 0.24187342822551727
Epoch: 2, Step: 1400, Loss: 0.2717231810092926
Epoch: 2, Step: 1500, Loss: 0.25153881311416626


In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch yielded by ``dataloader``.

    NOTE(review): this function has the same name as the ``evaluate`` package
    imported earlier in the notebook, so once this cell has run the name
    ``evaluate`` refers to this function.

    Args:
        model: called with ``input_ids`` and ``attention_mask``; its result
            must expose ``.logits``. It is switched to eval mode and left there.
        dataloader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        device: device the batch tensors are moved to.

    Returns:
        Dict with ``accuracy`` plus weighted-average ``precision``,
        ``recall`` and ``f1``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            batch_logits = model(input_ids=ids, attention_mask=mask).logits
            predicted.extend(torch.argmax(batch_logits, dim=-1).cpu().numpy())
            expected.extend(gold.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(expected, predicted, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [16]:
# Score the trained student on eval_loader, print the metrics, store them
# in TensorBoard as text, and shut the writer down.
final_metrics = evaluate(model=student_model, dataloader=eval_loader, device=device)
print("Final Evaluation Metrics:", final_metrics)
writer.add_text("Final Metrics", str(final_metrics))
writer.close()  # no further TensorBoard logging after this point

Final Evaluation Metrics: {'accuracy': 0.809014405933533, 'precision': 0.8205275349631127, 'recall': 0.809014405933533, 'f1': 0.776328489324103}


In [None]:
synthetic = load_dataset("nojedag/synthetic_financial_sentiment")
# Map the synthetic dataset that was just loaded. Previously this line
# re-mapped `ds` (the phrasebank dataset), leaving `synthetic` unused.
# NOTE(review): assumes the synthetic dataset has a 'sentiment' column with
# negative/neutral/positive values, as transform_labels requires -- confirm.
synthetic_data = synthetic.map(transform_labels, batched=True)


In [None]:
# Persist the student and its tokenizer side by side in the same directory.
for artifact in (student_model, tokenizer):
    artifact.save_pretrained("saved_student_model")
print("Student model and tokenizer saved to 'saved_student_model'")

Student model and tokenizer saved to 'saved_student_model'
