In [12]:
import sys
import os

# Add the parent directory to the path so Python can find the toolbox package
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

Added e:\repo\DistilBERTFinancialSentiment to sys.path


In [13]:
model_name = 'xlm-roberta-finetuned-financial-news-sentiment-analysis-european'

In [14]:
import wandb

wandb.login()

True

In [15]:
from datasets import load_dataset
ds = load_dataset("nojedag/financial_phrasebank_multilingual")

In [16]:
# Define the transform_labels function to handle batched input
def transform_labels(examples):
	label_map = {"negative": 0, "neutral": 1, "positive": 2}
	if isinstance(examples['sentiment'], list):
		examples['labels'] = [label_map[s.lower()] for s in examples['sentiment']]
	else:
		examples['labels'] = label_map[examples['sentiment'].lower()]
	return examples

dataset = ds.map(transform_labels, batched=True)

In [17]:
# let's train a Distilbert model

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/xlm-roberta-base', num_labels=3)

# let's tokenize the data for the model to be able to understand
def tokenize_data(example):
    return tokenizer(example['sentence'], padding='max_length', truncation=True)    

In [18]:
dataset = dataset.map(tokenize_data, batched=True)

In [19]:
from transformers import AutoModelForSequenceClassification

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels=3)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# the default batch size for training arguments
batch_size = 32

# set number of epochs
number_of_epochs = 4
# let set the logging steps
logging_steps = len(dataset['train']) // batch_size # it should log each batch 

steps = (len(dataset['train']) / batch_size) * number_of_epochs
warmup_steps = int(0.1 * steps)

In [21]:
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, weight_tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor.to(self.model.device))

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [22]:
import numpy as np
import evaluate

metric=evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1).numpy()
    return metric.compute(predictions=predictions, references=labels)

In [23]:
from transformers import TrainingArguments
from toolbox.utils import get_output_dir

training_args = TrainingArguments(
                                  num_train_epochs=number_of_epochs, 
                                  load_best_model_at_end=True,
                                  eval_strategy='steps', 
                                  save_strategy='steps',
                                  learning_rate=2e-5,
                                  weight_decay=0.01,  # added weight decay
                                  logging_steps=logging_steps,
                                  warmup_steps= warmup_steps,
                                  save_steps=1000,
                                  eval_steps=500,
                                  output_dir=get_output_dir(model_name),
                                  report_to="wandb",
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  gradient_accumulation_steps=2,  # accumulate gradients over 2 steps
                                  fp16=True
                                )

In [24]:
train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['test'].shuffle(seed=10)

In [25]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
from transformers import Trainer
weight_tensor = torch.tensor([1.0, 2.0, 3.0])

trainer = CustomTrainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, weight_tensor=weight_tensor, data_collator=data_collator, compute_metrics=compute_metrics
)

In [27]:
# Launch the learning process: training
# Load model from checkpoint
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,No log,0.332211,0.818856
1000,0.560300,0.295033,0.839538


TrainOutput(global_step=1024, training_loss=0.39249103268957697, metrics={'train_runtime': 831.1382, 'train_samples_per_second': 78.721, 'train_steps_per_second': 1.232, 'total_flos': 1.7214984695107584e+16, 'train_loss': 0.39249103268957697, 'epoch': 4.0})

In [28]:
trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [29]:
trainer_eval.evaluate()

{'eval_loss': 0.3846326172351837,
 'eval_model_preparation_time': 0.002,
 'eval_accuracy': 0.8395378690629012,
 'eval_runtime': 22.9608,
 'eval_samples_per_second': 305.346,
 'eval_steps_per_second': 9.582}

In [30]:
# Evaluate in synthetic data
synthetic_ds = load_dataset("nojedag/synthetic_financial_sentiment")
synthetic_ds = synthetic_ds.map(transform_labels, batched=True)

In [31]:
tokenized_synthetic = synthetic_ds.map(lambda ex: tokenize_data(ex), batched=True, remove_columns=['sentence', 'lang'])
synthetic_train_dataset = tokenized_synthetic['train'].shuffle(seed=10)
synthetic_eval_dataset = tokenized_synthetic['test'].shuffle(seed=10)

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [32]:
synthetic_trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=synthetic_train_dataset,
    eval_dataset=synthetic_eval_dataset,
    compute_metrics=compute_metrics
)

In [33]:
synthetic_trainer_eval.evaluate()

{'eval_loss': 1.09870183467865,
 'eval_model_preparation_time': 0.003,
 'eval_accuracy': 0.6666666666666666,
 'eval_runtime': 0.3481,
 'eval_samples_per_second': 249.921,
 'eval_steps_per_second': 8.618}

In [34]:
model.save_pretrained(model_name)
model.push_to_hub(f'nojedag/{model_name}')

README.md:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european/commit/ab1532ecfaeb0e7dafaa644634fe65e7d4e1f687', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='ab1532ecfaeb0e7dafaa644634fe65e7d4e1f687', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european'), pr_revision=None, pr_num=None)

In [35]:
trainer.push_to_hub()
trainer_eval.push_to_hub()

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european/commit/13b5c704ec4c1ff210bb714a25e152cf31f79caf', commit_message='End of training', commit_description='', oid='13b5c704ec4c1ff210bb714a25e152cf31f79caf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/xlm-roberta-finetuned-financial-news-sentiment-analysis-european'), pr_revision=None, pr_num=None)

In [36]:
wandb.finish()

0,1
eval/accuracy,▇██▁
eval/loss,▁▁▂█
eval/model_preparation_time,▁█
eval/runtime,███▁
eval/samples_per_second,▇▇█▁
eval/steps_per_second,▆▇█▁
train/epoch,▁▁███
train/global_step,▄▄███▁▁
train/grad_norm,▁█
train/learning_rate,█▁

0,1
eval/accuracy,0.66667
eval/loss,1.0987
eval/model_preparation_time,0.003
eval/runtime,0.3481
eval/samples_per_second,249.921
eval/steps_per_second,8.618
total_flos,1.7214984695107584e+16
train/epoch,4.0
train/global_step,0.0
train/grad_norm,18.0472
