In [None]:
import sys
import os

# Add the parent directory to the path so Python can find the toolbox package
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

In [11]:
model_name = "distilroberta-finetuned-financial-news-sentiment-analysis-european"

In [12]:
import wandb

wandb.login()

True

In [13]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("nojedag/financial_phrasebank_multilingual")

README.md:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/526k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5258 [00:00<?, ? examples/s]

In [None]:
from toolbox.utils import transform_labels
dataset = ds.map(transform_labels)

Map:   0%|          | 0/12268 [00:00<?, ? examples/s]

Map:   0%|          | 0/5258 [00:00<?, ? examples/s]

In [15]:
# let's train a Distilbert model

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis')

# let's tokenize the data for the model to be able to understand
def tokenize_data(example):
    return tokenizer(example['sentence'], padding='max_length')    

In [16]:
dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/12268 [00:00<?, ? examples/s]

Map:   0%|          | 0/5258 [00:00<?, ? examples/s]

In [17]:
from transformers import AutoModelForSequenceClassification

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis", num_labels=3)

In [18]:
# the default batch size for training arguments
batch_size = 8

# set number of epochs
number_of_epochs = 7
# let set the logging steps
logging_steps = len(dataset['train']) // batch_size # it should log each batch 

steps = (len(dataset['train']) / batch_size) * number_of_epochs
warmup_steps = int(0.2 * steps)

In [None]:
from transformers import TrainingArguments
from toolbox.utils import get_output_dir

training_args = TrainingArguments(
    num_train_epochs=number_of_epochs, 
    load_best_model_at_end=True,
    eval_strategy='steps', 
    save_strategy='steps',
    learning_rate=2e-5,
    logging_steps=logging_steps,
    warmup_steps= warmup_steps,
    save_steps=1000,
    eval_steps=500,
    output_dir=get_output_dir(model_name),
    report_to='wandb'   
)

In [20]:
train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['test'].shuffle(seed=10)

In [21]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)




In [22]:
# Launch the learning process: training
# Load model from checkpoint
trainer.train()



Step,Training Loss,Validation Loss
500,No log,0.841141
1000,No log,0.822318
1500,No log,0.80705
2000,1.030300,0.727346
2500,1.030300,0.704043
3000,1.030300,0.651728
3500,0.700700,0.688775
4000,0.700700,0.662357
4500,0.700700,0.683962
5000,0.558100,0.766817


TrainOutput(global_step=10738, training_loss=0.5339942770444785, metrics={'train_runtime': 1704.2585, 'train_samples_per_second': 50.389, 'train_steps_per_second': 6.301, 'total_flos': 1.1375973197697024e+16, 'train_loss': 0.5339942770444785, 'epoch': 7.0})

In [None]:
from toolbox.utils import compute_metrics

trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [24]:
trainer_eval.evaluate()

{'eval_loss': 0.6365763545036316,
 'eval_model_preparation_time': 0.0015,
 'eval_accuracy': {'accuracy': 0.7542791936097375},
 'eval_f1score': {'f1': 0.743367499723361},
 'eval_runtime': 23.9922,
 'eval_samples_per_second': 219.154,
 'eval_steps_per_second': 27.426}

In [25]:
model.save_pretrained(model_name)
model.push_to_hub(f'nojedag/{model_name}')

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european/commit/f6ce247391ae90a76c0f39761a3ca37afc102b20', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='f6ce247391ae90a76c0f39761a3ca37afc102b20', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european'), pr_revision=None, pr_num=None)

In [26]:
trainer.push_to_hub()
trainer_eval.push_to_hub()

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european/commit/371be9f9daeedeb64d2eefa2ec614aa800a0b8fa', commit_message='End of training', commit_description='', oid='371be9f9daeedeb64d2eefa2ec614aa800a0b8fa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/distilroberta-finetuned-financial-news-sentiment-analysis-european'), pr_revision=None, pr_num=None)

In [27]:
wandb.finish()

0,1
eval/loss,▅▄▄▃▂▁▂▁▂▃▂▁▄▅▄▆▆▆▆█▇▁
eval/model_preparation_time,▁
eval/runtime,▃▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▂▃█
eval/samples_per_second,▆███████████▇▇████▇▇▆▁
eval/steps_per_second,▆███████████▇▇████▇▇▆▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████▁
train/grad_norm,▅█▂█▁▆▂
train/learning_rate,▇█▇▅▄▂▁
train/loss,█▅▄▃▂▁▁

0,1
eval/loss,0.63658
eval/model_preparation_time,0.0015
eval/runtime,23.9922
eval/samples_per_second,219.154
eval/steps_per_second,27.426
total_flos,1.1375973197697024e+16
train/epoch,7.0
train/global_step,0.0
train/grad_norm,8.01559
train/learning_rate,0.0
