In [None]:
import sys
import os

# Put the parent of the current working directory on sys.path so that Python
# can import the local `toolbox` package from this notebook
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

In [10]:
# Short name of the fine-tuned model; reused below to build the training output
# directory, the local save directory and the Hub repository name
model_name = "finbert-european"

In [11]:
import wandb

# Log in to Weights & Biases; the training run below reports to it (report_to='wandb')
wandb.login()

True

In [12]:
from datasets import load_dataset

# Load the `nojedag/financial_phrasebank_multilingual` dataset.
# Log in first (e.g. `huggingface-cli login`) to access this dataset
ds = load_dataset("nojedag/financial_phrasebank_multilingual")

In [None]:
from toolbox.utils import transform_labels
# Run the project's `transform_labels` helper over the loaded dataset.
# NOTE(review): presumably this converts the labels into the integer ids the
# classifier expects -- confirm against toolbox.utils
dataset = ds.map(transform_labels)

In [14]:
# Load the tokenizer of the ProsusAI/finbert checkpoint (a BERT model, not
# DistilBERT) that is fine-tuned below

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

# Tokenize the data so that the model can consume it
def tokenize_data(example):
    """Tokenize the ``sentence`` field of a dataset example or batch.

    Args:
        example: Mapping with a ``'sentence'`` entry. When used with
            ``Dataset.map(..., batched=True)`` this is a batch, i.e. the
            entry holds a list of sentences.

    Returns:
        The tokenizer output for the sentence(s).
    """
    # padding='max_length' pads every sequence to a fixed length;
    # truncation=True cuts longer sentences down to that same limit so that
    # no sequence exceeds what the model can process.
    return tokenizer(example['sentence'], padding='max_length', truncation=True)

In [15]:
# Tokenize the dataset in batches with `tokenize_data`.
# Note: this rebinds `dataset`, so later cells work on the tokenized version
dataset = dataset.map(tokenize_data, batched=True)

In [16]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained ProsusAI/finbert checkpoint for sequence classification,
# specifying the number of labels in our dataset (3) for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=3)

In [17]:
# Batch size; 8 is also the default batch size of TrainingArguments
batch_size = 8

# Number of passes over the training set
number_of_epochs = 7

# Training steps per epoch. Ceiling division (written as -(-a // b)) because the
# final, partial batch of an epoch still counts as a step; this also keeps the
# value >= 1 for any non-empty dataset.
# Assumes a single device and no gradient accumulation.
steps_per_epoch = -(-len(dataset['train']) // batch_size)

# Log the training loss once per epoch (not once per batch)
logging_steps = steps_per_epoch

# Total number of training steps; the first 20% are used for learning-rate warmup
steps = steps_per_epoch * number_of_epochs
warmup_steps = int(0.2 * steps)

In [None]:
from transformers import TrainingArguments
from toolbox.utils import get_output_dir

# Configuration of the fine-tuning run
training_args = TrainingArguments(
    num_train_epochs=number_of_epochs,
    # Train/evaluate with the same batch size that logging_steps and
    # warmup_steps were derived from, instead of relying on the library default
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    warmup_steps=warmup_steps,
    logging_steps=logging_steps,
    # Evaluate every 500 steps and checkpoint every 1000 steps; with
    # load_best_model_at_end the save interval must be a multiple of the
    # evaluation interval
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=1000,
    # Reload the best checkpoint once training has finished
    load_best_model_at_end=True,
    output_dir=get_output_dir(model_name),
    report_to='wandb'
)

In [19]:
# Shuffle both splits with one fixed seed so the order is reproducible
SHUFFLE_SEED = 10
train_dataset = dataset['train'].shuffle(seed=SHUFFLE_SEED)
eval_dataset = dataset['test'].shuffle(seed=SHUFFLE_SEED)

In [20]:
from transformers import Trainer

# Trainer used for fine-tuning. No compute_metrics is passed here, so the
# periodic evaluation during training only reports the validation loss
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)




In [21]:
# Launch the learning process: fine-tune the model.
# Note: training starts from the weights loaded above; no checkpoint is resumed
# here (no `resume_from_checkpoint` argument is passed)
trainer.train()



Step,Training Loss,Validation Loss
500,No log,0.869345
1000,No log,0.835812
1500,No log,0.849044
2000,0.942500,0.759386
2500,0.942500,0.729441
3000,0.942500,0.653877
3500,0.703400,0.724578
4000,0.703400,0.673453
4500,0.703400,0.70949
5000,0.496100,0.775336


TrainOutput(global_step=10738, training_loss=0.44890753558215357, metrics={'train_runtime': 3188.4498, 'train_samples_per_second': 26.933, 'train_steps_per_second': 3.368, 'total_flos': 2.259512786080973e+16, 'train_loss': 0.44890753558215357, 'epoch': 7.0})

In [None]:
from toolbox.utils import compute_metrics

# Second Trainer over the same model, arguments and datasets, this time with
# `compute_metrics`, so that evaluate() reports those metrics in addition to
# the loss
trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [23]:
# Evaluate the fine-tuned model on `eval_dataset` and display the resulting metrics
trainer_eval.evaluate()

{'eval_loss': 0.6538774967193604,
 'eval_model_preparation_time': 0.002,
 'eval_accuracy': {'accuracy': 0.7261316089767973},
 'eval_f1score': {'f1': 0.7096333691540592},
 'eval_runtime': 45.4933,
 'eval_samples_per_second': 115.577,
 'eval_steps_per_second': 14.464}

In [24]:
# Save the fine-tuned model and its tokenizer locally, then publish both.
# Neither Trainer was given the tokenizer, so it has to be saved and pushed
# explicitly; without it the Hub repository cannot be loaded on its own.
hub_repo_id = f'nojedag/{model_name}'
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)
tokenizer.push_to_hub(hub_repo_id)
# Kept as the last expression so the cell still displays the model's commit info
model.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/finbert-european/commit/13570c87d637e66e393c2964a4e35e17db27b379', commit_message='Upload BertForSequenceClassification', commit_description='', oid='13570c87d637e66e393c2964a4e35e17db27b379', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/finbert-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/finbert-european'), pr_revision=None, pr_num=None)

In [25]:
# Push both trainers to the Hub. They were built with the same `training_args`,
# and therefore share the same output directory
trainer.push_to_hub()
trainer_eval.push_to_hub()

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nojedag/finbert-european/commit/fe1852f45025ae49c202664f693b008febe29017', commit_message='End of training', commit_description='', oid='fe1852f45025ae49c202664f693b008febe29017', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/finbert-european', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/finbert-european'), pr_revision=None, pr_num=None)

In [26]:
# Finish the Weights & Biases run
wandb.finish()

0,1
eval/loss,▄▄▄▃▂▁▂▁▂▃▃▂▅▅▄▆▆▆██▇▁
eval/model_preparation_time,▁
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
eval/samples_per_second,██▇██████████████████▁
eval/steps_per_second,██▇██████████████████▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████▁
train/grad_norm,▅██▃▆▁▂
train/learning_rate,▇█▇▅▄▂▁
train/loss,█▆▄▃▂▁▁

0,1
eval/loss,0.65388
eval/model_preparation_time,0.002
eval/runtime,45.4933
eval/samples_per_second,115.577
eval/steps_per_second,14.464
total_flos,2.259512786080973e+16
train/epoch,7.0
train/global_step,0.0
train/grad_norm,5.1894
train/learning_rate,0.0
