In [None]:
!pip install transformers

In [1]:
# Load model directly using AutoTokenizer and AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#tokenizer
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
#model
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**Model Description**




>

This model is a distilled version of the RoBERTa-base model. It follows the same training procedure as DistilBERT. The code for the distillation process can be found here. This model is case-sensitive: it makes a difference between English and English.

The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.

Training Data
Polar sentiment dataset of sentences from financial news. The dataset consists of 4840 sentences from English language financial news categorised by sentiment. The dataset is divided by agreement rate of 5-8 annotators.

Training procedure
Training hyperparameters
The following hyperparameters were used during training:

learning_rate: 2e-05
train_batch_size: 8
eval_batch_size: 8
seed: 42
optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
lr_scheduler_type: linear
num_epochs: 5



In [2]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [3]:
device = "cuda"
model = model.to(device)  # switch the model to cuda , is_availiable first to if you have cuda available

In [4]:
# Lets test out the model by feeding the custom text
text = "Operating profit totaled EUR 9.4 mn , down from EUR 11.7 mn in 2004 "

#Tokenization
tokenized_demo = tokenizer(
    text,
    truncation=True,
    return_tensors="pt"  # Return PyTorch tensors
)
#Print the tokenized output
tokenized_demo

{'input_ids': tensor([[    0, 20420,  1295,  1963, 15137, 10353,   361,     4,   306,   475,
           282,  2156,   159,    31, 10353,   365,     4,   406,   475,   282,
            11,  4482,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Creating Dataset

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [4]:
from datasets import load_dataset
df_hug = load_dataset("neeeeellllll/Financial_data_new")
df_hug

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 4673
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 1169
    })
})

In [8]:
df_hug['train']

Dataset({
    features: ['text', 'labels'],
    num_rows: 4673
})

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True) # tokenizer function

tokenized_data = df_hug.map(preprocess_function, batched=True)

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

In [10]:
tokenized_data['train']

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 4673
})

In [6]:
# Converting hugging face dataset into pytorch dataset using pytorch CustomDataset
import torch
from torch.utils.data import Dataset, DataLoader

# Define your PyTorch dataset
class CustomDataset(Dataset):
    def __init__(self, input_ids,attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.labels[idx]),
        }


In [7]:
#train
input_ids_train = tokenized_data["train"]["input_ids"] # input_ids
attention_mask_train = tokenized_data["train"]["attention_mask"] # attention_mask
labels_train = tokenized_data["train"]["labels"] #label which is saved int the "Sentiment" column of the dataset

#test
input_ids_test = tokenized_data["test"]["input_ids"]
attention_mask_test = tokenized_data["test"]["attention_mask"]
labels_test = tokenized_data["test"]["labels"]

In [8]:
# Create PyTorch datasets
train_dataset = CustomDataset(input_ids_train,attention_mask_train, labels_train)
test_dataset = CustomDataset(input_ids_test,attention_mask_test, labels_test)

In [9]:
# Create PyTorch DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=20, shuffle=False)

In [None]:
#!pip install evaluate

In [10]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(p):
    pred_probs, labels = p
    pred = np.argmax(pred_probs, axis=1) #taking out the prediction

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [16]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m266.2/270.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [11]:
import accelerate
import transformers

print(transformers.__version__)
print(accelerate.__version__)

4.37.2
0.26.1


In [19]:
!pip uninstall transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.35.2.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.35.2
Found existing installation: accelerate 0.26.1
Uninstalling accelerate-0.26.1:
  Would remove:
    /usr/local/bin/accelerate
    /usr/local/bin/accelerate-config
    /usr/local/bin/accelerate-estimate-memory
    /usr/local/bin/accelerate-launch
    /usr/local/lib/python3.10/dist-packages/accelerate-0.26.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/accelerate/*
Proceed (Y/n)? Y
  Successfully uninstalled accelerate-0.26.1
Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelera

In [12]:
from transformers import TrainingArguments, Trainer, AdamW, get_linear_schedule_with_warmup

#Optimizer
learning_rate = 2e-05
num_epoch = 10

total_steps = len(train_dataloader) * num_epoch


optimizer = AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define Trainer
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=num_epoch,
    learning_rate=2e-5
)



In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader.dataset,
    eval_dataset=test_dataloader.dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [14]:
trainer.train()

Step,Training Loss
500,0.5408
1000,0.3979
1500,0.298
2000,0.2615
2500,0.2161
3000,0.2014
3500,0.2041
4000,0.2047
4500,0.2026
5000,0.1922


TrainOutput(global_step=5850, training_loss=0.2629364339714376, metrics={'train_runtime': 408.2402, 'train_samples_per_second': 114.467, 'train_steps_per_second': 14.33, 'total_flos': 617467687863066.0, 'train_loss': 0.2629364339714376, 'epoch': 10.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.5491074323654175,
 'eval_accuracy': 0.8169375534644996,
 'eval_precision': 0.8054836128359761,
 'eval_recall': 0.8169375534644996,
 'eval_f1': 0.8088968023497544,
 'eval_runtime': 2.043,
 'eval_samples_per_second': 572.195,
 'eval_steps_per_second': 71.953,
 'epoch': 10.0}