# BERT For Financial Sentiment Analysis 
This notebook shows how to train and use the BERT pre-trained language model for financial sentiment analysis.

## Install Dependencies

In [1]:
!pip install -q transformers 
!pip install -q torch
!pip install -q nltk 
!pip install transformers[torch]
!pip install accelerate -U



In [2]:
!pip install datasets



## Imports

In [3]:
from pathlib import Path
import shutil
import os
import logging
import torch
import sys
import pandas as pd
from pprint import pprint
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification , AutoTokenizer
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# NOTE(review): `load_metric` is not referenced in the visible cells, and recent
# `datasets` releases reportedly dropped it in favour of the separate `evaluate`
# package - confirm against the installed version; this import may fail there.
from datasets import load_metric
import numpy as np

# Reload edited local modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2



In [4]:
import warnings

# Hide every warning so the recorded cell outputs stay compact.
warnings.filterwarnings("ignore")

# Root logger: errors only, each record prefixed with timestamp, level and logger name.
logging.basicConfig(
    level=logging.ERROR,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory.
project_dir = Path.cwd().parent

## Prepare Train & Test Data 

In [5]:
%run ./../scripts/datasets.py --data_path ../'data'/'sentiment_data'/'Sentences_50Agree.txt'

In [6]:
# Folder holding the prepared sentiment CSVs.
cl_data_path = project_dir / 'data' / 'sentiment_data'

# Both splits are tab-separated files.
# NOTE: `eval` shadows the Python built-in; the name is kept because later cells use it.
train, eval = (
    pd.read_csv(cl_data_path / csv_name, sep='\t', index_col=False)
    for csv_name in ('train.csv', 'test.csv')
)

In [7]:
# (rows, columns) of the training frame.
train.shape

(3488, 3)

In [8]:
# (rows, columns) of the evaluation frame.
eval.shape

(970, 3)

In [9]:
# Peek at the first rows: an id column, the sentence text and its string sentiment label.
train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,1950,"After the reporting period , BioTie North Amer...",positive
1,4283,They will cover all Forest Industry 's units a...,negative
2,3014,"( ADP News ) - Nov 28 , 2008 - Finnish power-s...",positive
3,4097,"Following the transaction , Lundbeck has world...",positive
4,2733,A few employees would remain at the Oulu plant...,neutral


In [10]:
# Re-save both frames (without the pandas index) using pandas' default comma
# separator, under the file names that the dataset-loading cell reads.
train.to_csv(cl_data_path / "train_subset.csv", index=False)
eval.to_csv(cl_data_path / "eval.csv", index=False)

In [11]:
# Load the two CSV files as the 'train' and 'eval' splits of one DatasetDict.
dataset = load_dataset(
    'csv',
    data_files={
        'train': str(cl_data_path / 'train_subset.csv'),
        'eval': str(cl_data_path / 'eval.csv'),
    },
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [12]:
# Inspect the loaded splits: column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 3488
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 970
    })
})

## Evaluate the bert-base-uncased model without fine-tuning on the financial corpus

### Load bert-base-uncased 

In [13]:
# Checkpoint name shared by the tokenizer and by every model loaded in this notebook.
MODEL = 'bert-base-uncased'

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [14]:
def transform_labels(label):
    """Convert a row's string sentiment into the integer class id used for training.

    Args:
        label: A dataset row (mapping) whose 'label' entry is 'negative',
            'neutral' or 'positive'. Surrounding whitespace and letter case
            are ignored, so e.g. 'Positive' is accepted as well.

    Returns:
        dict: ``{'labels': class_id}`` with negative -> 0, neutral -> 1,
        positive -> 2.

    Raises:
        ValueError: If the label is not one of the three known sentiments.
            Such rows used to be silently assigned class 0 ('negative'),
            which corrupts the training targets without any visible error.
    """
    label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}

    sentiment = label['label']
    # Normalize only real strings; anything else (None, NaN, ...) is rejected below.
    key = sentiment.strip().lower() if isinstance(sentiment, str) else None
    if key not in label_to_id:
        raise ValueError(f"Unknown sentiment label: {sentiment!r}")

    return {'labels': label_to_id[key]}

# Defining a function to tokenize text
def tokenize_data(example):
    """Tokenize the 'text' field of a row or of a batch of rows.

    Args:
        example: A mapping with a 'text' entry (a string, or a list of
            strings when called through ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the text, padded to the tokenizer's maximum
        length. ``truncation=True`` additionally clips any text that exceeds
        that length, so every encoded sequence has the same size and never
        exceeds what the model accepts.
    """
    return tokenizer(example['text'], padding='max_length', truncation=True)

# Turn each sentence into model inputs (batched, so many rows are handled per call)
dataset = dataset.map(tokenize_data, batched=True)

# Transform labels to integer ids and remove the columns the model does not consume.
# NOTE: this cell rebinds `dataset` and drops 'text', so it cannot be re-run on its
# own - re-run the cell that loads the dataset first.
remove_columns = ['Unnamed: 0','label','text']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/3488 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

Map:   0%|          | 0/3488 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [15]:
# After mapping, only the tokenizer outputs and the integer `labels` column remain.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3488
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 970
    })
})

In [16]:
# Load the pretrained checkpoint with a 3-class head (negative / neutral / positive).
# The classifier weights are newly initialized, as the warning printed below reports.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate bert-base-uncased model 

In [17]:
# Shuffle both splits with a fixed seed so the row order is reproducible.
train_dataset, eval_dataset = (
    dataset[split].shuffle(seed=10) for split in ('train', 'eval')
)


In [18]:
# Arguments for the baseline run. The Trainer built from them is only used for
# `trainer.evaluate()`, so the training hyper-parameters have no effect here.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir='./results',
    save_strategy='epoch',          # checkpoint at the end of every epoch
    save_steps=100,                 # step interval; only relevant when saving by steps
    load_best_model_at_end=True,    # restore the best checkpoint once training ends
    # --- evaluation ---
    evaluation_strategy='epoch',    # evaluate at the end of every epoch
    eval_steps=100,                 # step interval; only relevant when evaluating by steps
    per_device_eval_batch_size=8,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-6,
    warmup_steps=500,               # linear warm-up from 0 to learning_rate
    weight_decay=0.01,
    seed=42,
    # --- logging ---
    logging_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,              # step interval; only relevant when logging by steps
)

In [19]:
def compute_metrics(eval_pred):
    """Score a batch of model outputs against the reference labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            taken as the arg-max of ``predictions`` along axis 1.

    Returns:
        dict: 'accuracy' plus support-weighted 'precision', 'recall' and 'f1'.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}

    # The remaining three metrics share the same call signature and averaging mode.
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, predicted, average='weighted')

    return metrics

In [20]:
# This Trainer is not used for training: it only runs the evaluation loop with
# `compute_metrics` on the model whose classification head is still untrained.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
# Launch the evaluation
# The result is the baseline to compare the fine-tuned runs against.
eval_bert_base_uncased = trainer.evaluate()


In [22]:
# Display the baseline metrics.
eval_bert_base_uncased

{'eval_loss': 1.0184953212738037,
 'eval_accuracy': 0.5855670103092784,
 'eval_precision': 0.4603280029270603,
 'eval_recall': 0.5855670103092784,
 'eval_f1': 0.4417719311986905,
 'eval_runtime': 28.2116,
 'eval_samples_per_second': 34.383,
 'eval_steps_per_second': 4.324}

### Without fine-tuning, the bert-base-uncased model is not effective for financial sentiment evaluation

## Fine Tune bert_base_uncased model with Financial Corpus Data



### Train bert_base_uncased model with Financial Corpus 

In [23]:
# Left disabled: the training cells below reuse `base_model` loaded earlier and
# fine-tune it in place, so no separate model is created here.
#model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

In [24]:
# Training arguments for fine-tuning the whole model: 10 epochs, evaluated,
# logged and checkpointed once per epoch.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir='./results',
    save_strategy='epoch',          # checkpoint at the end of every epoch
    save_steps=100,                 # step interval; only relevant when saving by steps
    load_best_model_at_end=True,    # restore the best checkpoint once training ends
    # --- evaluation ---
    evaluation_strategy='epoch',    # evaluate at the end of every epoch
    eval_steps=100,                 # step interval; only relevant when evaluating by steps
    per_device_eval_batch_size=8,
    # --- optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=5e-6,
    warmup_steps=500,               # linear warm-up from 0 to learning_rate
    weight_decay=0.01,
    seed=42,
    # --- logging ---
    logging_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,              # step interval; only relevant when logging by steps
)

In [25]:
# Trainer for the fine-tuning run. No `compute_metrics` is passed, so only the
# losses are reported per epoch.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [26]:
# Launch the learning process: training
# `base_model` is updated in place; the table below lists the loss after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8983,0.724583
2,0.5638,0.431989
3,0.3473,0.444492
4,0.2468,0.495203
5,0.1775,0.60669
6,0.1335,0.639647
7,0.1029,0.699017
8,0.0783,0.732332
9,0.061,0.766704
10,0.0461,0.787283


TrainOutput(global_step=4360, training_loss=0.26554091348560577, metrics={'train_runtime': 3537.5446, 'train_samples_per_second': 9.86, 'train_steps_per_second': 1.232, 'total_flos': 9177396010352640.0, 'train_loss': 0.26554091348560577, 'epoch': 10.0})

### Evaluate tuned model 

In [27]:
# Arguments for scoring the fine-tuned model. The Trainer built from them is only
# used for `trainer.evaluate()`, so the training hyper-parameters have no effect here.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir='./results',
    save_strategy='epoch',          # checkpoint at the end of every epoch
    save_steps=100,                 # step interval; only relevant when saving by steps
    load_best_model_at_end=True,    # restore the best checkpoint once training ends
    # --- evaluation ---
    evaluation_strategy='epoch',    # evaluate at the end of every epoch
    eval_steps=100,                 # step interval; only relevant when evaluating by steps
    per_device_eval_batch_size=8,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    warmup_steps=500,               # linear warm-up from 0 to learning_rate
    weight_decay=0.01,
    seed=42,
    # --- logging ---
    logging_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,              # step interval; only relevant when logging by steps
)

In [28]:
# Rebuild the Trainer with `compute_metrics` so that evaluating the fine-tuned
# model reports accuracy, precision, recall and F1 in addition to the loss.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [29]:

# Launch the final evaluation
# The eval_loss reported below equals the lowest validation loss of the training run
# (epoch 2), consistent with load_best_model_at_end=True having restored that checkpoint.
trainer.evaluate()
     

{'eval_loss': 0.43198877573013306,
 'eval_accuracy': 0.8371134020618557,
 'eval_precision': 0.8367029624243868,
 'eval_recall': 0.8371134020618557,
 'eval_f1': 0.8340533704688193,
 'eval_runtime': 29.1127,
 'eval_samples_per_second': 33.319,
 'eval_steps_per_second': 4.191}

### Training takes close to 59 minutes (train_runtime ≈ 3,538 s). Let's try to optimize training time and resources

### Let's fine-tune only a subset of the model's layers instead of the whole model, and use a low learning rate to mitigate catastrophic forgetting

In [30]:
# Start again from the pre-trained checkpoint so this experiment does not build on
# the weights produced by the full fine-tuning run.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

# Number of leading encoder layers whose parameters are excluded from training.
freeze = 6

# Switch off gradients for every parameter of the first `freeze` encoder layers.
for frozen_layer in base_model.bert.encoder.layer[:freeze]:
    frozen_layer.requires_grad_(False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Training arguments for the partially frozen model: 10 epochs, evaluated,
# logged and checkpointed once per epoch.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir='./results',
    save_strategy='epoch',          # checkpoint at the end of every epoch
    save_steps=100,                 # step interval; only relevant when saving by steps
    load_best_model_at_end=True,    # restore the best checkpoint once training ends
    # --- evaluation ---
    evaluation_strategy='epoch',    # evaluate at the end of every epoch
    eval_steps=100,                 # step interval; only relevant when evaluating by steps
    per_device_eval_batch_size=8,
    # --- optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=5e-6,
    warmup_steps=500,               # linear warm-up from 0 to learning_rate
    weight_decay=0.01,
    seed=42,
    # --- logging ---
    logging_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,              # step interval; only relevant when logging by steps
)

In [32]:
# Trainer for the partially frozen model. No `compute_metrics` is passed, so only
# the losses are reported per epoch.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [33]:
# Launch the learning process: training
# `base_model` is updated in place; the table below lists the loss after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.9381,0.735188
2,0.6204,0.460285
3,0.3986,0.406355
4,0.3196,0.393945
5,0.2736,0.438548
6,0.2369,0.486243
7,0.1981,0.498596
8,0.1823,0.540372
9,0.1574,0.568295
10,0.1501,0.569408


TrainOutput(global_step=4360, training_loss=0.34748814609072626, metrics={'train_runtime': 3040.4642, 'train_samples_per_second': 11.472, 'train_steps_per_second': 1.434, 'total_flos': 9177396010352640.0, 'train_loss': 0.34748814609072626, 'epoch': 10.0})

### Evaluate tuned model 

In [34]:
# Arguments for scoring the partially frozen model. The Trainer built from them is only
# used for `trainer.evaluate()`, so the training hyper-parameters have no effect here.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir='./results',
    save_strategy='epoch',          # checkpoint at the end of every epoch
    save_steps=100,                 # step interval; only relevant when saving by steps
    load_best_model_at_end=True,    # restore the best checkpoint once training ends
    # --- evaluation ---
    evaluation_strategy='epoch',    # evaluate at the end of every epoch
    eval_steps=100,                 # step interval; only relevant when evaluating by steps
    per_device_eval_batch_size=8,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-6,
    warmup_steps=500,               # linear warm-up from 0 to learning_rate
    weight_decay=0.01,
    seed=42,
    # --- logging ---
    logging_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,              # step interval; only relevant when logging by steps
)

In [35]:
# Rebuild the Trainer with `compute_metrics` so that evaluating the partially frozen
# model reports accuracy, precision, recall and F1 in addition to the loss.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [36]:

# Launch the final evaluation
# The eval_loss reported below equals the lowest validation loss of the training run
# (epoch 4), consistent with load_best_model_at_end=True having restored that checkpoint.
trainer.evaluate()

{'eval_loss': 0.39394500851631165,
 'eval_accuracy': 0.8608247422680413,
 'eval_precision': 0.8607818718150227,
 'eval_recall': 0.8608247422680413,
 'eval_f1': 0.8595835760041375,
 'eval_runtime': 28.3802,
 'eval_samples_per_second': 34.179,
 'eval_steps_per_second': 4.299}