In [3]:
import warnings
warnings.filterwarnings('ignore') # to avoid warnings

import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys 

"""
Sklearn Libraries
"""
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

"""
Transformer Libraries
"""
from transformers import BertTokenizer,  AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
"""
Pytorch Libraries
"""
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

sys.path.append('../')
from src.utilities.config_ import train_data_path
import src.utilities.utils as utils

In [50]:
# read csv
data = pd.read_csv(os.path.join(train_data_path, "finance-dataset.csv"),
                   encoding='latin-1', 
                    names=['label', 'text']).iloc[1:]

# Convert labels to integers
label_to_int = {'positive': 0, 'neutral': 1, 'negative': 2}
data['label'] = data['label'].map(label_to_int)

data

Unnamed: 0,label,text
1,1,"According to Gran , the company has no plans t..."
2,1,Technopolis plans to develop in stages an area...
3,2,The international electronic industry company ...
4,0,With the new production plant the company woul...
5,0,According to the company 's updated strategy f...
...,...,...
4842,2,LONDON MarketWatch -- Share prices ended lower...
4843,1,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4844,2,Operating profit fell to EUR 35.4 mn from EUR ...
4845,2,Net sales of the Paper segment decreased to EU...


In [51]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Load the FinBERT model
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

In [53]:
# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)  # You can adjust the max_length as needed

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Rename the label column to "labels" for the trainer
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Remove unnecessary columns
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

# Convert to torch tensors
tokenized_dataset.set_format("torch")

# Split the dataset into train and test sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

Map: 100%|██████████| 4846/4846 [00:01<00:00, 2597.49 examples/s]


In [54]:
# Define training arguments with evaluation steps
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,  # Set a high number of epochs to allow for early stopping
    per_device_train_batch_size=8,  # Adjust based on your memory
    per_device_eval_batch_size=16,  # Adjust based on your memory
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,  # Log every 100 steps
    evaluation_strategy="steps",  # Evaluate every 'eval_steps'
    eval_steps=500,  # Evaluation interval, adjust based on your dataset size
    log_level="error",
)

In [55]:
from transformers import TrainerCallback, TrainerState, TrainerControl
import numpy as np

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, threshold: float, metric: str = "eval_accuracy"):
        self.threshold = threshold
        self.metric = metric

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.log_history and self.metric in state.log_history[-1]:
            accuracy = state.log_history[-1][self.metric]
            if accuracy > self.threshold:
                control.should_training_stop = True
                print(f"Stopping training as {self.metric} has reached {accuracy}")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define the custom callback with your threshold
early_stopping_callback = EarlyStoppingCallback(threshold=0.85, metric="eval_accuracy")

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics,  # Define a compute_metrics function to calculate accuracy
)

# Train the model
trainer.train()

{'loss': 4.4521, 'learning_rate': 1e-05, 'epoch': 0.21}
{'loss': 0.7295, 'learning_rate': 2e-05, 'epoch': 0.41}
{'loss': 0.5691, 'learning_rate': 3e-05, 'epoch': 0.62}
{'loss': 0.6397, 'learning_rate': 4e-05, 'epoch': 0.82}
{'loss': 0.5958, 'learning_rate': 5e-05, 'epoch': 1.03}
{'eval_loss': 0.5578722357749939, 'eval_accuracy': 0.8072164948453608, 'eval_runtime': 50.225, 'eval_samples_per_second': 19.313, 'eval_steps_per_second': 1.215, 'epoch': 1.03}
{'loss': 0.4561, 'learning_rate': 4.652777777777778e-05, 'epoch': 1.24}
{'loss': 0.4593, 'learning_rate': 4.305555555555556e-05, 'epoch': 1.44}
{'loss': 0.4638, 'learning_rate': 3.958333333333333e-05, 'epoch': 1.65}
{'loss': 0.417, 'learning_rate': 3.611111111111111e-05, 'epoch': 1.86}
{'loss': 0.3437, 'learning_rate': 3.263888888888889e-05, 'epoch': 2.06}
{'eval_loss': 0.7420837879180908, 'eval_accuracy': 0.8309278350515464, 'eval_runtime': 40.4004, 'eval_samples_per_second': 24.01, 'eval_steps_per_second': 1.51, 'epoch': 2.06}
{'loss':

TrainOutput(global_step=1500, training_loss=0.6723511295318604, metrics={'train_runtime': 2168.8505, 'train_samples_per_second': 7.148, 'train_steps_per_second': 0.894, 'train_loss': 0.6723511295318604, 'epoch': 3.09})

In [56]:
# Predict on the test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)


In [57]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the metric function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Evaluate the model
metrics = compute_metrics(predictions)
print(metrics)

{'accuracy': 0.8525773195876288, 'f1': 0.852897273228443, 'precision': 0.8533935043857552, 'recall': 0.8525773195876288}


In [46]:
test_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 970
})

In [58]:
# Your new text
new_text = ["This is a sample financial news article. The market is looking bad.", "Market is looking great now"]

# Tokenize the new text
inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=64, return_tensors="pt")

# Move inputs to the appropriate device (e.g., CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to labels
int_to_label = {0: 'positive', 1: 'neutral', 2: 'negative'}
predicted_labels = [int_to_label[pred.item()] for pred in predictions]

print(predicted_labels)


['negative', 'positive']


In [59]:
# Define the path to save the model and tokenizer
save_directory = "../model/finbert/"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

('../model/finbert/tokenizer_config.json',
 '../model/finbert/special_tokens_map.json',
 '../model/finbert/vocab.txt',
 '../model/finbert/added_tokens.json')

In [62]:
# Define the path to the saved directory
save_directory = "../model/finbert/"

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(save_directory)

# Load the model
model = BertForSequenceClassification.from_pretrained(save_directory)

In [63]:
# Define dummy training arguments (only `per_device_eval_batch_size` is relevant here)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,  # Adjust based on your memory
)

# Create Trainer instance with the loaded model
trainer = Trainer(
    model=model,
    args=training_args
)

# Predict on the test dataset
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [64]:
from sklearn.metrics import classification_report, accuracy_score


# Extract the true labels
true_labels = predictions.label_ids

# Generate the classification report
report = classification_report(true_labels, preds, target_names=['negative', 'neutral', 'positive'])
accuracy = accuracy_score(true_labels, preds)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.8525773195876288
Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.80      0.80       271
     neutral       0.89      0.88      0.88       583
    positive       0.81      0.84      0.83       116

    accuracy                           0.85       970
   macro avg       0.83      0.84      0.84       970
weighted avg       0.85      0.85      0.85       970

