# **Financial Impact Analysis Using RoBERTa**

## **Install necessary packages**

In [None]:
!pip install transformers tokenizers huggingface_hub tensorboard==2.11 trl



In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaTokenizerFast,
    RobertaConfig,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)
from pathlib import Path
from huggingface_hub import PyTorchModelHubMixin

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
CUDA SETUP: Loading binary /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so...
dlopen(/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so, 0x0006): tried: '/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (no such file), '/Library/Frameworks/Python.frame

  warn("The installed version of bitsandbytes was compiled without GPU support. "


## **Preprocessings**

In [4]:
# Load dataset
dataset = pd.read_csv('./forex_factory_dataset.csv')

### Data cleaning

In [5]:
# Define function to remove non-numeric characters
def remove_non_numeric(text):
    pattern = r'[^0-9.]'
    result = re.sub(pattern, '', text)
    return result

def impact_coding(impact):
  if impact == "Low Impact Expected":
    return 0
  elif impact == "Medium Impact Expected":
    return 1
  else:
    return 2

dataset['impact'] = dataset['impact'].map(lambda row: impact_coding(row))
dataset.dropna(subset=['actual_value'], inplace=True)
dataset['forecast_value'].fillna(dataset['previous_value'], inplace=True)
dataset.dropna(inplace=True)


In [6]:
display(dataset)

Unnamed: 0.1,Unnamed: 0,currency,event,impact,previous_value,forecast_value,actual_value
12,12,JPY,Final Manufacturing PMI,0,49.7,49.7,50.0
13,13,CNY,Caixin Manufacturing PMI,0,54.9,54.7,53.0
14,14,AUD,Commodity Prices y/y,0,2.5%,2.5%,11.7%
15,15,EUR,Spanish Manufacturing PMI,0,49.8,52.6,51.0
16,16,CHF,Manufacturing PMI,0,55.2,54.4,58.0
...,...,...,...,...,...,...,...
15178,15178,GBP,30-y Bond Auction,0,2.53|2.2,2.53|2.2,2.36|2.5
15179,15179,USD,NFIB Small Business Index,0,89.5,89.5,89.9
15180,15180,USD,Prelim Nonfarm Productivity q/q,0,-7.3%,-4.6%,-4.6%
15181,15181,USD,Prelim Unit Labor Costs q/q,0,12.6%,9.4%,10.8%


In [7]:
condition_mask = dataset['previous_value'].str.contains("|",regex=False)
dataset = dataset[~condition_mask]
dataset = dataset.reset_index(drop=True)

In [8]:
display(dataset)

Unnamed: 0.1,Unnamed: 0,currency,event,impact,previous_value,forecast_value,actual_value
0,12,JPY,Final Manufacturing PMI,0,49.7,49.7,50.0
1,13,CNY,Caixin Manufacturing PMI,0,54.9,54.7,53.0
2,14,AUD,Commodity Prices y/y,0,2.5%,2.5%,11.7%
3,15,EUR,Spanish Manufacturing PMI,0,49.8,52.6,51.0
4,16,CHF,Manufacturing PMI,0,55.2,54.4,58.0
...,...,...,...,...,...,...,...
11696,15177,JPY,Prelim Machine Tool Orders y/y,0,17.1%,17.1%,5.5%
11697,15179,USD,NFIB Small Business Index,0,89.5,89.5,89.9
11698,15180,USD,Prelim Nonfarm Productivity q/q,0,-7.3%,-4.6%,-4.6%
11699,15181,USD,Prelim Unit Labor Costs q/q,0,12.6%,9.4%,10.8%


In [9]:
dataset['previous_value'] = dataset['previous_value'].apply(remove_non_numeric)
dataset['forecast_value'] = dataset['forecast_value'].apply(remove_non_numeric)
dataset['actual_value'] = dataset['actual_value'].apply(remove_non_numeric)

In [10]:
X = dataset.drop(columns=["impact"])
y = dataset["impact"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Split the data into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5,random_state=42)

In [11]:
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
X_test, y_test = X_test.reset_index(drop=True), y_test.reset_index(drop=True)
X_val, y_val = X_val.reset_index(drop=True), y_val.reset_index(drop=True)

In [13]:
display(y_train)

0       0
1       0
2       0
3       2
4       1
       ..
9355    0
9356    0
9357    0
9358    0
9359    0
Name: impact, Length: 9360, dtype: int64

## **Data Loader**

In [14]:
# Define a custom dataset
class FinancialNewsData(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = str(self.X.iloc[idx]['event'])
        target = self.y.iloc[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': target
        }


## **Model Definition**

In [16]:
# Define your custom model architecture

class FinancialNewsImpactPredictor(nn.Module, PyTorchModelHubMixin):
    def __init__(self, num_classes=3):
      super(FinancialNewsImpactPredictor,self).__init__()
      config = RobertaConfig()
      config.num_labels = num_classes
      self.roberta = RobertaForSequenceClassification(config=config)

    def forward(self, input_ids, attention_mask):
      output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
      return torch.softmax(output.logits, dim=1)


## **Training**

### Set tokenizer and dataset

In [17]:
# Select hyperparameters
NUM_EPOCHS = 3
LEARNING_RATE = 1e-5
BATCH_SIZE = 16

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

train_dataset = FinancialNewsData(X_train, y_train, tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = FinancialNewsData(X_val, y_val, tokenizer, max_length=128)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

### Set model

In [18]:
model = FinancialNewsImpactPredictor()

### Set trainer

In [19]:
# Select criterion and optimizers
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(),lr=LEARNING_RATE)

### **Train**

### Train and evaluation functions

In [20]:
def train(model, train_loader, criterion, optimizer, device, pbar):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Update the tqdm progress bar
        pbar.update(1)
        pbar.set_postfix({'loss': loss.item()})

    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    total_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    return total_loss / len(val_loader), total_correct / total_samples


### Traning

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(NUM_EPOCHS):
  with tqdm(total=len(train_loader), desc=f'Epoch {epoch+1}/{NUM_EPOCHS}', unit='batch') as pbar:
    train_loss = train(model, train_loader, criterion, optimizer, device, pbar)

  val_loss, val_acc = evaluate(model, val_loader, criterion, device)
  print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

Epoch 1/3: 100%|██████████| 585/585 [15:36<00:00,  1.60s/batch, loss=0.989]


Epoch 1/3, Train Loss: 0.7865, Val Loss: 0.7793, Val Acc: 0.7771


Epoch 2/3: 100%|██████████| 585/585 [16:28<00:00,  1.69s/batch, loss=0.989]  


Epoch 2/3, Train Loss: 0.7854, Val Loss: 0.7756, Val Acc: 0.7771


Epoch 3/3: 100%|██████████| 585/585 [15:16<00:00,  1.57s/batch, loss=0.739]


Epoch 3/3, Train Loss: 0.7853, Val Loss: 0.7719, Val Acc: 0.7771


## **Evaluate**

In [22]:
test_dataset = FinancialNewsData(X_test, y_test, tokenizer, max_length=128)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loss, test_acc = evaluate(model, val_loader, criterion, device)
print(f"Loss: {test_loss} Accuracy: {test_acc}")

Loss: 0.7719000376559593 Accuracy: 0.7771135781383433


## **Save Model**

In [62]:
filename = Path('models')
filename.mkdir(parents=True,exist_ok=True)

model_name='roberta-financial-news-impact-analysis.bin' # model name


# saving path

saving_path = filename/model_name
print(saving_path)
torch.save(obj=model.state_dict(),f=saving_path)

models/roberta-financial-news-impact-analysis.bin


## **Make Inference**

In [14]:
label_mapping = {
    0: "Low",
    1: "Medium",
    2: "High"
}

### From local saved

In [20]:
model_path = "./models/model_v1/pytorch_model.bin"
model = FinancialNewsImpactPredictor()
model.load_state_dict(torch.load(model_path))

input_text = "German Buba President Nagel Speaks"
encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
input_ids =  encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)

output = model(input_ids,attention_mask)
predicted_class_index = torch.argmax(output)
predicted_label = label_mapping[predicted_class_index.item()]
print("Predicted Impact:", predicted_label)

Predicted Impact: Low


### From HuggingFace

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import RobertaTokenizerFast
import torch


MODEL = "nusret35/roberta-financial-news-impact-analysis"
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

input_text = "German Buba President Nagel Speaks"
encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
input_ids =  encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)

output = model(input_ids,attention_mask)
predicted_class_index = torch.argmax(output.logits)
predicted_label = label_mapping[predicted_class_index.item()]
print("Predicted Impact:", predicted_label)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at nusretkizilaslan/roberta-financial-news-impact-analysis and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.word_embeddings.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.va

Predicted Impact: Low
