In [1]:
import logging

from transformers import AutoTokenizer
import json
import tqdm
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
import logging
from transformers import BartTokenizer, BartModel
from sklearn.preprocessing import LabelBinarizer
from pytorch_lightning import Trainer
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#handling html data
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

  rank_zero_deprecation(


In [2]:
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.ERROR)

In [3]:
base_path = '/home/chaofeng/Documents/vscode/finsim/data/kg'
trainfile = base_path + "/train.json"
testfile = base_path + '/test.json'
raw_datasets = pd.read_json(trainfile)
test_datasets = pd.read_json(testfile)

In [4]:
mlb = LabelBinarizer()
y = raw_datasets.label.tolist()
yt = mlb.fit_transform(y)
x = raw_datasets.term.tolist()
x_train,x_test,y_train,y_test = train_test_split(x, yt , test_size=0.1, random_state=42,shuffle=True)

x_tr,x_val,y_tr,y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42,shuffle=True)
testset = test_datasets.term.tolist()

In [5]:
class FinsimDataset (Dataset):
    def __init__(self,quest,tags, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = quest
        self.labels = tags
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item_idx):
        text = self.text[item_idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True, # Add [CLS] [SEP]
            max_length= self.max_len,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True, # Differentiates padded vs normal token
            truncation=True, # Truncate data beyond max length
            return_tensors = 'pt' # PyTorch Tensor format
          )
        
        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()
        #token_type_ids = inputs["token_type_ids"]
        
        return {
            'input_ids': input_ids ,
            'attention_mask': attn_mask,
            'label': torch.tensor(self.labels[item_idx], dtype=torch.float)
            
        }

In [6]:
class FinsimDataModule (pl.LightningDataModule):    
    def __init__(self,x_tr,y_tr,x_val,y_val,x_test,y_test,tokenizer,batch_size=16,max_token_len=200):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self):
        self.train_dataset = FinsimDataset(quest=self.tr_text,  tags=self.tr_label,tokenizer=self.tokenizer,max_len= self.max_token_len)
        self.val_dataset= FinsimDataset(quest=self.val_text, tags=self.val_label,tokenizer=self.tokenizer,max_len = self.max_token_len)
        self.test_dataset =FinsimDataset(quest=self.test_text, tags=self.test_label,tokenizer=self.tokenizer,max_len = self.max_token_len)
        
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size= self.batch_size, shuffle = True , num_workers=4)

    def val_dataloader(self):
        return DataLoader (self.val_dataset,batch_size= 16)

    def test_dataloader(self):
        return DataLoader (self.test_dataset,batch_size= 16)

In [7]:

BERT_MODEL_NAME = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
N_EPOCHS = 20
BATCH_SIZE = 32
MAX_LEN = 100
LR = 2e-04
Bert_tokenizer = tokenizer
datamodule = FinsimDataModule(x_tr,y_tr,x_val,y_val,x_test,y_test,Bert_tokenizer,BATCH_SIZE,MAX_LEN)
datamodule.setup()


In [8]:
class FinsimClassifier(pl.LightningModule):
    # Set up the classifier
    def __init__(self,n_classes=17,steps_per_epoch=None,n_epochs=3, lr=2e-5):
        super().__init__()

        self.bert=BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        # self.bert = BartModel.from_pretrained('facebook/bart-large', return_dict=True)
        # self.bilstm = nn.LSTM(self.bert.config.hidden_size, 256, bidirectional=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self,input_ids, attn_mask):
        output = self.bert(input_ids=input_ids,attention_mask=attn_mask)   
        # output, _ = self.bilstm(output.last_hidden_state)
        # avg_pool = torch.mean(output, 1)
        output = self.classifier(output.pooler_output)    
        return output

    def training_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('train_loss',loss , prog_bar=True,logger=True)
        
        return {"loss" :loss, "predictions":outputs, "labels": labels }


    def validation_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('val_loss',loss , prog_bar=True,logger=True)        
        return loss

    def test_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('test_loss',loss , prog_bar=True,logger=True)
        
        return loss
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters() , lr=self.lr)
        warmup_steps = self.steps_per_epoch//3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)

        return [optimizer], [scheduler]

In [9]:
steps_per_epoch = len(x_tr)//BATCH_SIZE
model = FinsimClassifier(n_classes=17, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',# monitored quantity
    filename='QTag-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3, #  save the top 3 models
    mode='min', # mode of the monitored quantity  for optimization
)

trainer = Trainer(max_epochs = N_EPOCHS , gpus = 1, callbacks=[checkpoint_callback],progress_bar_refresh_rate = 30)
trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 109 M 
1 | classifier | Linear            | 13.1 K
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.981   Total estimated model params size (MB)
  rank_zero_warn(
Epoch 0:  83%|████████▎ | 30/36 [00:04<00:00,  6.63it/s]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/12 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 36/36 [00:04<00:00,  7.40it/s, loss=0.754, v_num=73, val_loss=0.756, train_loss=0.751]
Epoch 1:  83%|████████▎ | 30/36 [00:04<00:00,  6.72it/s, loss=0.754, v_num=73, val_loss=0.756, train_loss=0.751]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/12 [00:00<?, ?it/s][A
E

In [10]:
model_path = checkpoint_callback.best_model_path
print(model_path)

trainer.test(model,datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/chaofeng/Documents/vscode/Finsim-1/models/lightning_logs/version_73/checkpoints/QTag-epoch=15-val_loss=0.04.ckpt
  rank_zero_warn(
Testing: 100%|██████████| 7/7 [00:00<00:00, 37.02it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.04577893391251564}
--------------------------------------------------------------------------------


[{'test_loss': 0.04577893391251564}]

In [11]:
model_path

'/home/chaofeng/Documents/vscode/Finsim-1/models/lightning_logs/version_73/checkpoints/QTag-epoch=15-val_loss=0.04.ckpt'

In [12]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

Reusing TensorBoard on port 6006 (pid 9228), started 10:15:49 ago. (Use '!kill 9228' to kill it.)

In [13]:
model_path = checkpoint_callback.best_model_path
model_path

'/home/chaofeng/Documents/vscode/Finsim-1/models/lightning_logs/version_73/checkpoints/QTag-epoch=15-val_loss=0.04.ckpt'

In [14]:
len(y_test), len(x_test)

(105, 105)

In [15]:
from torch.utils.data import TensorDataset

# Tokenize all questions in x_test
input_ids = []
attention_masks = []


for quest in x_test:
    encoded_quest =  Bert_tokenizer.encode_plus(
                    quest,
                    None,
                    add_special_tokens=True,
                    max_length= MAX_LEN,
                    padding = 'max_length',
                    return_token_type_ids= False,
                    return_attention_mask= True,
                    truncation=True,
                    return_tensors = 'pt'      
    )
    
    # Add the input_ids from encoded question to the list.    
    input_ids.append(encoded_quest['input_ids'])
    # Add its attention mask 
    attention_masks.append(encoded_quest['attention_mask'])
    
# Now convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_test)

# Set the batch size.  
TEST_BATCH_SIZE = 64  

# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)

In [16]:
flat_pred_outs = 0
flat_true_labels = 0

In [17]:
model = model.to('cuda') # moving model to cuda
model.eval()

FinsimClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [18]:
pred_outs, true_labels = [], []

In [19]:
device = 'cuda'

In [20]:
for batch in pred_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
    # Unpack the inputs from our dataloader
    b_input_ids, b_attn_mask, b_labels = batch
 
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        pred_out = model(b_input_ids,b_attn_mask)
        pred_out = torch.sigmoid(pred_out)
        # Move predicted output and labels to CPU
        pred_out = pred_out.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #i+=1
        # Store predictions and true labels
        #print(i)
        #print(outputs)
        #print(logits)
        #print(label_ids)
    pred_outs.append(pred_out)
    true_labels.append(label_ids)

In [21]:
pred_outs[0][0]

array([0.00229858, 0.00220319, 0.00157888, 0.00239198, 0.00238328,
       0.991137  , 0.00234041, 0.00328559, 0.00288465, 0.00237044,
       0.00283643, 0.0017684 , 0.0039663 , 0.00224881, 0.00256941,
       0.00202786, 0.00247595], dtype=float32)

In [22]:
# Combine the results across all batches. 
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [23]:
flat_pred_outs.shape , flat_true_labels.shape

((105, 17), (105, 17))

In [24]:
#define candidate threshold values
threshold  = np.arange(0.4,0.51,0.01)
threshold

array([0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 ])

In [25]:
def classify(pred_prob,thresh):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1) # Infer tag value as 1 (present)
            else:
                temp.append(0) # Infer tag value as 0 (absent)
        y_pred.append(temp)

    return y_pred

In [26]:
flat_pred_outs[3]

array([0.00232111, 0.00217112, 0.00156825, 0.00239126, 0.0023779 ,
       0.99104965, 0.0023092 , 0.00334024, 0.00282741, 0.00233517,
       0.00280103, 0.00175893, 0.00385609, 0.00221766, 0.00252874,
       0.00202016, 0.00252985], dtype=float32)

In [27]:
flat_true_labels[3]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [28]:
from sklearn import metrics
scores=[] # Store the list of f1 scores for prediction on each threshold

#convert labels to 1D array
y_true = flat_true_labels.ravel() 

for thresh in threshold:
    
    #classes for each threshold
    pred_bin_label = classify(flat_pred_outs,thresh) 

    #convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    scores.append(metrics.f1_score(y_true,y_pred))

In [29]:
opt_thresh = threshold[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

Optimal Threshold Value = 0.4800000000000001


In [30]:
#predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel() # Flatten

In [31]:
print(metrics.classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1680
           1       0.95      0.88      0.91       105

    accuracy                           0.99      1785
   macro avg       0.97      0.94      0.95      1785
weighted avg       0.99      0.99      0.99      1785



In [32]:
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

df = pd.DataFrame({'Body':x_test,'Actual Tags':y_act,'Predicted Tags':y_pred})

In [33]:

df.sample(10)

Unnamed: 0,Body,Actual Tags,Predicted Tags
76,S&P Global Inc. US-NY,Stock Corporation,Stock Corporation
71,Caja de Valores S.A.,Central Securities Depository,Central Securities Depository
83,Dow Jones Asian Titans 50 Index,Equity Index,Equity Index
10,Latin America Corporate Bond,Credit Index,Bonds
97,North American Corporate,Credit Index,Credit Index
51,MSCI EMU IMI U.S. Dollar Hedged Index,Equity Index,Equity Index
16,Andorran Financial Authority,Regulatory Agency,Regulatory Agency
33,Barbados Central Securities Depository Inc.,Central Securities Depository,Central Securities Depository
67,Herausgebergemeinschaft Wertpapier-Mitteilunge...,Stock Corporation,Bonds
79,STOXX Europe Select Dividend 30 Index,Equity Index,Equity Index


In [34]:
QTmodel = FinsimClassifier.load_from_checkpoint(model_path)
QTmodel.eval()

FinsimClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [35]:
def predict(question):
    text_enc = Bert_tokenizer.encode_plus(
            question,
            None,
            add_special_tokens=True,
            max_length= MAX_LEN,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True,
            truncation=True,
            return_tensors = 'pt'      
    )
    outputs = QTmodel(text_enc['input_ids'], text_enc['attention_mask'])
    pred_out = outputs[0].detach().numpy()
    #print(f'Outputs = {outputs}')
    #print(f'Type = {type(outputs)}')
    #print(f'Pred Outputs = {pred_out}')
    #print(f'Type = {type(pred_out)}')
    #preds = np.round(pred_out)
    preds = [(pred > opt_thresh) for pred in pred_out ]
    #pred_list = [ round(pred) for pred in pred_logits ]
    preds = np.asarray(preds)
    #print(f'Predictions = {preds}')
    #print(f'Type = {type(preds)}')
    #print(mlb.classes_)
    new_preds = preds.reshape(1,-1).astype(int)
    #print(new_preds)
    pred_tags = mlb.inverse_transform(new_preds)
    #print(mlb.inverse_transform(np.array(new_preds)))
    return pred_tags

In [1]:
outputs = []
par = tqdm.tqdm(total=len(testset), ncols=100)
with open(base_path+'/finbert1.json', 'w') as f:    
    for i, term in enumerate(testset):
        output = {}
        par.update(1)
        output['term'] = term
        # output['label'] = y[i]
        output['predicted_labels'] = [predict(term)[0]]
        outputs.append(output)
    json.dump(outputs, f, indent=4)

NameError: name 'tqdm' is not defined

In [41]:
par.close()

100%|███████████████████████████████████████████████████████████| 1050/1050 [02:41<00:00,  6.51it/s]


In [38]:
#testset