### Data Processing

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from glob import glob
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np

In [2]:
table_names = ['income statement', 'balance sheet', 'cash flow']
texts_lst = []
tags_lst = []
max_len_variable = 10
for path in glob(r"C:\Users\Manohar\Desktop\Projects\Finance-Extraction\data\current\*\*\tagged_tables.xlsx"):
    for t_name in table_names:
        try:
            df = pd.read_excel(path, sheet_name=t_name)
        except ValueError:
            continue
        df = df.dropna(subset=["Unnamed: 0"])
        tags = [""] + df['Tag'].fillna("").to_list()
        if len("".join(tags).strip()) == 0:
            continue
        texts = []
        for _t in df['Unnamed: 0']:
            texts.append(" ".join(_t.split()[:max_len_variable]))
        texts_lst.append([t_name] + texts)
        tags_lst.append(tags)

In [3]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [4]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
_d = []
for ls in tags_lst:
    _d += ls
label_encoder = label_encoder.fit(_d)

In [5]:
def get_tokenized_data(text_lst, tags, max_length, max_variables_count):
    tokenizer_output = {
        'variables': [],
        'tags': torch.tensor(label_encoder.transform([""] + tags + [""]))
    }
    for i, text in enumerate(["<start>"] + text_lst + ["<end>"]):
        tok_out = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, 
                            return_tensors="pt")
        for key in tok_out:
            tok_out[key] = tok_out[key].flatten()
        tokenizer_output['variables'].append(tok_out)
    
    # making all tables of same length
    for _ in range(len(tokenizer_output['variables']), max_variables_count):
        tok_out = tokenizer("", padding='max_length', truncation=True, max_length=max_length, 
                            return_tensors="pt")
        for key in tok_out:
            tok_out[key] = tok_out[key].flatten()
        tokenizer_output['variables'].append(tok_out)
        tokenizer_output['tags'] = torch.cat((tokenizer_output['tags'], torch.tensor([0])))
    
    return tokenizer_output

In [6]:
class CustomDataset(Dataset):
    def __init__(self, texts_lst, tags_lst, max_len_variable, max_variables_count):
        self.max_variables_count = max_variables_count
        self.texts_lst = texts_lst
        self.tags_lst = tags_lst
        self.max_len_variable = max_len_variable
    
    def __len__(self):
        return len(self.texts_lst)
    
    def __getitem__(self, idx):
        data = get_tokenized_data(self.texts_lst[idx], self.tags_lst[idx], 
                                  self.max_len_variable, self.max_variables_count)
        return data

In [7]:
dataset = CustomDataset(texts_lst, tags_lst, max_len_variable, max_variables_count=800)

In [8]:
len(dataset)

400

In [9]:
len(dataset[127]['tags'])

800

### Model Building and Training

In [10]:
import torch
from torch import nn, optim
import pytorch_lightning as pl

In [11]:
class SentenceTransformer(nn.Module):
    def __init__(self, embedding_model):
        super().__init__()
        self.embed = embedding_model
        self.dropout = nn.Dropout(0.1)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(7680, 2560)
        self.linear2 = nn.Linear(2560, 768)
    
    def forward(self, tokens):
        with torch.no_grad():
            output = self.embed(**tokens)[0]
        output = self.flatten(output)
        self.dropout = nn.Dropout(0.1)
        output = self.linear1(output)
        self.dropout = nn.Dropout(0.1)
        output = self.linear2(output)
        return output
    

class TaggerModel(pl.LightningModule):
    def __init__(self, embedding_model, num_labels):
        super().__init__()
        self.training_step_outputs = []
        self.sentence_embedding = SentenceTransformer(embedding_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=768, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(768, 256, num_layers=2, bidirectional=True)
        self.linear1 = nn.Linear(512, 256)
        self.linear2 = nn.Linear(256, num_labels)
        
    def forward(self, x):
        embeddings = []
        for sen in x:
            embeddings.append(self.sentence_embedding(sen))
        embeddings = torch.stack(embeddings)
        output = self.transformer_encoder(embeddings)
        output = output.permute(1,0,2)
        output, _ = self.lstm(output)
        output = self.dropout(output)
        output = self.linear1(output)
        output = self.linear2(output)
        return output
    
    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.  
        # it is independent of forward
        x = batch
        y = x.pop('tags')
        y = y.view(-1)
        logits = self.forward(x['variables'])
        logits = logits.view(-1, logits.size(2))
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_function(logits, y)
        self.log("train_loss", loss)
        self.training_step_outputs.append(loss)
        return loss

    def on_train_epoch_end(self):
        # Log the average loss per epoch
        avg_loss = torch.stack(self.training_step_outputs).mean()
        self.log('avg_train_loss', avg_loss, on_step=False, on_epoch=True)
        self.training_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [12]:
device = "cuda"
num_labels = len(label_encoder.classes_)
embedding_model = BertModel.from_pretrained("bert-base-uncased")
model = TaggerModel(embedding_model, num_labels)
model = model.to(device)
print("Num params: ", sum(p.numel() for p in model.parameters()))
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Num params:  168066797


TaggerModel(
  (sentence_embedding): SentenceTransformer(
    (embed): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=T

In [13]:
len(dataset)

400

In [14]:
batch_size = 10
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
trainer = pl.Trainer(max_epochs=50, log_every_n_steps=1)
trainer.fit(model=model, train_dataloaders=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type                | Params
------------------------------------------------------------
0 | sentence_embedding  | SentenceTransformer | 131 M 
1 | transformer_encoder | TransformerEncoder  | 33.1 M
2 | dropout             | Dropout             | 0     
3 | lstm                | LSTM                | 3.7 M 
4 | linear1             | Linear              | 131 K 
5 | linear2             | Linear              |

Training: 0it [00:00, ?it/s]

### Testing

In [34]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from tqdm import tqdm

In [29]:
model = TaggerModel.load_from_checkpoint(
    r"C:\Users\Manohar\Desktop\Projects\Finance-Extraction\notebooks\lightning_logs\version_2\checkpoints\epoch=41-step=1680.ckpt",
    embedding_model = embedding_model,
    num_labels = num_labels
).to(device)

In [24]:
model

TaggerModel(
  (sentence_embedding): SentenceTransformer(
    (embed): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=T

In [35]:
# training data
actual = []
predicted = []
batch_size = 10
dataloader = DataLoader(dataset, batch_size=batch_size)

for step, batch in enumerate(tqdm(dataloader)):
    x = [{key: tensor.to(device) for key, tensor in x.items()} for x in batch['variables']]
    y = batch.pop('tags').to(device)
    y = y.view(-1)
    actual += y.tolist()
    with torch.no_grad():
        logits = model(x)
    logits = logits.view(-1, logits.size(2))
    pred = torch.argmax(logits, dim=1)
    predicted += pred.tolist()

100%|██████████| 40/40 [06:37<00:00,  9.93s/it]


In [36]:
f_actual = []
f_predicted = []
for i in range(len(actual)):
    if actual[i] == -100:
        continue
    f_actual.append(actual[i])
    f_predicted.append(predicted[i])

In [38]:
print(classification_report(f_actual, f_predicted, labels=list(range(0,9))))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    312809
           1       0.00      0.00      0.00        98
           2       0.00      0.00      0.00        98
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00        98
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00        21

   micro avg       0.98      1.00      0.99    313146
   macro avg       0.11      0.11      0.11    313146
weighted avg       0.98      1.00      0.99    313146



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Debug

In [None]:
dataloader = DataLoader(dataset, batch_size=2)
a = next(iter(dataloader))

In [None]:
embedding_model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
tokens = a['variables'][0]

In [None]:
tokens['input_ids'].shape

In [None]:
import torch
from torch import nn, optim
import pytorch_lightning as pl

class SentenceTransformer(nn.Module):
    def __init__(self, embedding_model):
        super().__init__()
        self.training_step_outputs = []
        self.embed = embedding_model
        self.dropout = nn.Dropout(0.1)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(7680, 2560)
        self.linear2 = nn.Linear(2560, 768)
    
    def forward(self, tokens):
        with torch.no_grad():
            output = self.embed(**tokens)[0]
        output = self.flatten(output)
        self.dropout = nn.Dropout(0.1)
        output = self.linear1(output)
        self.dropout = nn.Dropout(0.1)
        output = self.linear2(output)
        return output
    

class TaggerModel(pl.LightningModule):
    def __init__(self, embedding_model, num_labels):
        super().__init__()
        self.sentence_embedding = SentenceTransformer(embedding_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=768, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(768, 256, num_layers=2, bidirectional=True)
        self.linear1 = nn.Linear(512, 256)
        self.linear2 = nn.Linear(256, num_labels)
        
    def forward(self, x):
        embeddings = []
        for sen in x:
            embeddings.append(self.sentence_embedding(sen))
        embeddings = torch.stack(embeddings)
        output = self.transformer_encoder(embeddings)
        output = output.permute(1,0,2)
        output, _ = self.lstm(output)
        output = self.dropout(output)
        output = self.linear1(output)
        output = self.linear2(output)
        return output
    
    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.  
        # it is independent of forward
        x = batch
        y = x.pop('tags')
        y = y.view(-1)
        logits = self.forward(x['variables'])
        logits = logits.view(-1, logits.size(2))
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_function(logits, y)
        self.log("train_loss", loss)
        self.training_step_outputs.append(loss)
        return loss

    def on_train_epoch_end(self):
        # Log the average loss per epoch
        avg_loss = torch.stack(self.training_step_outputs).mean()
        self.log('avg_train_loss', avg_loss, on_step=False, on_epoch=True)
        self.training_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [None]:
num_labels = len(label_encoder.classes_)
model = TaggerModel(embedding_model, num_labels)

In [None]:
model(a['variables']).shape

In [None]:
num_labels