### Data Processing

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from glob import glob
from torch.utils.data import DataLoader, Dataset
import torch

In [2]:
table_names = ['income statement', 'balance sheet', 'cash flow']
texts_lst = []
tags_lst = []
for path in glob(r"C:\Users\Manohar\Desktop\Projects\Finance-Extraction\data\current\*\*\tagged_tables.xlsx"):
    for t_name in table_names:
        try:
            df = pd.read_excel(path, sheet_name=t_name)
        except ValueError:
            continue
        df = df.dropna(subset=["Unnamed: 0"])
        tags = [""] + df['Tag'].fillna("").to_list()
        if len("".join(tags).strip()) == 0:
            continue
        texts = []
        for _t in df['Unnamed: 0']:
            texts.append(" ".join(_t.split()[:10]))
        texts_lst.append([t_name] + texts)
        tags_lst.append(tags)

In [3]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [4]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
_d = []
for ls in tags_lst:
    _d += ls
label_encoder = label_encoder.fit(_d)

In [5]:
def get_tokenized_data(text_lst, tags, max_length):
    tokenizer_output = {
        'input_ids': [101], 
        'token_type_ids': [0], 
        'attention_mask': [1],
        'labels': [0]
    }
    for i, text in enumerate(text_lst):
        tok_out = tokenizer(text)
        tokenizer_output['input_ids'] += tok_out['input_ids'][1:-1]
        tokenizer_output['token_type_ids'] += tok_out['token_type_ids'][1:-1]
        tokenizer_output['attention_mask'] += tok_out['attention_mask'][1:-1]
        tokenizer_output['labels'] += list(label_encoder.transform([tags[i]])) * len(tok_out['input_ids'][1:-1])
    
    tokenizer_output['input_ids'].append(102)
    tokenizer_output['token_type_ids'].append(0)
    tokenizer_output['attention_mask'].append(1)
    tokenizer_output['labels'].append(0)

    # padding
    tokens_ln = len(tokenizer_output['input_ids'])
    tokenizer_output['input_ids'] += [0] * (max_length-tokens_ln)
    tokenizer_output['token_type_ids'] += [0] * (max_length-tokens_ln)
    tokenizer_output['attention_mask'] += [0] * (max_length-tokens_ln)
    tokenizer_output['labels'] += [-100] * (max_length-tokens_ln)

    # changing datapoint
    tokenizer_output['input_ids'] = torch.tensor(tokenizer_output['input_ids'])
    tokenizer_output['token_type_ids'] = torch.tensor(tokenizer_output['token_type_ids'])
    tokenizer_output['attention_mask'] = torch.tensor(tokenizer_output['attention_mask'])
    tokenizer_output['labels'] = torch.tensor(tokenizer_output['labels'])

    return tokenizer_output

In [7]:
max_length = 3000
data = []
for i in range(len(texts_lst)):
    data.append(get_tokenized_data(texts_lst[i], tags_lst[i], max_length))

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [8]:
dataset = CustomDataset(data)

In [9]:
len(dataset)

400

### Model Building and Training

In [10]:
import torch
from torch import nn, optim
import pytorch_lightning as pl

In [11]:
class NERModel(pl.LightningModule):
    def __init__(self, embedding_model, num_labels):
        super().__init__()
        self.training_step_outputs = []
        self.embed = embedding_model
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(768, 256, num_layers=2, bidirectional=True)
        self.linear4 = nn.Linear(768, num_labels)
    
    def forward(self, tokens):
        with torch.no_grad():
            outputs = self.embed(**tokens)
        embeddings = outputs.last_hidden_state
        output, _ = self.lstm(embeddings)
        output = self.dropout(output)
        linear4_output = self.linear4(output)
        return linear4_output
    
    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.  
        # it is independent of forward
        x = batch
        y = x.pop('labels')
        y = y.view(-1)
        logits = self.forward(x)
        logits = logits.view(-1, logits.size(2))
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_function(logits, y)
        self.log("train_loss", loss)
        self.training_step_outputs.append(loss)
        return loss

 
    def on_train_epoch_end(self):
        # Log the average loss per epoch
        avg_loss = torch.stack(self.training_step_outputs).mean()
        self.log('avg_train_loss', avg_loss, on_step=False, on_epoch=True)
        self.training_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [12]:
dataloader = DataLoader(dataset, batch_size=1)
a = next(iter(dataloader))

In [13]:
a

{'input_ids': tensor([[ 101, 3318, 4861,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[   0,    0,    0,  ..., -100, -100, -100]])}

In [20]:
a = {key: tensor.to(device) for key, tensor in a.items()}

In [21]:
model(a)

RuntimeError: The size of tensor a (3000) must match the size of tensor b (512) at non-singleton dimension 1

In [14]:
device = "cuda"
num_labels = len(label_encoder.classes_)
embedding_model = BertModel.from_pretrained("bert-base-uncased")
model = NERModel(embedding_model, num_labels)
model = model.to(device)
print("Num params: ", sum(p.numel() for p in model.parameters()))
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Num params:  113342701


NERModel(
  (embed): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [186]:
len(dataset)

400

In [187]:
batch_size = 120
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
trainer = pl.Trainer(max_epochs=50)
trainer.fit(model=model, train_dataloaders=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name    | Type      | Params
--------------------------------------
0 | embed   | BertModel | 109 M 
1 | dropout | Dropout   | 0     
2 | lstm    | LSTM      | 3.7 M 
3 | linear4 | Linear    | 182 K 
--------------------------------------
113 M     Trainable params
0         Non-trainable params
113 M     Total params
453.371   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

AttributeError: 'list' object has no attribute 'view'