In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from torch.nn.functional import cross_entropy
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import logging
from torcheval.metrics.functional import multiclass_f1_score, multiclass_confusion_matrix
from copy import deepcopy, copy
import seaborn as sns
import pandas as pd
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader
from collections import defaultdict, deque

import os 
# Move up one directory at a time until the working directory path no longer
# contains 'notebooks'. Presumably this puts the repository root on the import
# path so the `src` package imported below resolves -- TODO confirm.
# NOTE: this changes the process-wide working directory for all later cells.
while 'notebooks' in os.getcwd():
    os.chdir("..")

import re
from typing import List, Dict
    
from src.preprocessing.laser.laser_processor import LaserProcessor
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
# Show the Hugging Face Hub login widget; the training cell later calls
# `push_to_hub`, which needs an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Create the log directory idempotently. The previous `!mkdir logs` shell call
# printed "File exists" on every re-run and depended on the host shell.
os.makedirs("logs", exist_ok=True)
model_name = "LASER-CLF"
dataset_name = "FUNSD"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Route INFO-level (and above) log records to logs/<model>_<dataset>.log.
logging.basicConfig(filename=f'logs/{model_name}_{dataset_name}.log', encoding='utf-8', level= logging.INFO)

mkdir: cannot create directory ‘logs’: File exists


## Training binary classifier modules

In [5]:

# Load the tokenizer and a two-label sequence-classification model from the
# same "gpt2" checkpoint.
gpt2_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(gpt2_checkpoint, num_labels=2)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load the dataset published on the Hugging Face Hub under "nielsr/funsd".
dataset = load_dataset("nielsr/funsd")

In [7]:
# Keep only the 'train' split; later cells build their examples from it.
train_dataset = dataset['train']

In [8]:
# Wrap the training split in the project's LaserProcessor (no tokenizer is
# passed). Later cells index it as `laser_data[i]` and unpack a pair `(X, y)`
# of space-separated strings.
laser_data = LaserProcessor(
    train_dataset,
    tokenizer=None
)

  0%|          | 0/149 [00:00<?, ?it/s]

100%|██████████| 149/149 [00:25<00:00,  5.93it/s]


In [9]:
# Output-sequence tokens treated as markup/labels rather than words; the
# training and evaluation loops assign class 0 to them.
special_chars = {"[B]", "[E]", "[T]", "QUESTION", "ANSWER", "NONE", "HEADER"}

In [14]:
loss_fn = torch.nn.CrossEntropyLoss()
# Pick the device from availability instead of hard-coding "cuda", so the cell
# also runs on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
).to(device)

# GPT-2 ships without a padding token; register one plus the three markup tokens.
tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
})

tokenizer.add_tokens([
    "[B]",
    "[E]",
    "[T]"
])

model.resize_token_embeddings(len(tokenizer))
# The GPT-2 classification head pools the hidden state of the last non-padding
# token, which it can only locate when `pad_token_id` is set on the config.
# Without it the logits are read from the final (right-padded) position, i.e.
# from a [PAD] slot.
model.config.pad_token_id = tokenizer.pad_token_id

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5
)

# Class ids:
# 0 -> not in src
# 1 -> in src

n_shots_train = 7
n_shots_val = 10
n_epochs = 2
context_window = 16   # number of most recent words kept as model input
max_tokens = 32       # tokenizer padding / truncation length
best_f1 = 0
best_model = None


def _iter_steps(source_text, target_text):
    """Yield one `(context_text, label)` pair per token of `target_text`.

    The context starts as the last `context_window` words of `source_text`;
    after each yielded step the next target token is appended to it. The label
    is 0 on the first step and whenever the most recently appended token is in
    `special_chars`, and 1 otherwise.
    """
    remaining = deque(target_text.split(' '))
    context = deque(source_text.split(' '), maxlen=context_window)
    last_token = None
    while remaining:
        label = 0 if (last_token is None or last_token in special_chars) else 1
        yield ' '.join(context), label
        last_token = remaining.popleft()
        context.append(last_token)


def _encode(text):
    """Tokenize `text` to a fixed length and move the tensors to `device`."""
    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=max_tokens,
    )
    return {key: value.to(device) for key, value in encoded.items()}


def _evaluate(doc_indices):
    """Run `model` over the given documents; return `(y_pred, y_true)` int lists."""
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for doc_idx in doc_indices:
            X, y = laser_data[doc_idx]
            for text, label in _iter_steps(X, y):
                logits = model(**_encode(text)).logits.squeeze()
                preds.append(logits.argmax().item())
                targets.append(label)
    return preds, targets


val_indices = range(n_shots_train, n_shots_train + n_shots_val)

for epoch in range(n_epochs):
    for train_idx in tqdm(range(n_shots_train)):
        X, y = laser_data[train_idx]

        # `from_pretrained` returns the model in eval mode and `_evaluate`
        # switches back to it, so training mode is re-enabled for every document.
        model.train()
        for text, label in _iter_steps(X, y):
            labels = torch.tensor(label, device=device)

            optimizer.zero_grad()
            model_output = model(**_encode(text)).logits.squeeze()
            loss = loss_fn(model_output, labels)
            loss.backward()
            optimizer.step()

        # Validate after every training document and keep the best checkpoint.
        y_pred, y_true = _evaluate(val_indices)
        pred_tensor = torch.tensor(y_pred, dtype=torch.int64)
        true_tensor = torch.tensor(y_true, dtype=torch.int64)

        f1 = multiclass_f1_score(pred_tensor, true_tensor, num_classes=2)

        if f1 > best_f1:
            best_f1 = f1
            best_model = deepcopy(model)

        logging.info(f"Validation F1 - score: {f1}")
        logging.info(
            multiclass_confusion_matrix(
                pred_tensor,
                true_tensor,
                num_classes=2
            )
        )

# Fail with a clear message instead of an AttributeError on `None` when no
# validation pass ever scored above zero.
if best_model is None:
    raise RuntimeError("No checkpoint reached a validation F1 above 0; nothing to push to the Hub.")
best_model.push_to_hub("peulsilva/LASER-CLF-GPT")

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
 14%|█▍        | 1/7 [00:31<03:06, 31.08s/it]


KeyboardInterrupt: 

In [10]:
# Download the fine-tuned two-label classifier from the Hub and place it on
# the active device.
# NOTE(review): the training cell pushes to "peulsilva/LASER-CLF-GPT", while
# this loads "peulsilva/LASER-CLF" -- confirm this is the intended checkpoint
# and that the notebook's tokenizer matches it.
trained_model = AutoModelForSequenceClassification.from_pretrained(
    "peulsilva/LASER-CLF", num_labels=2
)
trained_model = trained_model.to(device)

Downloading config.json:   0%|          | 0.00/566 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [23]:
# validation 
n_shots_train = 7
n_shots_val = 149 - n_shots_train
tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
})

tokenizer.add_tokens([
    "[B]",
    "[E]",
    "[T]"
])

trained_model.resize_token_embeddings(len(tokenizer))
with torch.no_grad():
    y_pred = []
    y_true = []
    for i in tqdm(range(n_shots_train, n_shots_train + n_shots_val)):
        X, y = laser_data[i]
        out_stack = deque(y.split(' '))
        in_stack = deque(X.split(' '), maxlen=3)

        next_token = None
        while len(out_stack) >0:
            input = tokenizer(
                ' '.join(in_stack),
                truncation= True,
                padding= "max_length",
                return_tensors= "pt",
                max_length=3,
            )

            for k, v in input.items():
                input[k] = v.to(device)

            if next_token is None:
                labels = torch.tensor(0).to(device)
            elif next_token in special_chars:
                labels = torch.tensor(0).to(device)
            
            else:
                labels= torch.tensor(1).to(device)
            
            model_output = trained_model(**input).logits.squeeze()
            y_pred.append(model_output.argmax().item())
            y_true.append(labels)

            next_token = out_stack.popleft()
            in_stack.append(next_token) 

    f1 = multiclass_f1_score(torch.Tensor(y_pred), torch.Tensor(y_true), num_classes=2)
    logging.info(f"Validation F1 - score: {f1}")
    logging.info(
        multiclass_confusion_matrix(
            torch.Tensor(y_pred,).to(torch.int64), 
            torch.Tensor(y_true).to(torch.int64), 
            num_classes=2
        )
    )


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 30525. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
  0%|          | 0/142 [00:00<?, ?it/s]

100%|██████████| 142/142 [01:18<00:00,  1.82it/s]


In [24]:
# Recompute the evaluation metrics from the predictions and labels collected
# by the evaluation loop.
pred_tensor = torch.Tensor(y_pred)
true_tensor = torch.Tensor(y_true)

f1 = multiclass_f1_score(pred_tensor, true_tensor, num_classes=2)
# The confusion matrix is computed on integer class ids.
conf = multiclass_confusion_matrix(
    pred_tensor.to(torch.int64),
    true_tensor.to(torch.int64),
    num_classes=2,
)

In [25]:
# Display the F1 score as this cell's output.
f1

tensor(0.7519)