In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification,AutoModelForCausalLM
from datasets import load_dataset
import torch
from torch.nn.functional import cross_entropy
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import logging
from torcheval.metrics.functional import multiclass_f1_score, multiclass_confusion_matrix
from copy import deepcopy, copy
import seaborn as sns
import pandas as pd
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader
from collections import defaultdict, deque

import os 
while 'notebooks' in os.getcwd():
    os.chdir("..")

import re
from typing import List, Dict
    
from src.preprocessing.laser.laser_processor import LaserProcessor
import warnings
warnings.filterwarnings("ignore")

## LASER

In [3]:
!mkdir logs
model_name = "LASER"
dataset_name = "FUNSD"
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
logging.basicConfig(filename=f'logs/{model_name}_{dataset_name}.log', encoding='utf-8', level= logging.INFO)

mkdir: cannot create directory ‘logs’: File exists


## Importing model (GPT)


In [4]:
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# clf = AutoModelForSequenceClassification.from_pretrained(
#     "peulsilva/LASER-CLF-GPT", 
#     num_labels=2,
# ).to(device)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir = "/Data/pedro.silva/",padding_side = "left")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir = "/Data/pedro.silva/")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
sum(p.numel() for p in model.parameters())

7241732096

In [7]:
vector = model.transformer.wte.weight

AttributeError: 'MistralForCausalLM' object has no attribute 'transformer'

In [8]:
vector.shape

NameError: name 'vector' is not defined

## Importing FUNSD Dataset

In [9]:
dataset = load_dataset("nielsr/funsd")

Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
train_split = dataset['train']

In [11]:
label_names = train_split\
            .features['ner_tags']\
            .feature\
            .names
label_keymap = {k:v for k,v in enumerate(label_names)} 
label_keymap

{0: 'O',
 1: 'B-HEADER',
 2: 'I-HEADER',
 3: 'B-QUESTION',
 4: 'I-QUESTION',
 5: 'B-ANSWER',
 6: 'I-ANSWER'}

In [12]:
# tokenizer.add_special_tokens({
#     'pad_token': '[PAD]',
# })

tokenizer.add_tokens([
    "[B]",
    "[E]",
    "[T]"
])

tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

Embedding(32003, 4096)

In [13]:
laser_data = LaserProcessor(
    train_split,
    tokenizer=tokenizer
)

  0%|          | 0/149 [00:00<?, ?it/s]

100%|██████████| 149/149 [00:44<00:00,  3.32it/s]


In [14]:
X, y = laser_data[1]

In [15]:
dataloader = DataLoader(
    laser_data[0:100],
)

In [16]:
special_chars = set(["[B]", "[E]", "[T]", "QUESTION", "ANSWER", "NONE", "HEADER"])
tag_ids = []
for special_char in special_chars:
    if special_char[0] != '[':
        special_char = special_char.lower()
        
    special_char_id = tokenizer.vocab[special_char]
    tag_ids.append(special_char_id)

In [17]:
tag_ids

[24115, 17496, 32000, 4983, 8607, 32002, 32001]

In [38]:
embeddings = model\
    .transformer\
    .wte\
    .weight\
    .to(device)

AttributeError: 'DistilBertForMaskedLM' object has no attribute 'transformer'

In [19]:
embeddings

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0013, -0.0267,  0.0183,  ...,  0.0220,  0.0232, -0.0007],
        [ 0.0289,  0.0136,  0.0223,  ..., -0.0047, -0.0088, -0.0342],
        [-0.0078, -0.0201, -0.0052,  ..., -0.0302, -0.0199, -0.0076]],
       requires_grad=True)

In [39]:
def get_predictions_from_list(
    next_word_predictions : torch.Tensor,
    token_ids : List[int],
    probability : float
):
    predictions = []
    for word_id in token_ids:
        predictions.append(
            [next_word_predictions[word_id].item(), word_id],
        )

    predictions_tensor = torch.Tensor(predictions)\
        .to(device)
        # .softmax(dim = 0)

    predictions_tensor[:,0] = predictions_tensor[:,0]\
        .softmax(dim = 0)

    predictions_tensor[:,0] *= probability

    return predictions_tensor

In [40]:
def generate_corrected_predictions(
    predictions_src : torch.Tensor,
    predictions_tag: torch.Tensor,
    next_word_predictions : torch.Tensor
):
    new_logits = torch.zeros_like(next_word_predictions)

    for idx, word_id in enumerate(predictions_src[:, 1]):
        token_id = int(word_id)
        new_logits[token_id] = predictions_src[idx, 0]

    for idx, tag_id in enumerate(predictions_tag[: ,1]):
        token_id = int(tag_id)
        new_logits[token_id] = predictions_tag[idx, 0]

    return new_logits

In [41]:
def generate_label(
    next_word_predictions : torch.Tensor,
    token_label_id : int
):
    label = torch.zeros_like(next_word_predictions)
    label[token_label_id] = 1

    return label

In [42]:
def get_last_n_tokens(
    n:int,
    X: list, 
    generated_text: list,
    use_mask : bool = False
):
    if use_mask:
        all_tokens = (X[0] + " ".join(generated_text) + " " + tokenizer.mask_token).split('-') 
    else:
        all_tokens = (X[0] + " ".join(generated_text)).split('-')
        
    last_n_tokens = " ".join(all_tokens[-n:])
    
    return last_n_tokens

In [53]:
# model = GPT2LMHeadModel.from_pretrained(
#     "gpt2",
#     output_hidden_states =True
# ).to(device)

model.resize_token_embeddings(len(tokenizer))
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-4
)

n_epochs = 5

for epoch in (range(n_epochs)):
    losses = []
    for [X, y] in (dataloader):
        generated_text = []
        
        label_tokens = tokenizer(
            y, 
        ).input_ids[0]
            
        src =  np.unique(tokenizer(X[0], truncation=True)['input_ids'])
        for label_token in tqdm(label_tokens):
            
            batch = tokenizer(
                get_last_n_tokens(256, X, generated_text),
                truncation= True,
                padding= "max_length",
                return_tensors= "pt",
                max_length=256,
            )

            
            with torch.no_grad():

                clf_batch = tokenizer(
                    get_last_n_tokens(32, X, generated_text),
                    truncation= True,
                    padding= "max_length",
                    return_tensors= "pt",
                    max_length=32,
                )

                p_k = clf(**clf_batch)\
                    .logits\
                    .softmax(dim = 1)[0,1]\
                    .item()
                
            outputs = model(**batch)
            next_word_predictions = outputs[0][0,-1,:]        

            predictions_src = get_predictions_from_list(
                next_word_predictions,
                src,
                p_k
            )
            # predictions_src.requires_grad = True
                
            predictions_tag = get_predictions_from_list(
                next_word_predictions,
                tag_ids, 
                1-p_k
            )
            # predictions_tag.requires_grad = True

            idx, max_proba_tag = predictions_tag[:, 0].argmax(), predictions_tag[:, 0].max()
            best_tag_word_id = predictions_tag[idx,1]\
                .item()
            best_tag_word_id = int(best_tag_word_id)

            idx, max_proba_src = predictions_src[:, 0].argmax(), predictions_src[:, 0].max()
            best_src_word_id = predictions_src[idx, 1]\
                .item()
            best_src_word_id = int(best_src_word_id)

            if max_proba_src > max_proba_tag:
                next_word = tokenizer.decode(best_src_word_id)
            
            else:
                next_word = tokenizer.decode(best_tag_word_id)
            
            generated_text.append(next_word)
            
            # logging.info(f"next word : {next_word}")

            logits = generate_corrected_predictions(
                predictions_tag,
                predictions_src,
                next_word_predictions
            )

            logits.requires_grad = True

            label_tensor = generate_label(
                next_word_predictions,
                label_token
            )

            loss = cross_entropy(
                logits,
                label_tensor
            )

            # logging.info(logits)
            # logging.info(label_tensor)
            # logging.info(loss.item())
                    
            optimizer.zero_grad()
            
            # loss = out['loss']

            loss.backward()
            optimizer.step()

        logging.info(" ".join(generated_text))

        # logging.info(
        #     tokenizer.decode(
        #         y_pred.to(torch.int32),
        #         skip_special_tokens=True
        #     )
        # )
        # logging.info(X[0])
        # logging.info(loss.item())

  0%|          | 0/293 [00:00<?, ?it/s]

100%|██████████| 293/293 [00:30<00:00,  9.75it/s]
100%|██████████| 751/751 [01:20<00:00,  9.28it/s]
100%|██████████| 300/300 [00:31<00:00,  9.67it/s]
100%|██████████| 1308/1308 [02:20<00:00,  9.32it/s]
100%|██████████| 279/279 [00:27<00:00, 10.15it/s]
100%|██████████| 426/426 [00:46<00:00,  9.24it/s]
100%|██████████| 497/497 [00:53<00:00,  9.26it/s]
100%|██████████| 507/507 [00:53<00:00,  9.46it/s]
100%|██████████| 430/430 [00:43<00:00,  9.83it/s]
100%|██████████| 535/535 [00:54<00:00,  9.77it/s]
100%|██████████| 293/293 [00:30<00:00,  9.64it/s]
100%|██████████| 751/751 [01:20<00:00,  9.39it/s]
100%|██████████| 300/300 [00:30<00:00,  9.69it/s]
100%|██████████| 1308/1308 [02:19<00:00,  9.38it/s]
100%|██████████| 279/279 [00:27<00:00, 10.22it/s]
100%|██████████| 426/426 [00:45<00:00,  9.33it/s]
100%|██████████| 497/497 [00:53<00:00,  9.29it/s]
100%|██████████| 507/507 [00:53<00:00,  9.48it/s]
100%|██████████| 430/430 [00:43<00:00,  9.84it/s]
100%|██████████| 535/535 [00:54<00:00,  9.82it

In [39]:
batch['input_ids'].device

device(type='cpu')

In [198]:
model = AutoModelForCausalLM.from_pretrained("gpt2-medium")

In [44]:
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium",)

In [18]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.cls_token = "[CLS]"

model.resize_token_embeddings(len(tokenizer))
device = "cuda"
model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-6,
    weight_decay=1e-2
)
loss_fn = torch.nn.CrossEntropyLoss()

n_epochs = 15
model.train()

with torch.no_grad():
    for epoch in (range(n_epochs)):
        losses = []
        i = 0
        for [X, y] in tqdm(dataloader):
            i+=1
            generated_text = []
            
            batch = tokenizer(
                "Independent of what is the input, always generate 'hello world. My name is pedro'." + X[0],
                text_target= "hello world. My name is pedro",
                truncation= True,
                padding= "max_length",
                return_tensors= "pt",
                max_length=1024,
            )

            for k,v in batch.items():
                # if k == "labels":
                batch[k] = v.to(device)

            # optimizer.zero_grad()

            out = model(**batch, )

            # loss= out['loss']
            # loss.backward()

            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # if i % 10 ==0:
            # optimizer.step()

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB (GPU 0; 23.66 GiB total capacity; 14.40 GiB already allocated; 97.88 MiB free; 14.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [200]:
labels = batch['labels']
# Shift so that tokens < n predict n
shift_logits = out['logits'][..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
hidden_states = out[0]
loss = loss.to(hidden_states.dtype)


In [201]:
shift_labels

tensor([[  995,    13,  2011,  1438,   318,  7190,   305, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256]], device='cuda:0')

In [214]:
shift_logits.view(-1, shift_logits.size(-1))[11].argmax()

tensor(6, device='cuda:0')

In [193]:
batch['labels']

tensor([[31373,   995,    13,  2011,  1438,   318,  7190,   305, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256]], device='cuda:0')

In [194]:
loss

tensor(1.6702, device='cuda:0', grad_fn=<NllLossBackward0>)

In [131]:
out['logits'][0,-1, :].shape

torch.Size([50261])

In [216]:
batch_ = tokenizer(
    X[0] ,
    # text_target=y[0],
    truncation= True,
    padding= "max_length",
    return_tensors= "pt",
    max_length=256,
)
for k,v in batch_.items():
    batch_[k] = v.to(device)

    
gen_text = model.generate(
    **batch_,
    max_length=150,  
    # num_return_sequences=5,
    # no_repeat_ngram_size=2,
    # repetition_penalty=1.5,
    # top_p=0.92,
    # temperature=.85,
    # do_sample=True,
    # top_k=125,
    # early_stopping=True
)

tokenizer.decode(gen_text[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


'DATE: INITIATED BY: COMPLETION COUNTRY: PRODUCT: BELGIUM BY: DATE: NOTES: 620429480 RECEIVED SERVICE REQUEST FROM R & D BY INTERNATIONAL LICENSEE OPERATIONS REQUEST NO.: 25- 84 March 27, 1984 P. H. HARPER PHH TARGET DATE: April 13, 1984 LUCKY STRIKE Filter and VICEROY NATURE OF WORK: Advise if locally obtained Yucatan Honey (sample enclosed) is an acceptable substitute for HALWAY. R & D COMMENTS: Target the P A 29Mar 84 a) Nature of work should be specified in exact terms. b) R & D should advise if completion date cannot be met. c) Two copies of this form to be sent to R & D by initiator and R & D is to return to T. O. one completed copy. MH/ enm 0036/ (r) # 2894M APR 2 1984 P. H. H.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [62]:
from transformers import pipeline

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device = "cuda")

In [63]:
generator(X[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "DATE: DEPARTMENT: Type EQPR B&W QUALITY Implement: Date 632120763 QUALITY IMPROVEMENT SUGGESTION OUR MISSION IS SUPERIOR CONSUMER SATISFACTION Highest Quality. Teamwork, Do Right Things Right The First Time RESEARCH & DEVELOPMENT Quality Coord Only June 21, 1993 R&D Library Carol S. Lincoln 407- 64- 3484 SUBMITTED BY: SUBMITTER'S SS#: Date Rec'd QIP Log #1 Status (1993) Keywords (1993) 6/ 21/ 93 93- 0301 SUGGESTION: (Describe Current Situation and Idea) The current system of managing records is too complex. The trend seems to be increasingly specific, when we should be getting more general. Right now, people must work to understand the system. We must spend too much time adninistering the system, labeling and cleaning our files. Complying is a real burden, both for the individual and for the records coordinators. Describe Possible Solutions And Benefits 1. Drop the category specifications altogether. 2. Use moregeneral categories. hote: I have passed this to Scott 

In [None]:
a

In [65]:
n_epochs = 5
model = GPT2LMHeadModel.from_pretrained(
    'gpt2',
    output_hidden_states =True
)

clf = AutoModelForSequenceClassification.from_pretrained(
    "peulsilva/LASER-CLF-GPT", 
    num_labels=2,
).to(device)

device = "cuda"
model.resize_token_embeddings(len(tokenizer))
clf.resize_token_embeddings(len(tokenizer))
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-5
)
clf.to(device)
model.to(device)

text = []
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in (range(n_epochs)):
    for [X, y] in tqdm(dataloader):
        target_text = None
        generated_text = []
        
        if type(y) == tuple:
            y = y[0]

        if type(X) == tuple:
            X = X[0]

        in_stack = deque(X.split(' '), maxlen=16)
        out_stack = deque(y.split(' '))
        while len(out_stack)> 0:

            if target_text == None:
                target_text = "[B]"

            batch = tokenizer(
                " ".join(in_stack),
                truncation= True,
                padding= "max_length",
                return_tensors= "pt",
                max_length=32,
            )


            for k, v in batch.items():
                batch[k] = v.to(device)

            is_in_src = clf(**batch)\
                .logits\
                .squeeze()\
                .argmax()
            
            if is_in_src:
                
                generated_text.append(target_text)
                target_text = out_stack.popleft()
                in_stack.append(target_text)
                continue
            
            else:
                output = model.generate(**batch)
                next_token_logits = output.logits[0, :, :]

                logits = []
                real_labels = []
                for word in special_chars:
                    # proba = 
                    if word.startswith("["):
                        continue
                    
                    if not word.startswith('['):
                        word = word.lower()
                        
                    word_token = tokenizer.vocab[word]
                    if word.lower() == target_text.lower():
                        real_labels.append(1)
                    
                    else:
                        real_labels.append(0)
                    
                    logits.append([
                        word_token,
                        next_token_logits[-1, word_token].item()
                    ])

                optimizer.zero_grad()
                logits = torch.tensor(logits, requires_grad=True).to(device)

                ce_loss = cross_entropy(
                    logits[:,1].softmax(dim = 0).to(torch.float64),
                    torch.tensor(real_labels)\
                        .to(torch.float64)\
                        .to(device),
                )
                                
                generated_text.append(
                    tokenizer.decode(
                        int(logits.max(dim=0)[0][0])
                    )
                )

                in_stack.append(
                    tokenizer.decode(
                        int(logits.max(dim=0)[0][0])
                    )
                )
                
                
                ce_loss.backward()
                
                optimizer.step()

                target_text = out_stack.popleft()
                
        text.append(generated_text)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
 10%|█         | 1/10 [00:22<03:22, 22.45s/it]


KeyboardInterrupt: 

In [179]:
force_words = list(special_chars)

In [183]:
for word in force_words:
    if not word[0] == "[":
        word = word.lower()
        if word.upper() in force_words:
            force_words.remove(word.upper())
            force_words.append(word.lower())
force_ids= tokenizer(force_words, add_special_tokens= False).input_ids

In [184]:
force_words

['[E]', '[T]', '[B]', 'question', 'header', 'answer', 'none']

In [177]:
word.upper()

'HEADER'

In [185]:
force_ids

[[50259], [50260], [50258], [25652], [25677], [41484], [23108]]

In [187]:

batch = tokenizer(
    X,
    truncation= True,
    padding= "max_length",
    return_tensors= "pt",
    max_length=512,
            
)
for k, v in batch.items():
    batch[k] = v.to(device)

out = model.generate(
    **batch,
    num_beams = 10,
    force_words_ids = force_ids,
    # early_stopping = True,
    # max_new_tokens = 100,
    # do_sample = True,
    # top_p = 0.95,
    # top_k = 10,
    # renormalize_logits =True,
    no_repeat_ngram_size = 1,
    # max_length = 20,
    # min_new_tokens = 100
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 120.00 MiB (GPU 0; 15.71 GiB total capacity; 15.00 GiB already allocated; 17.75 MiB free; 15.39 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [140]:
out.shape

torch.Size([1, 513])

In [141]:
tokenizer.decode(out[0],
                 skip_special_tokens= True)

'Title OVERALL: SEX: AGE: BRAND SMOKER Male 465508326 Female 6 7 (17) (205) 3.1 4.2 (129) (95) 5 9 (117) (105) (63) (66) (46) (49) 3.2 3.0 4.4 4.0 1.9 5.0 (104) (120) 7 7 (101) (121) 3.4% (224) 7% (222) BELAIR 285 % % SCORE BASE SCORE BASE PM6 COMMENTS Brand NEWSPAPER SCORES AUDIENCE STUDIES " KALEIDOSCOPE - - GONE WITH THE WIND " * Project # Total Sample 72- 31 Code # Type of Ad Newspaper (Date) R/ BSS- 71- 19 PARADE, 1 Page, 4- Color PROVED RECALL *This was a combination ad with RALEIGH. Under 25 25- 34 35- 44 45 & Over Under 35 35 & Over Test Brand Smokers All Other Smokers Pittsburgh Pittsburgh PUTSBURGH PRESS (4 /23 /72 San Diego SAN DIEGO UNION (4 /23 /72 Dayton DAYTON NEWS (4 /23 /72) Birmingham BIRMINGHAM NEWS (4 /23 /72)\uf702[E]'

In [68]:
batch = tokenizer(
    " ".join(in_stack),
    
    truncation= True,
    padding= "max_length",
    return_tensors= "pt",
    max_length=32,
)

{'input_ids': tensor([[41484,  3280,  3280,  3280,  3280,  3280,  3280,  3280,  3280,  3280,
          3280,  3280,  3280,  3280,  3280,  3280, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

In [21]:
output = model(**batch)
next_token_logits = output.logits[0, :, :]

In [22]:
output.logits[0].shape

torch.Size([32, 50261])

In [23]:
logits = []
real_labels = []
for word in special_chars:
    if not word.startswith('['):
        word = word.lower()
        
    word_token = tokenizer.vocab[word]
    if word.lower() == target_text.lower():
        real_labels.append(1)
    
    else:
        real_labels.append(0)
    
    logits.append([
        word_token,
        next_token_logits[-1, word_token].item()
    ])

In [24]:
logits = torch.tensor(logits, requires_grad=True).to(device)

In [25]:
logits

tensor([[ 2.5677e+04, -8.7671e+01],
        [ 5.0260e+04, -4.7957e+00],
        [ 2.3108e+04, -8.8531e+01],
        [ 5.0259e+04, -5.7342e+00],
        [ 2.5652e+04, -9.0468e+01],
        [ 4.1484e+04, -9.2401e+01],
        [ 5.0258e+04,  1.5480e+00]], device='cuda:0',
       grad_fn=<ToCopyBackward0>)

In [26]:
logits, real_labels

(tensor([[ 2.5677e+04, -8.7671e+01],
         [ 5.0260e+04, -4.7957e+00],
         [ 2.3108e+04, -8.8531e+01],
         [ 5.0259e+04, -5.7342e+00],
         [ 2.5652e+04, -9.0468e+01],
         [ 4.1484e+04, -9.2401e+01],
         [ 5.0258e+04,  1.5480e+00]], device='cuda:0',
        grad_fn=<ToCopyBackward0>),
 [0, 0, 0, 0, 0, 0, 1])

In [47]:
out_stack

deque(['Total',
       'Pressure',
       'Drop',
       '(encap.)',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Tipping',
       'Length',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Print',
       'Position',
       '(from',
       'filter',
       'end)',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Moisture',
       'content',
       '(Packing)',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Filter',
       'Ventilation',
       'Rate',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Total',
       'Cigarette',
       'Weight',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Net',
       'Net',
       'Tobacco',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Tobacco',
       'Rod',
       'Density',
       '[E]',
       'QUESTION',
       '[T]',
       '[B]',
       'Tipping',
       'and',
       'Tipping',
       'Applica

In [44]:
generated_text[-50:]

['[E]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Rod',
 'Length',
 '[E]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Plug',
 'Length',
 '[E]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Plug',
 'Pressure',
 'Drop',
 '(unencap.)',
 '[E]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Plug',
 'Pressure',
 'Drop',
 '(encap.)',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Circumference',
 '[E]',
 '[T]',
 '[T]',
 '[T]',
 '[T]',
 'Pressure',
 'Drop',
 '(unencap.)',
 '[E]',
 '[T]',
 '[T]',
 '[T]']