In [3]:
!pip install "torch>=1.12" "torchtext>=0.13" transformers

Successfully installed torch-1.12.0 torchtext-0.13.0


In [18]:
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")

Downloading: 100%|██████████| 420M/420M [00:04<00:00, 102MB/s]  
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
payload="Hello World, How are you!"

In [83]:
from transformers import AutoTokenizer

trfs_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
trfs_ex = trfs_tok([payload,"this is a longer sequence to see if padding would work"],padding=True,return_tensors="pt")
model(**trfs_ex)
trfs_ex

{'input_ids': tensor([[  101,  7592,  2088,  1010,  2129,  2024,  2017,   999,   102,     0,
             0,     0,     0,     0],
        [  101,  2023,  2003,  1037,  2936,  5537,  2000,  2156,  2065, 11687,
          4667,  2052,  2147,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [50]:
%%timeit
trfs_ex = trfs_tok(payload,"this is a longer sequence to see if padding would work",padding=True,return_tensors="pt")
trfs_ex

134 µs ± 938 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [34]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)
text_transform(payload)

[0, 35378, 6661, 4, 11249, 621, 398, 38, 2]

In [24]:
from torchtext.transforms import BERTTokenizer
VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=False)
pt_ex = tokenizer("Hello World, How are you!") # single sentence input

In [51]:
max_seq_len=512
bos_idx=101
eos_idx=102
padding_value=0
tokenizer = T.Sequential(
    T.BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=False),
    T.StrToIntTransform(),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
    T.ToTensor(padding_value=padding_value)
)
tokenizer([payload,"this is a longer sequence to see if padding would work"])

tensor([[  101,  7592,  2088,  1010,  2129,  2024,  2017,   999,   102,     0,
             0,     0,     0,     0],
        [  101,  2023,  2003,  1037,  2936,  5537,  2000,  2156,  2065, 11687,
          4667,  2052,  2147,   102]])

In [63]:
input_ids= tokenizer([payload,"this is a longer sequence to see if padding would work"])
val = input_ids,input_ids.gt(0).int()

In [91]:
import torch
with torch.no_grad():
  pt_res = model(*val)
  trfs_res = model(**trfs_ex)

assert pt_res.last_hidden_state.shape == trfs_res.last_hidden_state.shape
assert torch.allclose(pt_res.last_hidden_state,trfs_res.last_hidden_state)

In [171]:
from torch import nn
from typing import Any, List, Optional, Union,Dict

class PTBertTokenizer(nn.Module):
  def __init__(self,vocab_file_path=None,do_lower_case=True,bos_idx=101,eos_idx=102,padding_value=0):
    super().__init__()
    self.tokenizer=T.Sequential(
      T.BERTTokenizer(vocab_path=vocab_file_path, do_lower_case=do_lower_case, return_tokens=False),
      T.StrToIntTransform(),
      T.Truncate(max_seq_len - 2),
      T.AddToken(token=bos_idx, begin=True),
      T.AddToken(token=eos_idx, begin=False),
      T.ToTensor(padding_value=padding_value)
  )

  def forward(self,input:Union[str,List[str]])->Dict[str,torch.Tensor]:
    input_ids = self.tokenizer(input)
    # shape tensor to matching format for transformers model
    input_ids = torch.reshape(input_ids, (1,input_ids.shape[-1]))
    return {'input_ids':input_ids,"attention_mask":input_ids.gt(0).to(torch.int64)}

  


In [137]:
traced_tokenizer = torch.jit.trace(pt_tokenizer, "test")


RuntimeError: 
Module 'BERTTokenizer' has no attribute 'bert_model' (This attribute exists on the Python module, but we failed to convert Python type: 'torchtext._torchtext.BERTEncoder' to a TorchScript type. Only tensors and (possibly nested) tuples of tensors, lists, or dictsare supported as inputs or outputs of traced functions, but instead got value of type BERTEncoder.. Its type was inferred; try adding a type annotation for the attribute.):
  File "/home/ubuntu/miniconda3/envs/optimum/lib/python3.8/site-packages/torchtext/transforms.py", line 603
    def _batch_encode(self, text: List[str]) -> List[List[str]]:
        """Batch version of _encode i.e operate on list of str"""
        token_ids: List[List[int]] = self.bert_model.batch_encode([t.strip() for t in text])
                                     ~~~~~~~~~~~~~~~ <--- HERE
        tokens_ids_str: List[List[str]] = [[str(t) for t in token_id] for token_id in token_ids]
        return tokens_ids_str


In [138]:
bert_base_uncased_vocab_file = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
import torchtext.transforms as T
from torchtext.utils import get_asset_local_path
# Instantiate tokenizer with lower case, and return tokens=True (we also support return token IDs instead)
bert_tokenizer = T.BERTTokenizer(get_asset_local_path(bert_base_uncased_vocab_file),
                                    do_lower_case=True, strip_accents=None, return_tokens=True)

traced_tokenizer = torch.jit.trace(bert_tokenizer, "test")

RuntimeError: 
Module 'BERTTokenizer' has no attribute 'bert_model' (This attribute exists on the Python module, but we failed to convert Python type: 'torchtext._torchtext.BERTEncoder' to a TorchScript type. Only tensors and (possibly nested) tuples of tensors, lists, or dictsare supported as inputs or outputs of traced functions, but instead got value of type BERTEncoder.. Its type was inferred; try adding a type annotation for the attribute.):
  File "/home/ubuntu/miniconda3/envs/optimum/lib/python3.8/site-packages/torchtext/transforms.py", line 603
    def _batch_encode(self, text: List[str]) -> List[List[str]]:
        """Batch version of _encode i.e operate on list of str"""
        token_ids: List[List[int]] = self.bert_model.batch_encode([t.strip() for t in text])
                                     ~~~~~~~~~~~~~~~ <--- HERE
        tokens_ids_str: List[List[str]] = [[str(t) for t in token_id] for token_id in token_ids]
        return tokens_ids_str


## E2e Model

In [172]:
from torch import nn
from typing import Any, List, Optional, Union,Dict
from transformers import AutoModelForSequenceClassification
import torchtext.transforms as T

VOCAB_FILE="https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/vocab.txt"

class E2EModel(nn.Module):
  def __init__(self,model_id=None):
    super().__init__()
    self.tokenizer=PTBertTokenizer(VOCAB_FILE)
    self.model = AutoModelForSequenceClassification.from_pretrained(model_id)

  def forward(self,inputs:Union[str,List[str]]) -> Dict[str,torch.Tensor]:
      tokenized = self.tokenizer(inputs)
      print(tokenized)
      return self.model(**tokenized)


In [178]:
pipe = E2EModel("distilbert-base-uncased-finetuned-sst-2-english")
pipe("I like you")

{'input_ids': tensor([[ 101, 1045, 2066, 2017,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


SequenceClassifierOutput(loss=None, logits=tensor([[-4.2960,  4.6485]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## E2E Classification pipeline 

In [3]:
from typing import List, Union, Dict, Any
import torch
from torch import nn
from torchtext import transforms as T
from transformers import AutoModelForSequenceClassification

class PTBertTokenizer(nn.Module):
  def __init__(self,vocab_file_path=None,do_lower_case=True,bos_idx=101,eos_idx=102,padding_value=0,max_seq_len=512):
    super().__init__()
    self.tokenizer=T.Sequential(
      T.BERTTokenizer(vocab_path=vocab_file_path, do_lower_case=do_lower_case, return_tokens=False),
      T.StrToIntTransform(),
      T.Truncate(max_seq_len - 2),
      T.AddToken(token=bos_idx, begin=True),
      T.AddToken(token=eos_idx, begin=False),
      T.ToTensor(padding_value=padding_value)
  )

  def forward(self,input:Union[str,List[str]])->Dict[str,torch.Tensor]:
    input_ids = self.tokenizer(input)
    # shape tensor to matching format for transformers model
    if input_ids.dim() == 1:
      input_ids = torch.reshape(input_ids, (1,input_ids.shape[-1]))
    return {'input_ids':input_ids,"attention_mask":input_ids.gt(0).to(torch.int64)}

  @classmethod
  def from_pretrained(cls, model_id: str):
    remote_file=f"https://huggingface.co/{model_id}/resolve/main/vocab.txt"
    return cls(vocab_file_path=remote_file)


class E2ETextClassification(nn.Module):
  def __init__(self,model_id=None):
    super().__init__()
    self.tokenizer=PTBertTokenizer.from_pretrained(model_id)
    self.model = AutoModelForSequenceClassification.from_pretrained(model_id)

  def forward(self,inputs:Union[str,List[str]]) -> List[Dict[str,Any]]:
      # preprocessing
      tokenized = self.tokenizer(inputs)
      with torch.no_grad():
        logits = self.model(**tokenized).logits
        scores=nn.Softmax(dim=-1)(logits)
      # post processing
      return [{"label": self.model.config.id2label[score.argmax().item()], "score": score.max().item()} for score in scores]

pipe = E2ETextClassification("distilbert-base-uncased-finetuned-sst-2-english")
scores = pipe(["I like you","i hate you very much"])
print(scores)
scores = pipe("I like you")
print(scores)      

[{'label': 'POSITIVE', 'score': 0.9998695850372314}, {'label': 'NEGATIVE', 'score': 0.9991033673286438}]
[{'label': 'POSITIVE', 'score': 0.9998695850372314}]


In [4]:
torch.save(pipe, "pipe.pt")
loaded_pipe=torch.load("pipe.pt")
loaded_pipe("it is so awesome that i can load and save the whole pipeline")


[{'label': 'POSITIVE', 'score': 0.9998151659965515}]