In [1]:
# pip install transformers

###  Test transformers


In [2]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')

classifier(
    ["i like the tea",
    "i wish i could like you, but i can`t"]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9996429681777954},
 {'label': 'NEGATIVE', 'score': 0.9953327775001526}]

## Tokeniztion to ID use Autotokenizer

In [3]:
from transformers import AutoTokenizer

# select models in https://huggingface.co/
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'  
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
inputs_rows=[
    "i like the tea",
    "i wish i could like you, but i can`t"
]
# padding and truncation, return pytorch`s tensor   (max 512 words)
inputs=tokenizer(inputs_rows, padding=True, truncation=True, return_tensors='pt')  
print(inputs)

{'input_ids': tensor([[ 101, 1045, 2066, 1996, 5572,  102,    0,    0,    0,    0,    0,    0,
            0,    0],
        [ 101, 1045, 4299, 1045, 2071, 2066, 2017, 1010, 2021, 1045, 2064, 1036,
         1056,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [5]:
# token to words
tokenizer.decode([ 101, 1045, 4299, 1045, 2071, 2066, 2017, 1010, 2021, 1045, 2064, 1036,
         1056,  102])

'[CLS] i wish i could like you, but i can ` t [SEP]'

## Model 

In [6]:
from transformers import AutoModel

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'  
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [8]:
outputs=model(**inputs)

# 2 Sentences, Sentences with max 14 tokens , 768 features for 1 token
print(outputs.last_hidden_state.shape)
print(outputs.last_hidden_state)

torch.Size([2, 14, 768])
tensor([[[ 0.1078,  0.0039,  0.4571,  ...,  0.7474,  1.0293,  0.0603],
         [ 0.5334,  0.1815,  0.4580,  ...,  0.6635,  0.9365,  0.0191],
         [ 0.4604,  0.1575,  0.5864,  ...,  0.5963,  0.8522,  0.1291],
         ...,
         [-0.0523,  0.0260,  0.3290,  ...,  0.7557,  0.8860,  0.0487],
         [-0.1452, -0.0803,  0.3396,  ...,  0.8775,  0.9491,  0.0655],
         [-0.1722, -0.0958,  0.3778,  ...,  0.8600,  0.9213,  0.0477]],

        [[-0.4830,  0.4751,  0.1821,  ...,  0.1666,  0.1407,  0.4888],
         [ 0.2383,  1.2855, -0.1179,  ...,  0.5810,  0.5428,  0.9192],
         [ 0.1262,  1.0088, -0.0829,  ..., -0.0414,  0.3643,  0.5914],
         ...,
         [ 0.1188,  0.4452,  1.1468,  ..., -0.1102, -0.0286, -0.3943],
         [-0.5120,  0.3615,  0.3537,  ...,  0.0721,  0.3629,  0.2286],
         [ 0.0720,  1.1310, -0.0359,  ...,  0.1025,  0.3989,  0.0664]]],
       grad_fn=<NativeLayerNormBackward0>)


## AutoModelForSequenceClassification

In [9]:
from transformers import AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'  
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs=model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


In [10]:
# AutoModelForSequenceClassification with more 2 full connect layers compare with AutoModel
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

## AutoModelForSequenceClassification add softmax with pytorch

In [11]:
#with softmax
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[3.5703e-04, 9.9964e-01],
        [9.9533e-01, 4.6672e-03]], grad_fn=<SoftmaxBackward0>)


In [12]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}