# 1. tokenizer -> construct input

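- The tokenizer and the model must match: the tokenizer's outputs are exactly the model's inputs.
- `AutoTokenizer`, `AutoModel*`: generic classes that pick the concrete tokenizer/model class for the given checkpoint.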

In [1]:
# Demo batch: two short sentences the classifier should label differently.
# NOTE(review): the name is misspelled ("senteces") but is referenced by later cells,
# so any rename has to be done across the whole notebook at once.
test_senteces = [
    'today is not that bad',
    'today is so bad',
]

# Checkpoint id shared by the tokenizer and the model loaded below.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## tokenizer
- tokenizer: builds the model's input
    - len(input_ids) == len(attention_mask)
    - `tokenizer(test_senteces[0])` calls `tokenizer.__call__`, which encodes the text
    - `tokenizer.encode` == `tokenizer.tokenize` + `tokenizer.convert_tokens_to_ids`, plus the special tokens `[CLS]` / `[SEP]` added around the sequence
    - tokenizer.decode
    - How it works: `tokenizer.vocab` stores the token => id mapping.
        - tokenizer.special_tokens_map
    - The attention mask matches the padding: it is 0 at padded positions (`[PAD]`, id 0) and 1 elsewhere.

In [3]:
# Load the tokenizer and the sequence-classification model from the same checkpoint
# name, so the ids produced by the tokenizer are the ones the model expects.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [4]:
# Encode both sentences in one call. padding=True pads the shorter sentence so the
# batch is rectangular, and return_tensors='pt' returns PyTorch tensors.
batch_input = tokenizer(
    test_senteces,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [5]:
# Inspect the encoded batch: input_ids and attention_mask have the same shape, and
# the attention_mask is 0 at the padded slot of the shorter sentence.
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}

In [6]:
# Calling the tokenizer on a single string (no return_tensors) gives plain Python lists.
tokenizer(test_senteces[0])

{'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [7]:
# encode() returns only the input ids, with [CLS] (101) and [SEP] (102) included.
tokenizer.encode(test_senteces[0])

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [8]:
# tokenize + convert_tokens_to_ids yields the bare word ids only; unlike encode(),
# this path does not add [CLS] / [SEP].
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_senteces[0]))

[2651, 2003, 2025, 2008, 2919]

In [9]:
# These ids are the encode() result for the first sentence; decode() maps them back
# to text, special tokens included.
tokenizer.decode([101, 2651, 2003, 2025, 2008, 2919, 102])

'[CLS] today is not that bad [SEP]'

In [10]:
# The special tokens this tokenizer defines (unknown, separator, padding, classifier, mask).
tokenizer.special_tokens_map.values()

dict_values(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'])

In [11]:
# Look up the vocabulary id of every special token.
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

In [12]:
# NOTE(review): same call as the earlier batch-encoding cell; batch_input is rebuilt
# here right before the model section and holds the same padded PyTorch tensors.
batch_input = tokenizer(test_senteces, truncation=True, padding=True, return_tensors='pt')

# 2. model -> call model

In [13]:
import torch
import torch.nn.functional as F

## model.config
- find the mapping (`id2label`) that converts a class id to its label

In [14]:
# The config carries id2label / label2id, which is what turns a predicted class id
# into its label string.
model.config

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.55.4",
  "vocab_size": 30522
}

In [17]:
# Run the model on the encoded batch and turn the raw logits into label strings.
with torch.no_grad():  # inference only, so no gradient tracking is needed
    outputs = model(**batch_input)
    print(outputs)

    # Softmax over dim=1 (the class dimension) turns each row of logits into probabilities.
    scores = F.softmax(outputs.logits, dim=1)
    print(scores)

    # Index of the highest-scoring class for each sentence. Kept under its own name so
    # `labels` is not first a tensor and then a list of strings.
    label_ids = torch.argmax(scores, dim=1)
    print(label_ids)

    # Map class ids to label names through the checkpoint config
    # (loop variable renamed so it does not shadow the builtin `id`).
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899]]), hidden_states=None, attentions=None)
tensor([[8.4631e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


# 3. parse output -> output parsing

In [18]:
# Parse the `outputs` produced by the model call in the previous cell. The forward pass
# is not repeated: those logits were computed under torch.no_grad(), so they can be
# post-processed directly.
print(outputs)

# logits -> per-class probabilities (softmax over the class dimension)
scores = F.softmax(outputs.logits, dim=1)
print(scores)

# probabilities -> predicted class id per sentence
label_ids = torch.argmax(scores, dim=1)
print(label_ids)

# class id -> label string from the model config
labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899]]), hidden_states=None, attentions=None)
tensor([[8.4631e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']
