## Model

In [1]:
from transformers import BertConfig, BertModel

# Create a BERT configuration; no arguments are passed, so the library defaults apply.
config = BertConfig()
# Build a model from that configuration (no pretrained checkpoint is loaded in this cell).
model = BertModel(config)

In [2]:
# Print the configuration object created in the previous cell to inspect its fields.
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [3]:
from transformers import BertModel
# Load a pretrained model from the "bert-base-cased" checkpoint.
# This rebinds `model`, replacing the config-built model from the first cell.
model = BertModel.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### 저장 메서드 (saving methods)

In [4]:
# Save the current model under the relative path "saving_folder".
model.save_pretrained("saving_folder")

#### 추론 (inference)

In [5]:
# Three short example sentences for the inference demo.
sequences = [
    "Hello!",
    "Cool.",
    "Nice!",
]

In [6]:
# Hand-written token ids, one row per example sentence.
# Every row starts with 101 and ends with 102 and has the same length (4),
# so the nested list is rectangular.
# NOTE(review): these ids look like they come from an uncased BERT vocabulary,
# while the model in scope was loaded from "bert-base-cased" - confirm they
# match the checkpoint's tokenizer.
encoded_sequence = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]

In [8]:
import torch
# Convert the nested list of token ids into a tensor.
model_inputs = torch.tensor(encoded_sequence)
# Run the model by calling it with the input tensor as its argument.
output = model(model_inputs)
# Bare last expression so the notebook displays the returned output object.
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 4.4496e-01,  4.8276e-01,  2.7797e-01,  ..., -5.4032e-02,
           3.9394e-01, -9.4770e-02],
         [ 2.4943e-01, -4.4093e-01,  8.1772e-01,  ..., -3.1917e-01,
           2.2992e-01, -4.1172e-02],
         [ 1.3668e-01,  2.2518e-01,  1.4502e-01,  ..., -4.6914e-02,
           2.8224e-01,  7.5566e-02],
         [ 1.1789e+00,  1.6739e-01, -1.8187e-01,  ...,  2.4671e-01,
           1.0441e+00, -6.1971e-03]],

        [[ 3.6436e-01,  3.2464e-02,  2.0258e-01,  ...,  6.0111e-02,
           3.2451e-01, -2.0995e-02],
         [ 7.1866e-01, -4.8725e-01,  5.1740e-01,  ..., -4.4012e-01,
           1.4553e-01, -3.7545e-02],
         [ 3.3223e-01, -2.3271e-01,  9.4876e-02,  ..., -2.5268e-01,
           3.2172e-01,  8.1093e-04],
         [ 1.2523e+00,  3.5754e-01, -5.1321e-02,  ..., -3.7840e-01,
           1.0526e+00, -5.6255e-01]],

        [[ 2.4042e-01,  1.4718e-01,  1.2110e-01,  ...,  7.6062e-02,
           3.3564e-01,  2

## Tokenizer

In [9]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Call the tokenizer directly on a string; the recorded output is a dict with
# 'input_ids', 'token_type_ids' and 'attention_mask'.
tokenizer("Using a Transformer network is simple")

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

#### Tokenization

In [11]:
from transformers import AutoTokenizer

# Rebinds `tokenizer`, this time through the AutoTokenizer entry point (same checkpoint name).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
# Split the text into token strings only (no ids yet); the recorded output
# shows sub-word pieces such as 'Trans' and '##former'.
tokens = tokenizer.tokenize(sequence)

print(tokens)

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [12]:
# Tokens -> ids: map each token string from the previous cell to an integer id.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [13]:
# Decoding: turn a list of ids back into a string.
# The literal list is the same as the ids printed by the previous cell.
decoded_string = tokenizer.decode([7993, 170, 13809, 23763, 2443, 1110, 3014])

print(decoded_string)

Using a Transformer network is simple


In [15]:
# Multiple Sequence Process
# Demonstrates what happens when a single sequence is passed to the model
# without a batch dimension.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Text -> token strings -> integer ids, printing each intermediate step.
tokens = tokenizer.tokenize(sequence)
print('tokens:', tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print('ids:', ids)
# `ids` is a flat list, so this tensor is 1-D (no batch dimension).
input_ids = torch.tensor(ids)
print('input_ids:', input_ids)

# This call is expected to fail because of the missing batch dimension.
# The failure is the point of the demo, so catch and show it instead of
# letting the exception stop a top-to-bottom run of the notebook.
try:
    model(input_ids)
except (IndexError, ValueError, RuntimeError) as err:
    print(f"model(input_ids) failed: {type(err).__name__}: {err}")

tokens: ['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
ids: [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
input_ids: tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


In [16]:
# Same setup as the previous cell, but the ids are wrapped in an extra list
# so the model receives a 2-D tensor.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# [ids] adds an outer dimension: the printed tensor has one row holding the whole sequence.
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
# The recorded output shows one row of two logit values.
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [26]:
# A batch made of the same id list twice (two rows of equal length).
batch_ids = [ids, ids]
# Bare last expression so the notebook displays the nested list.
batch_ids

[[1045,
  1005,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012],
 [1045,
  1005,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012]]

In [27]:
# A ragged batch: the first row has 3 ids, the second only 2,
# so the nested list is not rectangular.
batched_id = [[200, 200, 200], [200, 200]]

In [28]:
# Ragged batch (row lengths 3 and 2); same assignment as the preceding cell.
batched_id = [[200, 200, 200], [200, 200]]

#### Padding

In [29]:
# Padding demo: run two sequences individually, then together as one padded
# batch, without passing an attention mask.
# NOTE(review): the output recorded under this cell mentions "bert-base-uncased"
# and its logits differ from the commented tensors below, whereas the only
# `checkpoint` assignment visible in this notebook is the DistilBERT SST-2 one.
# Re-run from a fresh kernel to confirm which values are current.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# Second row is sequence2 extended with the tokenizer's pad id so both rows have length 3.
batched_ids = [
	[200, 200, 200],
	[200, 200, tokenizer.pad_token_id]
]

print(model(torch.tensor(sequence1_ids)).logits)
# tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
print(model(torch.tensor(sequence2_ids)).logits)
# tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
print(model(torch.tensor(batched_ids)).logits)
# tensor([[ 1.5694, -1.3895],
#        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

tensor([[-0.1268, -0.4641]], grad_fn=<AddmmBackward0>)
tensor([[-0.0853, -0.4605]], grad_fn=<AddmmBackward0>)
tensor([[-0.1268, -0.4641],
        [-0.1187, -0.4738]], grad_fn=<AddmmBackward0>)


In [30]:
# Padding demo: this cell repeats the preceding cell verbatim.
# It reloads the model, then runs two sequences individually and as one
# padded batch, without an attention mask.
# NOTE(review): the recorded output names "bert-base-uncased" and its logits
# differ both from the commented tensors below and from the previous run of the
# same code. Re-run from a fresh kernel to confirm which values are current.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# Second row is sequence2 extended with the tokenizer's pad id so both rows have length 3.
batched_ids = [
	[200, 200, 200],
	[200, 200, tokenizer.pad_token_id]
]

print(model(torch.tensor(sequence1_ids)).logits)
# tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
print(model(torch.tensor(sequence2_ids)).logits)
# tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
print(model(torch.tensor(batched_ids)).logits)
# tensor([[ 1.5694, -1.3895],
#        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

tensor([[ 0.0423, -0.1912]], grad_fn=<AddmmBackward0>)
tensor([[ 0.1317, -0.2037]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0423, -0.1912],
        [ 0.0383, -0.1713]], grad_fn=<AddmmBackward0>)


#### Attention Mask

In [31]:
# Same padded batch as in the padding demo, now with an explicit attention mask.
# Note: this rebinds `batch_ids`, which an earlier cell used for [ids, ids].
batch_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Same layout as batch_ids: 1 for the real ids, 0 at the position that holds the pad id.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batch_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
# In the recorded output, the second row now equals the logits printed for the
# unpadded two-id sequence in the preceding cell.
# tensor([[ 1.5694, -1.3895],
#        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

tensor([[ 0.0423, -0.1912],
        [ 0.1317, -0.2037]], grad_fn=<AddmmBackward0>)


In [32]:
# Upper bound used to cut the sequence down.
max_sequence_length = 512

# Keep at most the first max_sequence_length elements.
# NOTE(review): the last visible assignment to `sequence` is a str, in which
# case this slices characters rather than tokens - presumably a list of token
# ids was intended; confirm.
sequence = sequence[:max_sequence_length]

#### Special Tokens

In [33]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Truncate sequences that are longer than the model max length.
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Truncate sequences that are longer than the specified max length.
# This second call overwrites model_inputs from the call above.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [34]:
# This cell repeats the preceding cell verbatim.
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Truncate sequences that are longer than the model max length.
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Truncate sequences that are longer than the specified max length.
# This second call overwrites model_inputs from the call above.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)