In [14]:
import torch
x = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],
    [0.0, 2.0, 0.0, 2.0],
    [1.0, 1.0, 1.0, 1.0],
])
w_query = torch.tensor([
    [1.0, 0.0, 1.0],
    [1.0, 0.0, 0.0],
    [0.0, 0.0, 1.0],
    [0.0, 1.0, 1.0]
])
w_key = torch.tensor([
    [0.0, 0.0, 1.0],
    [1.0, 1.0, 0.0],
    [0.0, 1.0, 0.0],
    [1.0, 1.0, 0.0]
])
w_value = torch.tensor([
    [0.0, 2.0, 0.0],
    [0.0, 3.0, 0.0],
    [1.0, 0.0, 3.0],
    [1.0, 1.0, 0.0]
])

In [17]:
## 쿼리, 키, 밸류 만들기
keys = torch.matmul(x, w_key)
querys = torch.matmul(x, w_query)
values = torch.matmul(x, w_value)

In [18]:
## 어텐션 스코어 만들기
attn_scores = torch.matmul(querys, keys.T)

In [19]:
attn_scores

tensor([[ 2.,  4.,  4.],
        [ 4., 16., 12.],
        [ 4., 12., 10.]])

In [20]:
## 소프트맥스 확률값 만들기
import numpy as np
from torch.nn.functional import softmax
key_dim_sqrt = np.sqrt(keys.shape[-1])
attn_probs = softmax(attn_scores / key_dim_sqrt, dim=1)

In [21]:
attn_probs

tensor([[1.3613e-01, 4.3194e-01, 4.3194e-01],
        [8.9045e-04, 9.0884e-01, 9.0267e-02],
        [7.4449e-03, 7.5471e-01, 2.3785e-01]])

In [22]:
## 소프트맥스 확률과 밸류를 가중합하기
weighted_values = torch.matmul(attn_probs, values)

In [23]:
weighted_values

tensor([[1.8639, 6.3194, 1.7042],
        [1.9991, 7.8141, 0.2735],
        [1.9926, 7.4796, 0.7359]])

In [29]:
## 피드포워드 뉴럴 네트워크 계산 예시1
x = torch.tensor([2, 1])
w1 = torch.tensor([[3, 2, -4], [2, -3, 1]])
b1 = 1
w2 = torch.tensor([[[-1, 1], [1, 2], [3, 1]]])
b2 = -1

In [30]:
## 피드포워드 뉴럴 네트워크 계산 예시2
h_preact = torch.matmul(x, w1) + b1
h = torch.nn.functional.relu(h_preact)
y = torch.matmul(h, w2) + b2

In [31]:
h_preact

tensor([ 9,  2, -6])

In [32]:
h

tensor([9, 2, 0])

In [33]:
y

tensor([[-8, 12]])

In [34]:
## 코드 레이어 정규화 예시
input = torch.tensor([[1.0, 2.0, 3.0], [1.0, 1.0, 1.0]])
m = torch.nn.LayerNorm(input.shape[-1])
output = m(input)
output

tensor([[-1.2247,  0.0000,  1.2247],
        [ 0.0000,  0.0000,  0.0000]], grad_fn=<NativeLayerNormBackward0>)

In [35]:
m.weight

Parameter containing:
tensor([1., 1., 1.], requires_grad=True)

In [36]:
m.bias

Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

In [37]:
## 드롭아웃
m = torch.nn.Dropout(p=0.2)
input = torch.randn(1, 10)
output = m(input)
input

tensor([[ 0.0557, -0.1919, -1.4602, -0.6112, -0.1932, -1.3243, -1.9542, -0.9374,
          1.1059,  0.1313]])

In [38]:
output

tensor([[ 0.0697, -0.0000, -1.8253, -0.7640, -0.2414, -1.6554, -2.4428, -0.0000,
          1.3824,  0.0000]])

In [39]:
# 아담 옵티마이저
#from torch.optim import Adam
#optimizer = Adam(model.parameters(), lr=model.learning_rate)

In [40]:
## 토크나이저 선언
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    "beomi/kcbert-base",
    do_lower_case=False,
)

Downloading:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

In [41]:
## 모델 선언
from transformers import BertConfig, BertModel
pretrained_model_config = BertConfig.from_pretrained(
    "beomi/kcbert-base"
)
model = BertModel.from_pretrained(
    "beomi/kcbert-base",
    config=pretrained_model_config,
)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
pretrained_model_config

BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

In [43]:
## 입력값 만들기
sentences = ["안녕하세요", "하이!"]
features = tokenizer(
    sentences,
    max_length=10,
    padding="max_length",
    truncation=True,
)

In [44]:
features

{'input_ids': [[2, 19017, 8482, 3, 0, 0, 0, 0, 0, 0], [2, 15830, 5, 3, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}

In [45]:
## 피처를 토치 텐서로 변환
features = {k: torch.tensor(v) for k, v in features.items()}

In [46]:
## 임베딩 계산하기
outputs = model(**features)

In [47]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.6969, -0.8248,  1.7512,  ..., -0.3732,  0.7399,  1.1907],
         [-1.4803, -0.4398,  0.9444,  ..., -0.7405, -0.0211,  1.3064],
         [-1.4299, -0.5033, -0.2069,  ...,  0.1285, -0.2611,  1.6057],
         ...,
         [-1.4406,  0.3431,  1.4043,  ..., -0.0565,  0.8450, -0.2170],
         [-1.3625, -0.2404,  1.1757,  ...,  0.8876, -0.1054,  0.0734],
         [-1.4244,  0.1518,  1.2920,  ...,  0.0245,  0.7572,  0.0080]],

        [[ 0.9371, -1.4749,  1.7351,  ..., -0.3426,  0.8050,  0.4031],
         [ 1.6095, -1.7269,  2.7936,  ...,  0.3100, -0.4787, -1.2491],
         [ 0.4861, -0.4569,  0.5712,  ..., -0.1769,  1.1253, -0.2756],
         ...,
         [ 1.2362, -0.6181,  2.0906,  ...,  1.3677,  0.8132, -0.2742],
         [ 0.5409, -0.9652,  1.6237,  ...,  1.2395,  0.9185,  0.1782],
         [ 1.9001, -0.5859,  3.0156,  ...,  1.4967,  0.1924, -0.4448]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

In [48]:
outputs.pooler_output

tensor([[-0.1594,  0.0547,  0.1101,  ...,  0.2684,  0.1596, -0.9828],
        [-0.9221,  0.2969, -0.0110,  ...,  0.4291,  0.0311, -0.9955]],
       grad_fn=<TanhBackward0>)