<a href="https://colab.research.google.com/github/marha-hwang/llm_study/blob/master/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Split the sentence into tokens on whitespace.
input_text = "나는 최근 파리 여행을 다녀왔다"
input_text_list = input_text.split()
print("input_text_list: ", input_text_list)

# Build the token -> id and id -> token dictionaries.
# Ids are assigned over the *unique* tokens (first-occurrence order) so that
# they stay contiguous in [0, len(str2idx)) even when a word is repeated;
# otherwise an embedding table sized by len(str2idx) could be indexed out of
# range. idx2str is derived from str2idx so the two mappings always agree.
str2idx = {word: idx for idx, word in enumerate(dict.fromkeys(input_text_list))}
idx2str = {idx: word for word, idx in str2idx.items()}
print("str2idx: ", str2idx)
print("idx2str: ", idx2str)

# Convert every token of the sentence into its token id.
input_ids = [str2idx[word] for word in input_text_list]
print("input_ids: ", input_ids)

input_text_list:  ['나는', '최근', '파리', '여행을', '다녀왔다']
str2idx:  {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다': 4}
idx2str:  {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다'}
input_ids:  [0, 1, 2, 3, 4]


In [8]:
import torch
import torch.nn as nn

# Width of each token embedding vector.
embedding_dim = 16
# Lookup table with one row per vocabulary entry.
embed_layer = nn.Embedding(num_embeddings=len(str2idx), embedding_dim=embedding_dim)

# Look up the embeddings -> (5, 16), then add a leading batch axis -> (1, 5, 16).
input_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)


input_embeddings.shape

tensor([[[ 1.3727, -0.0508,  0.4007, -0.8596, -0.2245, -1.0912,  0.7360,
           0.8636,  0.7387, -1.1510, -1.4466,  1.3345,  0.5277, -0.2673,
           0.9286, -0.7230],
         [ 0.7206, -0.8104,  0.7783,  0.2679, -1.8342, -0.8210,  0.6667,
           0.5271, -0.4010, -0.6730,  1.4353, -0.5447,  0.0603,  0.6748,
          -0.4917, -2.0448],
         [ 0.5705, -1.1236,  2.2578,  1.0732,  0.9077, -0.2606, -1.7578,
          -1.1954, -1.3003, -0.0756,  0.5767,  0.8461, -0.5217, -0.8787,
           0.7395, -0.2119],
         [-1.2143,  0.4815,  0.5606,  0.5548, -0.2083, -0.1828, -0.2618,
          -0.3724, -0.7266,  0.0573,  0.2720,  1.0197, -0.6294, -0.7825,
          -0.0986,  0.5066],
         [ 0.2251,  0.0348,  1.1538, -0.0355,  0.6677,  1.0349,  0.7457,
           0.4727, -0.5410, -1.6764,  1.3573,  0.4374, -0.6730,  0.7946,
           1.6037, -1.4407]]], grad_fn=<UnsqueezeBackward0>)

In [15]:
embedding_dim = 16
max_position = 12
# Token embedding table: one row per vocabulary entry.
embed_layer = nn.Embedding(num_embeddings=len(str2idx), embedding_dim=embedding_dim)
# Position embedding table: one row per position, up to max_position.
position_embed_layer = nn.Embedding(num_embeddings=max_position, embedding_dim=embedding_dim)

# Token embeddings: (5, 16) lookup, then a leading batch axis -> (1, 5, 16).
token_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)
# Position ids 0..len(input_ids)-1 with a leading batch axis.
position_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0)
position_encodings = position_embed_layer(position_ids)
# The final input embedding is the sum of token and position embeddings.
input_embeddings = token_embeddings + position_encodings
input_embeddings.shape

torch.Size([1, 5, 16])

In [11]:
# The previous cell combined token embeddings and position encodings into the
# input embeddings. Here those embeddings go through Linear layers that map
# embedding_dim to head_dim. The layers are still randomly initialized at this
# point; the projections are the first step toward enriching each embedding
# with contextual meaning.

head_dim = 16

# Linear maps used to compute the query, key and value vectors
# (created in the order q, k, v).
weight_q, weight_k, weight_v = (
    nn.Linear(embedding_dim, head_dim) for _ in range(3)
)
# Apply each projection to the input embeddings; each result is (1, 5, 16).
querys, keys, values = (
    projection(input_embeddings) for projection in (weight_q, weight_k, weight_v)
)

In [12]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(querys, keys, values, is_causal=False):
    """Compute scaled dot-product attention.

    Args:
        querys: Query tensor whose last two dimensions are (query_len, dim_k).
        keys: Key tensor whose last two dimensions are (key_len, dim_k).
        values: Value tensor whose last two dimensions are (key_len, dim_v).
        is_causal: When True, query position i may only attend to key
            positions j <= i (later positions are masked out).

    Returns:
        Tensor whose last two dimensions are (query_len, dim_v): the
        attention-weighted sum of ``values``.
    """
    dim_k = querys.size(-1)
    # Scale by sqrt(dim_k) before the softmax.
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)
    if is_causal:
        query_length = querys.size(-2)
        key_length = keys.size(-2)
        # Lower-triangular ones mark the allowed (past/current) positions;
        # everything that is zero after tril() is a future position.
        future_mask = scores.new_ones((query_length, key_length)).tril() == 0
        # -inf scores become exactly 0 weight after the softmax.
        scores = scores.masked_fill(future_mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights @ values

In [17]:
# Report the shape of the embeddings before attention is applied.
print("원본 입력 형태: ", input_embeddings.size())

# Run attention over the projected queries, keys and values (no causal mask).
after_attention_embeddings = compute_attention(
    querys, keys, values, is_causal=False
)

# Report the shape after attention, then the tensor itself.
print("어텐션 적용 후 형태: ", after_attention_embeddings.size())
print(after_attention_embeddings)
# Both printed shapes are torch.Size([1, 5, 16]) in the recorded run.

원본 입력 형태:  torch.Size([1, 5, 16])
어텐션 적용 후 형태:  torch.Size([1, 5, 16])
tensor([[[-0.2289, -0.4645, -0.6182, -0.3747, -0.0192,  0.4864, -0.8260,
           0.2830,  0.2665, -0.0849, -0.0899,  0.4328, -0.5040,  0.2665,
           0.4967, -1.0866],
         [-0.2468, -0.2954, -0.4412, -0.3658, -0.1651,  0.1934, -0.7989,
           0.2826,  0.3279, -0.1429, -0.0914,  0.5138, -0.4130,  0.3678,
           0.4747, -1.2167],
         [ 0.2813, -0.6740, -0.1251, -0.4751, -0.0346,  0.4857, -0.6328,
           0.0218,  0.2873,  0.0352,  0.0419,  0.0499, -0.1024,  0.0890,
           0.5233, -0.1875],
         [ 0.2585, -0.4091,  0.2260, -0.3997, -0.2212,  0.0962, -0.6139,
           0.0313,  0.4716, -0.0281,  0.0399,  0.0109,  0.0546,  0.1823,
           0.4876, -0.2803],
         [-0.1863, -0.2750, -0.1128, -0.3057, -0.1687,  0.1570, -0.7412,
           0.1900,  0.4724, -0.0942, -0.0521,  0.1806, -0.2304,  0.3027,
           0.4737, -0.8491]]], grad_fn=<UnsafeViewBackward0>)


In [14]:
class AttentionHead(nn.Module):
    """A single attention head.

    Projects the inputs with three separate linear layers to obtain query,
    key and value vectors, then passes them to ``compute_attention``.
    """

    def __init__(self, token_embed_dim, head_dim, is_causal=False):
        """Create the projections.

        Args:
            token_embed_dim: Size of the last dimension of the inputs.
            head_dim: Size of the projected query/key/value vectors.
            is_causal: Flag forwarded to ``compute_attention``.
        """
        super().__init__()
        self.is_causal = is_causal
        # Linear layers producing the query, key and value vectors.
        self.weight_q = nn.Linear(token_embed_dim, head_dim)
        self.weight_k = nn.Linear(token_embed_dim, head_dim)
        self.weight_v = nn.Linear(token_embed_dim, head_dim)

    def forward(self, querys, keys, values):
        """Project the three inputs and return the attention output."""
        projected_q = self.weight_q(querys)
        projected_k = self.weight_k(keys)
        projected_v = self.weight_v(values)
        return compute_attention(
            projected_q,
            projected_k,
            projected_v,
            is_causal=self.is_causal,
        )

# Build one head whose output width equals the embedding width, then run
# self-attention: the same embeddings act as queries, keys and values.
attention_head = AttentionHead(token_embed_dim=embedding_dim, head_dim=embedding_dim)
after_attention_embeddings = attention_head(
    input_embeddings,
    input_embeddings,
    input_embeddings,
)

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head attention built on top of ``compute_attention``.

    The inputs are projected to ``d_model`` features, split into ``n_head``
    heads of ``d_model // n_head`` features each, attended per head, merged
    back together and passed through a final linear layer.

    Args:
        token_embed_dim: Size of the last dimension of the inputs.
        d_model: Total projected width shared by all heads.
        n_head: Number of attention heads; must divide ``d_model``.
        is_causal: Flag forwarded to ``compute_attention``.

    Raises:
        ValueError: If ``n_head`` is not positive or does not divide
            ``d_model`` evenly.
    """

    def __init__(self, token_embed_dim, d_model, n_head, is_causal=False):
        super().__init__()
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_head ({n_head})"
            )
        self.n_head = n_head
        self.d_model = d_model
        self.head_dim = d_model // n_head
        self.is_causal = is_causal
        self.weight_q = nn.Linear(token_embed_dim, d_model)
        self.weight_k = nn.Linear(token_embed_dim, d_model)
        self.weight_v = nn.Linear(token_embed_dim, d_model)
        self.concat_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape (B, T, d_model) into (B, n_head, T, head_dim)."""
        batch_size, seq_len, _ = projected.size()
        return projected.view(
            batch_size, seq_len, self.n_head, self.head_dim
        ).transpose(1, 2)

    def forward(self, querys, keys, values):
        """Apply multi-head attention.

        Args:
            querys: Tensor of shape (B, T_q, token_embed_dim).
            keys: Tensor of shape (B, T_k, token_embed_dim).
            values: Tensor of shape (B, T_k, token_embed_dim).

        Returns:
            Tensor of shape (B, T_q, d_model).
        """
        batch_size, query_len, _ = querys.size()
        # Head splitting is based on the projected width (d_model) and on each
        # tensor's own sequence length, not on the input embedding width or
        # the query length, so token_embed_dim != d_model and T_k != T_q work.
        querys = self._split_heads(self.weight_q(querys))
        keys = self._split_heads(self.weight_k(keys))
        values = self._split_heads(self.weight_v(values))
        attention = compute_attention(querys, keys, values, self.is_causal)
        # Merge the heads back: (B, n_head, T_q, head_dim) -> (B, T_q, d_model).
        output = attention.transpose(1, 2).contiguous().view(
            batch_size, query_len, self.d_model
        )
        return self.concat_linear(output)

# Split the embedding width across four heads and run self-attention:
# the same embeddings act as queries, keys and values.
n_head = 4
mh_attention = MultiheadAttention(
    token_embed_dim=embedding_dim,
    d_model=embedding_dim,
    n_head=n_head,
)
after_attention_embeddings = mh_attention(
    input_embeddings,
    input_embeddings,
    input_embeddings,
)
after_attention_embeddings.shape