# [2주차] 심화과제: Multi-head Attention으로 감정 분석 모델 구현하기

In [1]:
%pip install datasets sacremoses



In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


# Load the train and test splits of the "stanfordnlp/imdb" dataset with the same call.
train_ds, test_ds = (
    load_dataset("stanfordnlp/imdb", split=split_name)
    for split_name in ("train", "test")
)

# Tokenizer for 'bert-base-uncased', fetched through torch.hub.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')


def collate_fn(batch):
  """Collate dataset rows into a pair of LongTensors: (token ids, labels).

  Each row is read through its 'text' and 'label' keys. The texts are
  tokenized together with the module-level `tokenizer`, padded to the longest
  sequence in the batch and truncated to at most 400 tokens.
  """
  max_len = 400
  reviews = [row['text'] for row in batch]   # review texts
  targets = [row['label'] for row in batch]  # labels (positive / negative)

  encoded = tokenizer(reviews, padding=True, truncation=True, max_length=max_len)

  return torch.LongTensor(encoded.input_ids), torch.LongTensor(targets)


# Settings shared by both loaders; only the shuffle flag differs between them.
_loader_kwargs = dict(batch_size=64, collate_fn=collate_fn)

train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


## Multi-head Attention

Multi-head Attention은 Transformer 모델의 핵심 구성 요소 중 하나로, 여러 개의 어텐션 헤드를 사용하여 입력 시퀀스의 다양한 부분 간의 상호작용을 동시에 학습할 수 있게 한다.
각 어텐션 헤드는 입력 시퀀스를 독립적으로 처리하여 서로 다른 표현 공간에서의 상호작용을 학습한다.
이를 통해 모델은 입력 시퀀스의 다양한 부분 간의 복잡한 관계를 더 잘 이해할 수 있게 된다.


In [3]:
from torch import nn
from math import sqrt

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values of width `d_model`,
    split into `n_heads` heads of width `d_model // n_heads`, attended per
    head, merged back and passed through a final linear projection.

    Args:
        input_dim: Size of the last dimension of the input `x`.
        d_model: Width of the Q/K/V projections and of the output.
        n_heads: Number of attention heads; must divide `d_model`.
    """

    def __init__(self, input_dim, d_model, n_heads):
        super().__init__()

        # Every head receives an equal slice of d_model, so it must divide evenly.
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.input_dim = input_dim
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads  # D'

        self.wq = nn.Linear(input_dim, d_model)
        self.wk = nn.Linear(input_dim, d_model)
        self.wv = nn.Linear(input_dim, d_model)
        # Final projection that mixes the concatenated head outputs back into
        # a single d_model-wide representation.
        self.dense = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def _split_heads(self, t, batch_size, seq_length):
        """Reshape (B, S, D) into per-head layout (B, H, S, D')."""
        t = t.view(batch_size, seq_length, self.n_heads, self.d_head)  # (B, S, H, D')
        return t.transpose(1, 2)  # (B, H, S, D')

    def forward(self, x, mask=None):
        """Apply self-attention to `x`.

        Args:
            x: Tensor of shape (B, S, input_dim).
            mask: Optional tensor of shape (B, 1, S) whose non-zero / True
                entries mark key positions that must not be attended to.
                When omitted or None, no masking is applied.

        Returns:
            Tensor of shape (B, S, d_model).
        """
        batch_size = x.size(0)
        seq_length = x.size(1)

        # 1. Project to Q, K, V and move the head dimension forward.
        q = self._split_heads(self.wq(x), batch_size, seq_length)
        k = self._split_heads(self.wk(x), batch_size, seq_length)
        v = self._split_heads(self.wv(x), batch_size, seq_length)

        # 2. Scaled dot-product scores, scaled by the per-head width D'.
        score = torch.matmul(q, k.transpose(-1, -2))  # (B, H, S, S)
        score = score / sqrt(self.d_head)

        # 3. Push masked key positions to a very negative score so that they
        #    receive (almost) zero weight after the softmax.
        if mask is not None:
            # (B, 1, S) -> (B, 1, 1, S): broadcast over heads and query positions.
            mask = mask.unsqueeze(1)
            score = score + (mask * -1e9)

        # 4. Attention weights and weighted sum of the values.
        score = self.softmax(score)
        result = torch.matmul(score, v)  # (B, H, S, D')

        # 5. Merge the heads back: (B, H, S, D') -> (B, S, H, D') -> (B, S, D).
        result = result.transpose(1, 2)
        result = result.contiguous().view(batch_size, seq_length, self.d_model)

        # 6. Output projection.
        return self.dense(result)

In [4]:
class TransformerLayer(nn.Module):
  """One encoder layer: multi-head attention followed by a feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual addition and
  layer normalization. Because the attention output (width `d_model`) is added
  to the layer input, the input's feature size must be compatible with
  `d_model`.
  """

  def __init__(self, input_dim, d_model, dff, n_heads, dropout_rate):
    super().__init__()

    self.input_dim, self.d_model = input_dim, d_model
    self.dff, self.n_heads = dff, n_heads

    # Attention sub-layer.
    self.mha = MultiHeadAttention(input_dim, d_model, n_heads)

    # Feed-forward sub-layer: d_model -> dff -> d_model with a ReLU in between.
    feed_forward = [nn.Linear(d_model, dff), nn.ReLU(), nn.Linear(dff, d_model)]
    self.ffn = nn.Sequential(*feed_forward)

    # One normalization and one dropout per sub-layer.
    self.layer_norm1 = nn.LayerNorm(d_model)
    self.layer_norm2 = nn.LayerNorm(d_model)

    self.dropout1 = nn.Dropout(dropout_rate)
    self.dropout2 = nn.Dropout(dropout_rate)

  def forward(self, x, mask):
    """Run both sub-layers on `x`; `mask` is forwarded to the attention module."""
    # Attention -> dropout -> residual add -> layer norm.
    attended = self.dropout1(self.mha(x, mask))
    normed = self.layer_norm1(attended + x)

    # Feed-forward -> dropout -> residual add -> layer norm.
    expanded = self.dropout2(self.ffn(normed))
    return self.layer_norm2(expanded + normed)

## Positional encoding

In [5]:
import numpy as np


def get_angles(pos, i, d_model):
    """Return the positional-encoding angles `pos / 10000^(2 * (i // 2) / d_model)`.

    `pos` and `i` may be scalars or broadcastable NumPy arrays; dimension
    indices 2k and 2k + 1 share the same rate.
    """
    pair_exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_rate = 1 / np.power(10000, pair_exponent)
    return pos * inverse_rate

def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table as a float tensor.

    Even feature columns hold the sine of the angle and odd columns the
    cosine. The result has shape (1, position, d_model).
    """
    positions = np.arange(position)[:, None]
    dims = np.arange(d_model)[None, :]
    table = get_angles(positions, dims, d_model)

    # Overwrite the angles in place: sine on even columns, cosine on odd ones.
    even_cols = table[:, 0::2]
    odd_cols = table[:, 1::2]
    np.sin(even_cols, out=even_cols)
    np.cos(odd_cols, out=odd_cols)

    # Add a leading axis of size 1 so the table broadcasts over a batch.
    return torch.FloatTensor(table[None, ...])


# Length of the positional-encoding table. TextClassifier reads this
# module-level name when it builds its table, and the value equals the
# truncation length hard-coded in collate_fn.
max_len = 400
# Sanity check: print the shape of a (1, max_len, 256) encoding table.
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


In [6]:
class TextClassifier(nn.Module):
  """Transformer encoder that maps token ids to one classification logit.

  Token embeddings are scaled by sqrt(d_model), summed with a fixed positional
  encoding, passed through `n_layers` TransformerLayer blocks, and the hidden
  state at sequence position 0 is projected to a single logit.

  Relies on two module-level names: `max_len` (length of the positional table,
  read at construction time) and `tokenizer` (its `pad_token_id` is used in
  `forward` to build the padding mask).
  """

  def __init__(self, vocab_size, d_model, n_layers, dff, n_heads, dropout_rate):
    super().__init__()

    self.vocab_size, self.d_model = vocab_size, d_model
    self.n_layers, self.dff = n_layers, dff
    self.n_heads, self.dropout_rate = n_heads, dropout_rate

    self.embedding = nn.Embedding(vocab_size, d_model)
    # Stored as a parameter with gradients disabled, so it is never updated.
    self.pos_encoding = nn.Parameter(
        positional_encoding(max_len, d_model), requires_grad=False
    )
    encoder_blocks = []
    for _ in range(n_layers):
      encoder_blocks.append(TransformerLayer(d_model, d_model, dff, n_heads, dropout_rate))
    self.layers = nn.ModuleList(encoder_blocks)
    self.classification = nn.Linear(d_model, 1)

  def forward(self, x):
    """Return logits of shape (B, 1) for token ids `x` of shape (B, S)."""
    # True at padding positions; shaped (B, 1, S) for the attention layers.
    pad_mask = (x == tokenizer.pad_token_id).unsqueeze(1)
    seq_len = x.shape[1]

    hidden = self.embedding(x) * sqrt(self.d_model)
    hidden = hidden + self.pos_encoding[:, :seq_len]

    for block in self.layers:
      hidden = block(hidden, pad_mask)

    # Classify from the hidden state of the first token of each sequence.
    return self.classification(hidden[:, 0])


# Keyword arguments make the meaning of each hyper-parameter explicit.
model = TextClassifier(
    vocab_size=len(tokenizer),
    d_model=32,
    n_layers=4,
    dff=32,
    n_heads=4,
    dropout_rate=0.1,
)

## 학습

In [7]:
from torch.optim import Adam

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at `model.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lr = 0.001
model = model.to(device)
# The model emits one raw logit per sample, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()

optimizer = Adam(model.parameters(), lr=lr)

In [8]:
import numpy as np
import matplotlib.pyplot as plt


def accuracy(model, dataloader):
  """Return the fraction of samples in `dataloader` that `model` classifies correctly.

  Args:
    model: Module mapping a batch of inputs to logits; index 0 of the last
      dimension is used, and a logit greater than 0 counts as class 1.
    dataloader: Iterable yielding `(inputs, labels)` tensor pairs.

  Returns:
    Number of correct predictions divided by the number of samples.

  Raises:
    ZeroDivisionError: If `dataloader` yields no samples.
  """
  # Evaluate on the device the model lives on rather than relying on a
  # module-level `device` global. A model without parameters leaves the
  # batches where they are.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else None

  total = 0
  correct = 0

  # Evaluation never needs gradients; disabling them here keeps the function
  # cheap even when the caller forgets to.
  with torch.no_grad():
    for inputs, labels in dataloader:
      if target_device is not None:
        inputs, labels = inputs.to(target_device), labels.to(target_device)

      preds = (model(inputs) > 0).long()[..., 0]

      total += labels.shape[0]
      correct += (labels == preds).sum().item()

  return correct / total

In [9]:
n_epochs = 20

for epoch in range(n_epochs):
  # --- optimisation pass over the training set ---
  model.train()
  total_loss = 0.

  for inputs, labels in train_loader:
    inputs = inputs.to(device)
    labels = labels.to(device).float()  # BCEWithLogitsLoss expects float targets

    model.zero_grad()
    preds = model(inputs)[..., 0]  # drop the trailing logit dimension
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer.step()

    # Sum of per-batch losses over the epoch (not an average).
    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

  # --- evaluation on both splits, with dropout off and gradients disabled ---
  model.eval()
  with torch.no_grad():
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
  print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch   0 | Train Loss: 209.111279129982
Epoch   1 | Train Loss: 148.36042469739914
Epoch   2 | Train Loss: 118.54504172503948
Epoch   3 | Train Loss: 95.20489977300167
Epoch   4 | Train Loss: 74.2625348046422
Epoch   5 | Train Loss: 55.13052459806204
Epoch   6 | Train Loss: 41.68866502121091
Epoch   7 | Train Loss: 31.84616607055068
Epoch   8 | Train Loss: 26.017943282146007
Epoch   9 | Train Loss: 22.650948900263757
Epoch  10 | Train Loss: 20.734021531883627
Epoch  11 | Train Loss: 18.44123435812071
Epoch  12 | Train Loss: 17.392526387702674
Epoch  13 | Train Loss: 16.88445278443396
Epoch  14 | Train Loss: 15.766353387152776
Epoch  15 | Train Loss: 15.028951856307685
Epoch  16 | Train Loss: 13.238352596526965
Epoch  17 | Train Loss: 13.79516678955406
Epoch  18 | Train Loss: 14.393958314787596
Epoch  19 | Train Loss: 12.801313441479579
Epoch  20 | Train Loss: 10.297449073987082
Epoch  21 | Train Loss: 13.364861776586622
Epoch  22 | Train Loss: 12.540673050098121
Epoch  23 | Train Loss