<a href="https://colab.research.google.com/github/once-upon-an-april/Thuc-Hanh-Deep-Learning-trong-Khoa-Hoc-Du-Lieu-DS201.Q11.1/blob/main/Bai4/22520975_Lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Bài 1: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán phân loại domain câu bình luận trên bộ dữ liệu [UIT-ViOCD](https://drive.google.com/drive/folders/1Lu9axyLkw7dMx80uLRgvCnZsmNzhJWAa?usp=sharing).

In [None]:
from datasets import load_dataset

# Download the ViOCD dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# own loading script run locally -- keep it only for repositories you trust.
try:
    ds = load_dataset("tarudesu/ViOCD", trust_remote_code=True)
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `ds`, so
    # swallowing the error here would only resurface as a NameError below.
    print(f"Có lỗi xảy ra: {e}")
    raise
else:
    print("Load dataset thành công!")
    print(ds)

    if 'train' in ds:
        print(f"Số lượng mẫu train: {len(ds['train'])}")
        print("Ví dụ mẫu đầu tiên:", ds['train'][0])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the training split to a pandas DataFrame for exploration.
df_train = ds['train'].to_pandas()

# Show five randomly sampled rows, restricted to the columns of interest.
preview_columns = ['review', 'review_tokenize', 'domain', 'label']
print(df_train.sample(5)[preview_columns])

In [None]:
# Number of training samples per domain.
domain_counts = df_train['domain'].value_counts()

print("Phân bố dữ liệu:", domain_counts)

# Bar chart of the per-domain sample counts.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=domain_counts.index,
    y=domain_counts.values,
    palette='viridis',
)
plt.title('Số lượng mẫu theo từng domain')
plt.xticks(rotation=45)
plt.show()

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Sentence length (in tokens) of every tokenized review.
df_train['length'] = df_train['review_tokenize'].apply(_count_words)

print(df_train['length'].describe())

# Histogram of the sentence lengths.
plt.figure(figsize=(10, 6))
plt.hist(
    df_train['length'],
    bins=50,
    color='skyblue',
    edgecolor='black',
)
plt.title('Phân phối độ dài câu')
plt.xlabel('Số từ')
plt.show()

In [None]:
# Count missing values per column.
null_counts = df_train.isnull().sum()
print("Số lượng giá trị null:\n", null_counts)

# Rows whose tokenized review is empty once surrounding whitespace is removed.
is_blank = df_train['review_tokenize'].str.strip() == ''
empty_reviews = df_train[is_blank]
print(f"Số lượng câu rỗng: {len(empty_reviews)}")

In [None]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by a geometric progression of frequencies.

    Args:
        d_model: Size of the feature (last) dimension; may be even or odd.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=500):
        super(PositionalEncoding, self).__init__()

        # Encoding table of shape (max_len, d_model).
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim div_term to match; for an even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Stored as a buffer of shape (1, max_len, d_model): it follows the
        # module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1), :]

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Feature size of the inputs and of the output.
        n_head: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_head == 0, "d_model must be divisible by n_head"

        self.d_k = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, n_head, seq, d_k)."""
        return projected.view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: Optional tensor broadcastable to the score tensor
                (batch, n_head, query_len, key_len); entries equal to 0 mark
                positions that must not be attended to.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = q.size(0)

        Q = self._split_heads(self.w_q(q), batch_size)
        K = self._split_heads(self.w_k(k), batch_size)
        V = self._split_heads(self.w_v(v), batch_size)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Use the most negative value representable in the score dtype: a
            # literal such as -1e9 cannot be stored in float16 and makes
            # masked_fill fail under half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn_probs = torch.softmax(scores, dim=-1)

        output = torch.matmul(attn_probs, V)

        # Merge the heads back: (batch, n_head, seq, d_k) -> (batch, seq, d_model).
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        return self.fc_out(output)

class PositionwiseFeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature size of the sequence representation.
        n_head: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used in both sub-layers.
    """

    def __init__(self, d_model, n_head, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply both sub-layers to ``x``; ``mask`` is passed to the attention."""
        # Sub-layer 1: self-attention (query, key and value are all x).
        attended = self.dropout(self.mha(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout(self.ffn(x))
        return self.norm2(x + transformed)

class TransformerEncoderClassifier(nn.Module):
    """Transformer encoder followed by mean pooling and a linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden feature size.
        n_head: Number of attention heads in every encoder layer.
        n_layers: Number of stacked encoder layers.
        n_classes: Number of output classes.
        max_len: Number of positions the positional encoding is built for.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self, vocab_size, d_model, n_head, n_layers, n_classes, max_len=256, dropout=0.1):
        super(TransformerEncoderClassifier, self).__init__()

        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_head, d_ff=d_model*4, dropout=dropout)
            for _ in range(n_layers)
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, n_classes)

    @staticmethod
    def _pool(x, mask):
        """Average token vectors over the sequence, skipping masked positions.

        The mask is used only when it has exactly one entry per token (for
        example shape (batch, 1, 1, seq)); otherwise a plain mean is taken.
        """
        batch, seq_len = x.size(0), x.size(1)
        if mask is None or mask.numel() != batch * seq_len:
            return x.mean(dim=1)

        keep = (mask != 0).reshape(batch, seq_len, 1).to(x.dtype)
        # clamp avoids dividing by zero for a row that is entirely masked.
        return (x * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)

    def forward(self, x, mask=None):
        """Return class logits of shape (batch, n_classes).

        Args:
            x: Integer token ids of shape (batch, seq).
            mask: Optional attention mask handed to every encoder layer;
                entries equal to 0 mark positions to ignore. It is also used
                to leave those positions out of the pooled average, so the
                logits do not depend on how much padding the batch added.
        """
        x = self.embedding(x) * math.sqrt(self.d_model)  # embedding scaling, as in the paper
        x = self.pos_encoder(x)
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        return self.fc_out(self._pool(x, mask))

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from sklearn.metrics import classification_report
import torch.optim as optim

# Build the vocabulary from the training split
def build_vocab(dataset, min_freq=2):
    """Map every sufficiently frequent token to an integer id.

    Args:
        dataset: Mapping-like object whose ``'review_tokenize'`` entry is an
            iterable of whitespace-tokenized strings.
        min_freq: Minimum number of occurrences (after lower-casing) a token
            needs to receive its own id.

    Returns:
        Dict from token to id. ``'<PAD>'`` is 0 and ``'<UNK>'`` is 1; the other
        tokens are numbered in order of first appearance.
    """
    token_counts = Counter()
    for text in dataset['review_tokenize']:
        token_counts.update(text.lower().split())

    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in token_counts.items():
        if count >= min_freq:
            vocab[token] = len(vocab)
    return vocab

vocab = build_vocab(ds['train'])
print(f"Kích thước từ điển: {len(vocab)}")

# Label map: each domain name -> its position in the `domains` list.
domains = ['app', 'fashion', 'cosmetic', 'mobile']
label_map = dict(zip(domains, range(len(domains))))
print(f"Label Map: {label_map}")

# Batch collation
MAX_LEN = 256


def collate_batch(batch):
    """Turn a list of dataset rows into padded id and label tensors.

    Each row's ``'review_tokenize'`` text is lower-cased, split on whitespace,
    mapped through the module-level ``vocab`` (unknown tokens become
    ``'<UNK>'``) and cut to at most ``MAX_LEN`` ids. Its ``'domain'`` is mapped
    through the module-level ``label_map``.

    Returns:
        ``(token_ids, labels)``: a (batch, longest_sequence) long tensor padded
        with the ``'<PAD>'`` id, and a (batch,) long tensor.
    """
    unk_id = vocab['<UNK>']
    labels, sequences = [], []

    for sample in batch:
        labels.append(label_map[sample['domain']])

        words = sample['review_tokenize'].lower().split()
        ids = [vocab.get(word, unk_id) for word in words]
        sequences.append(torch.tensor(ids[:MAX_LEN], dtype=torch.long))

    padded = pad_sequence(sequences, batch_first=True, padding_value=vocab['<PAD>'])
    return padded, torch.tensor(labels, dtype=torch.long)

# Build the DataLoaders; only the training loader shuffles.
BATCH_SIZE = 64 # Raised to 64 because sentences are short (original note: mean=27)
train_loader = DataLoader(ds['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(ds['validation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(ds['test'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the 3-layer encoder classifier and move it to `device`.
model = TransformerEncoderClassifier(
    vocab_size=len(vocab),
    d_model=128,
    n_head=4,
    n_layers=3,
    n_classes=len(domains),
    max_len=MAX_LEN,
    dropout=0.1
).to(device)

# Per-class loss weights intended to counter class imbalance.
# NOTE(review): these values are hard-coded, not computed in this file;
# presumably they are ordered like `domains` -- confirm against the label counts.
class_weights = torch.tensor([0.68, 0.80, 1.05, 3.02], dtype=torch.float).to(device)

criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=1e-4) # Small learning rate for the Transformer

def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``.

    Uses the module-level ``device`` and ``vocab``. Each batch is moved to
    ``device`` and given a padding mask of shape (batch, 1, 1, seq) that is
    True wherever the token id differs from the ``'<PAD>'`` id.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for text, labels in loader:
        text = text.to(device)
        labels = labels.to(device)

        # Padding mask, broadcastable over heads and query positions.
        pad_mask = (text != vocab['<PAD>']).unsqueeze(1).unsqueeze(2)

        optimizer.zero_grad()
        logits = model(text, pad_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(loader)

# Evaluation function
def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating it.

    Uses the module-level ``device`` and ``vocab``; batches are masked the same
    way as during training (True wherever the token is not ``'<PAD>'``).

    Returns:
        ``(mean_loss, preds, targets)``: the sum of batch losses divided by the
        number of batches, and two flat lists with the argmax prediction and
        the true label of every sample.
    """
    model.eval()
    loss_sum = 0
    preds, targets = [], []

    with torch.no_grad():
        for text, labels in loader:
            text = text.to(device)
            labels = labels.to(device)
            pad_mask = (text != vocab['<PAD>']).unsqueeze(1).unsqueeze(2)

            logits = model(text, pad_mask)
            loss_sum += criterion(logits, labels).item()

            batch_pred = torch.argmax(logits, dim=1)
            preds.extend(batch_pred.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    return loss_sum / len(loader), preds, targets

NUM_EPOCHS = 10
print(f"Bắt đầu huấn luyện trên {device}...")

# Train for a fixed number of epochs, reporting train and validation loss.
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, _, _ = evaluate(model, val_loader, criterion)

    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}"
    )

# Final evaluation on the held-out test split.
test_loss, preds, targets = evaluate(model, test_loader, criterion)
print("\n=== KẾT QUẢ TRÊN TẬP TEST ===")
report = classification_report(targets, preds, target_names=domains)
print(report)

### Bài 2: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán gán nhãn chuỗi trên bộ dữ liệu [PhoNER_COVID19](https://github.com/VinAIResearch/PhoNER_COVID19).

In [None]:
import pandas as pd
import requests
import json

def load_phoner_data(url, timeout=30):
    """Download a JSON-lines resource and return its records as a DataFrame.

    Args:
        url: Address of a text resource holding one JSON document per line.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per line that parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error; otherwise the error page would simply fail
    # to parse line by line and yield an empty DataFrame.
    response.raise_for_status()

    records = []
    for line in response.text.strip().split('\n'):
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Lines that are not valid JSON are skipped.
            continue
    return pd.DataFrame(records)

print("Đang tải dữ liệu PhoNERT...")
base_url = "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/"

# Fetch the three syllable-level splits in train / dev / test order.
train_df, val_df, test_df = (
    load_phoner_data(base_url + file_name)
    for file_name in ("train_syllable.json", "dev_syllable.json", "test_syllable.json")
)

print(f"Số lượng mẫu Train: {len(train_df)}")
print(f"Số lượng mẫu Val: {len(val_df)}")
print(f"Số lượng mẫu Test: {len(test_df)}")

# Show the first training sentence together with its tags.
first_sample = train_df.iloc[0]
print("\n--- Mẫu dữ liệu đầu tiên ---")
print("Câu:", first_sample['words'])
print("Nhãn:", first_sample['tags'])

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# 1. Build the vocabulary
def build_vocab(text_lists, min_freq=2):
    """Map every sufficiently frequent token to an integer id.

    Args:
        text_lists: Iterable of token sequences (each an iterable of strings).
        min_freq: Minimum number of occurrences (after lower-casing) a token
            needs to receive its own id.

    Returns:
        Dict from token to id. ``'<PAD>'`` is 0 and ``'<UNK>'`` is 1; the other
        tokens are numbered in order of first appearance.
    """
    token_counts = Counter(token.lower() for seq in text_lists for token in seq)

    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in token_counts.items():
        if count >= min_freq:
            vocab[token] = len(vocab)
    return vocab

vocab = build_vocab(train_df['words'])
print(f"Kích thước từ điển: {len(vocab)}")

# 2. Build the tag map
# Collect every tag that occurs in the training split, sorted for a stable order.
all_tags = sorted({tag for seq in train_df['tags'] for tag in seq})
tag_map = dict(zip(all_tags, range(len(all_tags))))
print(f"Số lượng nhãn ({len(tag_map)}): {tag_map}")

# 3. Dataset class
class NERDataset(Dataset):
    """Dataset yielding ``(word_ids, tag_ids)`` tensor pairs from a DataFrame.

    Args:
        df: DataFrame with a ``'words'`` and a ``'tags'`` column, each holding
            one sequence per row.
        vocab: Dict from lower-cased word to id; must contain ``'<UNK>'``.
        tag_map: Dict from tag to id.
    """

    def __init__(self, df, vocab, tag_map):
        self.df, self.vocab, self.tag_map = df, vocab, tag_map

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``idx``-th row as two long tensors of ids."""
        row = self.df.iloc[idx]
        unk_id = self.vocab['<UNK>']

        # Words are lower-cased before lookup; unseen words map to <UNK>.
        word_ids = torch.tensor(
            [self.vocab.get(word.lower(), unk_id) for word in row['words']],
            dtype=torch.long,
        )
        tag_ids = torch.tensor(
            [self.tag_map[tag] for tag in row['tags']],
            dtype=torch.long,
        )
        return word_ids, tag_ids

# 4. Collate function (handles padding)
def collate_fn(batch):
    """Pad a list of ``(word_ids, tag_ids)`` pairs to a common length.

    Word ids are padded with the ``'<PAD>'`` id from the module-level ``vocab``;
    tag ids are padded with -100.

    Returns:
        ``(text_padded, label_padded)``, both batch-first.
    """
    texts = [text for text, _ in batch]
    labels = [label for _, label in batch]

    text_padded = pad_sequence(texts, batch_first=True, padding_value=vocab['<PAD>'])
    label_padded = pad_sequence(labels, batch_first=True, padding_value=-100)
    return text_padded, label_padded

# Build the DataLoaders; only the training loader shuffles.
BATCH_SIZE = 32

train_dataset = NERDataset(train_df, vocab, tag_map)
val_dataset = NERDataset(val_df, vocab, tag_map)
test_dataset = NERDataset(test_df, vocab, tag_map)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("Đã tạo xong DataLoader!")

In [None]:
import torch.nn as nn
import math

class TransformerTokenClassifier(nn.Module):
    """Transformer encoder with a per-token linear classification head.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden feature size.
        n_head: Number of attention heads in every encoder layer.
        n_layers: Number of stacked encoder layers.
        n_classes: Number of tags predicted for each token.
        max_len: Number of positions the positional encoding is built for.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self, vocab_size, d_model, n_head, n_layers, n_classes, max_len=500, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        # Stack of identical encoder layers.
        encoder_stack = [
            EncoderLayer(d_model, n_head, d_ff=d_model*4, dropout=dropout)
            for _ in range(n_layers)
        ]
        self.layers = nn.ModuleList(encoder_stack)

        self.dropout = nn.Dropout(dropout)

        # Projects each token vector from d_model to n_classes scores.
        self.fc_out = nn.Linear(d_model, n_classes)

    def forward(self, x, mask=None):
        """Return tag scores of shape (batch, seq, n_classes).

        Args:
            x: Integer token ids of shape (batch, seq).
            mask: Optional attention mask handed to every encoder layer.
        """
        # Scaled embedding, positional encoding, then dropout.
        hidden = self.embedding(x) * math.sqrt(self.d_model)
        hidden = self.dropout(self.pos_encoder(hidden))

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)

        # One score vector per token.
        return self.fc_out(hidden)

In [None]:
import torch.optim as optim
from sklearn.metrics import classification_report

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters of the token classifier and its training run.
D_MODEL = 128
N_HEAD = 4
N_LAYERS = 3
NUM_TAGS = len(tag_map)  # one output class per tag in tag_map
LR = 1e-4
EPOCHS = 30

# Instantiate the model and move it to `device`.
model = TransformerTokenClassifier(len(vocab), D_MODEL, N_HEAD, N_LAYERS, NUM_TAGS).to(device)

# Loss function that ignores targets equal to -100.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=LR)

print(f"Mô hình đang chạy trên: {device}")

# Training function
def train_ner(model, loader):
    """Train ``model`` for one pass over ``loader``.

    Uses the module-level ``device``, ``vocab``, ``optimizer``, ``criterion``
    and ``NUM_TAGS``.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for text, labels in loader:
        text = text.to(device)
        labels = labels.to(device)

        # Padding mask of shape (batch, 1, 1, seq): True for non-<PAD> tokens.
        pad_mask = (text != vocab['<PAD>']).unsqueeze(1).unsqueeze(2)

        optimizer.zero_grad()
        logits = model(text, pad_mask)

        # Flatten to (batch * seq, tags) and (batch * seq,) for the loss.
        flat_logits = logits.view(-1, NUM_TAGS)
        flat_labels = labels.view(-1)
        loss = criterion(flat_logits, flat_labels)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(loader)

# Evaluation function
def evaluate_ner(model, loader):
    """Collect token-level true and predicted tag ids over ``loader``.

    Uses the module-level ``device`` and ``vocab``. Positions whose label is
    -100 are left out of both lists.

    Returns:
        ``(true_labels, pred_labels)``: two flat lists of equal length, ordered
        batch by batch, sentence by sentence, token by token.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for text, labels in loader:
            text, labels = text.to(device), labels.to(device)
            mask = (text != vocab['<PAD>']).unsqueeze(1).unsqueeze(2)

            output = model(text, mask)
            preds = torch.argmax(output, dim=-1)  # (batch, seq)

            # Select the labelled positions with a boolean mask rather than a
            # per-sentence prefix slice: this stays correct even if an ignored
            # label is not trailing, and needs one host transfer per batch
            # instead of two per sentence. Row-major selection keeps the order.
            keep = labels != -100
            pred_labels.extend(preds[keep].cpu().numpy())
            true_labels.extend(labels[keep].cpu().numpy())

    return true_labels, pred_labels

# Train for a fixed number of epochs, reporting the mean training loss.
for epoch in range(EPOCHS):
    loss = train_ner(model, train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {loss:.4f}")

# Final evaluation on the test split
print("\n--- KẾT QUẢ TRÊN TẬP TEST ---")
true_tags, pred_tags = evaluate_ner(model, test_loader)

# Translate tag ids back to tag names for a readable report.
inv_tag_map = {index: tag for tag, index in tag_map.items()}
true_tag_names = [inv_tag_map[tag_id] for tag_id in true_tags]
pred_tag_names = [inv_tag_map[tag_id] for tag_id in pred_tags]

report = classification_report(true_tag_names, pred_tag_names)
print(report)