In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel
from datasets import load_dataset

from tqdm import tqdm # Handy library for displaying progress bars
import warnings     # Used to silence unnecessary warnings
import os

# Settings
warnings.filterwarnings('ignore') # Ignore all warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence the tokenizer's warning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# config.py
# Notebook-wide configuration constants; later cells read these names directly.
BERT_MODEL_NAME = 'bert-base-uncased'
DATASET_NAME = 'SetFit/sst5'

# Value decided in file 1.0
MAX_LENGTH = 64 

# Experiment configuration
BATCH_SIZE = 8 # Only 8 samples are needed for a trial run
LEARNING_RATE = 2e-5

# Model configuration
LSTM_HIDDEN_SIZE = 256 # LSTM hidden-layer size
LSTM_LAYERS = 2      # Number of (stacked) BiLSTM layers
DROPOUT_RATE = 0.3
NUM_CLASSES = 5      # 5 sentiment classes

# Device (very important): use CUDA when available, otherwise fall back to CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

Using device: cpu


In [3]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

# Quick sanity check of the tokenizer on a single sentence.
test_text = "This movie is not bad, it's actually great!"
# Call the tokenizer object directly: `encode_plus` is deprecated in favour of
# `__call__`, which accepts the same keyword arguments.
encoding = tokenizer(
    test_text,
    add_special_tokens=True,    # Add [CLS] and [SEP]
    max_length=MAX_LENGTH,      # Target length for padding / truncation
    padding='max_length',       # Pad up to max_length
    truncation=True,            # Truncate if longer than max_length
    return_tensors='pt'         # Return PyTorch tensors
)

# Fail loudly if the tokenizer does not produce the fixed-length batch of one
# that the rest of the notebook relies on.
assert encoding['input_ids'].shape == (1, MAX_LENGTH), encoding['input_ids'].shape
assert encoding['attention_mask'].shape == (1, MAX_LENGTH), encoding['attention_mask'].shape

print("--- Test Tokenizer ---")
print("Input IDs shape:", encoding['input_ids'].shape)
print("Attention Mask shape:", encoding['attention_mask'].shape)
print("-" * 30)

--- Test Tokenizer ---
Input IDs shape: torch.Size([1, 64])
Attention Mask shape: torch.Size([1, 64])
------------------------------


In [4]:
class SST5PrototypeDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        data: Indexable, sized collection whose items can be subscripted with
            ``'text'`` and ``'label'``.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors that carry a
            leading batch dimension of size 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension removed) and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        # Fetch the raw sample
        sample = self.data[idx]
        text = sample['text']
        label = sample['label']

        # Tokenize the text. The tokenizer is called directly because
        # `encode_plus` is deprecated in favour of `__call__`, which takes the
        # same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # .squeeze(0) drops the batch dimension (the tokenizer returns
        # [1, max_length]); the label must be a long tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [5]:
class BertLSTMClassifier(nn.Module):
    """Frozen BERT -> stacked BiLSTM -> attention pooling -> linear classifier.

    Every constructor argument is optional; an argument left as ``None`` is
    read from the notebook-level constant of the same (upper-case) name, so
    ``BertLSTMClassifier()`` behaves as before.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``
            (default ``BERT_MODEL_NAME``).
        lstm_hidden_size: Hidden size per LSTM direction
            (default ``LSTM_HIDDEN_SIZE``).
        lstm_layers: Number of stacked BiLSTM layers (default ``LSTM_LAYERS``).
        dropout_rate: Dropout used between LSTM layers and before the
            classifier (default ``DROPOUT_RATE``).
        num_classes: Number of output classes (default ``NUM_CLASSES``).
    """

    def __init__(self, bert_model_name=None, lstm_hidden_size=None,
                 lstm_layers=None, dropout_rate=None, num_classes=None):
        super().__init__()

        # Fall back to the notebook configuration for anything not given.
        # `is None` (not truthiness) so that e.g. dropout_rate=0 is respected.
        if bert_model_name is None:
            bert_model_name = BERT_MODEL_NAME
        if lstm_hidden_size is None:
            lstm_hidden_size = LSTM_HIDDEN_SIZE
        if lstm_layers is None:
            lstm_layers = LSTM_LAYERS
        if dropout_rate is None:
            dropout_rate = DROPOUT_RATE
        if num_classes is None:
            num_classes = NUM_CLASSES

        # --- 1. BERT layer (feature extractor) ---
        self.bert = BertModel.from_pretrained(bert_model_name)
        # **FREEZE BERT**: it is not trained, only used to extract features.
        for param in self.bert.parameters():
            param.requires_grad = False
        # A frozen extractor must also have its dropout disabled, otherwise
        # the "fixed" features are randomly perturbed on every training step.
        self.bert.eval()

        bert_output_size = self.bert.config.hidden_size  # 768 for bert-base-uncased

        # --- 2. Stacked BiLSTM layer (encoder) ---
        self.lstm = nn.LSTM(
            input_size=bert_output_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            bidirectional=True,                # BiLSTM
            batch_first=True,                  # Important: [Batch, Seq, Feature]
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer LSTM.
            dropout=dropout_rate if lstm_layers > 1 else 0
        )

        lstm_output_size = lstm_hidden_size * 2  # two directions concatenated

        # --- 3. Attention layer ---
        # Learns one scalar score (W*h + b) per LSTM hidden state.
        self.attention_weights = nn.Linear(lstm_output_size, 1)

        # --- 4. Classification layer ---
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(lstm_output_size, num_classes)

    def train(self, mode=True):
        """Set the training mode, keeping a fully frozen BERT in eval mode.

        If BERT parameters are later unfrozen (``requires_grad = True``),
        BERT follows ``mode`` like every other sub-module.
        """
        super().train(mode)
        if not any(p.requires_grad for p in self.bert.parameters()):
            self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Compute class logits.

        Args:
            input_ids: Token ids of shape [B, S].
            attention_mask: Mask of shape [B, S]; positions equal to 0 are
                excluded from attention pooling.

        Returns:
            Logits of shape [B, num_classes].
        """
        # 1. BERT: [B, S] -> [B, S, bert_hidden]; last_hidden_state holds the features
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        bert_features = bert_output.last_hidden_state

        # 2. BiLSTM: [B, S, bert_hidden] -> [B, S, 2 * lstm_hidden]
        # lstm_output holds the hidden state of *every* time step; the final
        # (h_n, c_n) states are not needed.
        # NOTE(review): the LSTM also runs over padded steps; packing the
        # sequence would avoid that but assumes right-padding - confirm first.
        lstm_output, _ = self.lstm(bert_features)

        # 3. Attention: [B, S, 2 * lstm_hidden] -> [B, 2 * lstm_hidden]
        # One score (logit) per token: -> (B, S, 1)
        attn_logits = self.attention_weights(lstm_output)

        # Attention must ignore masked-out ([PAD]) positions: (B, S) -> (B, S, 1)
        attn_mask = attention_mask.unsqueeze(2)
        # Fill masked positions with the most negative finite value of the
        # tensor's dtype so softmax gives them ~0 weight. A literal like -1e9
        # is not representable in float16 and makes masked_fill raise under
        # half precision / autocast.
        fill_value = torch.finfo(attn_logits.dtype).min
        attn_logits = attn_logits.masked_fill(attn_mask == 0, fill_value)

        # Normalise over the sequence dimension to obtain weights: (B, S, 1)
        attn_scores = F.softmax(attn_logits, dim=1)

        # Context vector = attention-weighted sum over the sequence
        context_vector = torch.sum(lstm_output * attn_scores, dim=1)

        # 4. Classifier: [B, 2 * lstm_hidden] -> [B, num_classes]
        context_vector = self.dropout(context_vector)
        logits = self.classifier(context_vector)

        return logits

In [6]:
# End-to-end smoke test: load a few samples, build one batch, and run a single
# forward / backward / optimizer step to confirm that all tensor shapes line up.
print("--- B·∫ÆT ƒê·∫¶U KI·ªÇM TH·ª¨ PIPELINE ---")

# Fixed seed so that shuffling, weight initialisation and dropout (and hence
# the printed loss) are reproducible across Restart & Run All.
SMOKE_TEST_SEED = 42

try:
    torch.manual_seed(SMOKE_TEST_SEED)

    # 1. Load a small slice of the data
    print(f"[1/8] ƒêang t·∫£i {BATCH_SIZE * 2} m·∫´u t·ª´ '{DATASET_NAME}'...")
    raw_datasets = load_dataset(DATASET_NAME)
    small_train_data = raw_datasets['train'].select(range(BATCH_SIZE * 2))
    print("     T·∫£i xong!")

    # 2. Create the Dataset
    print("[2/8] ƒêang t·∫°o Prototype Dataset...")
    train_dataset = SST5PrototypeDataset(
        data=small_train_data,
        tokenizer=tokenizer,
        max_length=MAX_LENGTH
    )

    # 3. Create the DataLoader
    print("[3/8] ƒêang t·∫°o DataLoader...")
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True
    )

    # 4. Fetch ONE batch of data
    print("[4/8] ƒêang l·∫•y 1 batch d·ªØ li·ªáu...")
    batch = next(iter(train_loader))

    # Move the batch to DEVICE
    input_ids = batch['input_ids'].to(DEVICE)
    attention_mask = batch['attention_mask'].to(DEVICE)
    labels = batch['label'].to(DEVICE)

    print(f"     Input IDs shape: {input_ids.shape}")
    print(f"     Attention Mask shape: {attention_mask.shape}")
    print(f"     Labels shape: {labels.shape}")

    assert input_ids.shape == (BATCH_SIZE, MAX_LENGTH), (
        f"unexpected input_ids shape {tuple(input_ids.shape)}"
    )

    # 5. Instantiate the model (class defined in Cell 5)
    print("[5/8] ƒêang kh·ªüi t·∫°o m√¥ h√¨nh BertLSTMClassifier...")
    model = BertLSTMClassifier().to(DEVICE)
    # print(model) # Uncomment to inspect the architecture

    # 6. Create the optimizer and the loss function
    print("[6/8] ƒêang kh·ªüi t·∫°o Optimizer v√† Loss Function...")
    # Only hand trainable parameters to the optimizer; the frozen BERT weights
    # never receive gradients, so tracking them is pure overhead.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    # 7. RUN THE FORWARD PASS
    print("[7/8] ƒêang th·ª±c hi·ªán Forward Pass (model(inputs))...")
    optimizer.zero_grad()
    logits = model(input_ids, attention_mask)

    print(f"     Output Logits shape: {logits.shape}")
    assert logits.shape == (BATCH_SIZE, NUM_CLASSES), (
        f"unexpected logits shape {tuple(logits.shape)}"
    )
    print("     Forward Pass TH√ÄNH C√îNG!")

    # 8. COMPUTE THE LOSS AND RUN THE BACKWARD PASS
    print("[8/8] ƒêang th·ª±c hi·ªán Backward Pass (loss.backward())...")
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    print(f"     T√≠nh ƒë∆∞·ª£c Loss: {loss.item():.4f}")
    print("     Backward Pass TH√ÄNH C√îNG!")

    print("\n--- KI·ªÇM TH·ª¨ HO√ÄN T·∫§T --- üéâ")
    print("To√†n b·ªô pipeline ho·∫°t ƒë·ªông t·ªët. K√≠ch th∆∞·ªõc tensor ch√≠nh x√°c.")
    print("S·∫µn s√†ng ƒë·ªÉ chuy·ªÉn code n√†y sang c√°c file .py trong 'src/'!")

except Exception as e:
    print(f"\n--- !!! G·∫∂P L·ªñI !!! ---")
    print(f"L·ªói: {e}")
    # Re-raise instead of swallowing: a failed smoke test must fail the cell
    # (and "Run All"); the notebook then shows the full traceback itself.
    raise

--- B·∫ÆT ƒê·∫¶U KI·ªÇM TH·ª¨ PIPELINE ---
[1/8] ƒêang t·∫£i 16 m·∫´u t·ª´ 'SetFit/sst5'...


Repo card metadata block was not found. Setting CardData to empty.


     T·∫£i xong!
[2/8] ƒêang t·∫°o Prototype Dataset...
[3/8] ƒêang t·∫°o DataLoader...
[4/8] ƒêang l·∫•y 1 batch d·ªØ li·ªáu...
     Input IDs shape: torch.Size([8, 64])
     Attention Mask shape: torch.Size([8, 64])
     Labels shape: torch.Size([8])
[5/8] ƒêang kh·ªüi t·∫°o m√¥ h√¨nh BertLSTMClassifier...
[6/8] ƒêang kh·ªüi t·∫°o Optimizer v√† Loss Function...
[7/8] ƒêang th·ª±c hi·ªán Forward Pass (model(inputs))...
     Output Logits shape: torch.Size([8, 5])
     Forward Pass TH√ÄNH C√îNG!
[8/8] ƒêang th·ª±c hi·ªán Backward Pass (loss.backward())...
     T√≠nh ƒë∆∞·ª£c Loss: 1.6324
     Backward Pass TH√ÄNH C√îNG!

--- KI·ªÇM TH·ª¨ HO√ÄN T·∫§T --- üéâ
To√†n b·ªô pipeline ho·∫°t ƒë·ªông t·ªët. K√≠ch th∆∞·ªõc tensor ch√≠nh x√°c.
S·∫µn s√†ng ƒë·ªÉ chuy·ªÉn code n√†y sang c√°c file .py trong 'src/'!
