In [2]:
import torch
import pandas as pd
import numpy as np
import random
from torch.utils.data import Dataset
from transformers import AutoTokenizer

from model.bi_lstm import BiLSTMAttentionABSA
from model.pho_bert import PhoBERT_ABSA

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

Device: cuda


In [3]:
class ABSATestDataset(Dataset):
    """Test-time dataset that tokenises reviews and exposes per-aspect labels.

    Each CSV row must provide a ``Cleaned_Review`` column plus one column per
    entry of ``self.aspects``. An optional ``Review`` column carries the text
    returned for display; when it is absent the cleaned text is used instead.
    """

    def __init__(self, csv_file, tokenizer, max_len=128):
        """Load the CSV and remember the tokenizer settings.

        Args:
            csv_file: Anything ``pandas.read_csv`` accepts (path or file-like).
            tokenizer: Callable invoked as in ``__getitem__``; it must return a
                mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_len: Length passed to the tokenizer as ``max_length``.
        """
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Order matters: position i in the returned label tensor is aspects[i].
        self.aspects = ['Price', 'Shipping', 'Outlook', 'Quality', 'Size', 'Shop_Service', 'General', 'Others']

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the model inputs and labels for the row at position ``index``.

        Returns:
            dict with keys ``'original_review'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tokenizer tensors) and ``'labels'``
            (``torch.long`` tensor with one entry per aspect).
        """
        # Fetch the row once; every .iloc[index] call builds a new Series, and
        # the original code repeated that lookup for each column it read.
        row = self.data.iloc[index]

        cleaned_review = str(row['Cleaned_Review'])

        if 'Review' in self.data.columns:
            original_review = str(row['Review'])
        else:
            original_review = cleaned_review

        # A raw label of -1 is remapped to class id 3; every other value is
        # passed through unchanged.
        labels = []
        for aspect in self.aspects:
            label = row[aspect]
            labels.append(3 if label == -1 else label)

        encoding = self.tokenizer(
            cleaned_review,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'original_review': original_review,
            # The tokenizer is asked for 'pt' tensors; flatten() drops the
            # leading batch axis so each item is one-dimensional.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [4]:
# The PhoBERT tokenizer produces the input_ids that the test dataset hands out.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

test_dataset = ABSATestDataset('datasets_cleaned/test_data.csv', tokenizer)

print("ƒêang n·∫°p tr·ªçng s·ªë BiLSTM-Attention...")
bilstm_model = BiLSTMAttentionABSA(vocab_size=64001, embedding_dim=256, hidden_dim=256).to(device)
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code. The result is fed
# directly to load_state_dict, so nothing beyond tensor data is needed.
bilstm_model.load_state_dict(
    torch.load('saved_models/bilstm_absa_weights.pth', map_location=device, weights_only=True)
)
# eval() puts the module in evaluation mode before any prediction is made.
bilstm_model.eval()
print("\nSuccess!")

print("ƒêang n·∫°p tr·ªçng s·ªë PhoBERT...")
phobert_model = PhoBERT_ABSA().to(device)
phobert_model.load_state_dict(
    torch.load('saved_models/phobert_absa_weights.pth', map_location=device, weights_only=True)
)
phobert_model.eval()
print("\nSuccess!")



ƒêang n·∫°p tr·ªçng s·ªë BiLSTM-Attention...

Success!
ƒêang n·∫°p tr·ªçng s·ªë PhoBERT...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 627.76it/s, Materializing param=pooler.dense.weight]                               
[1mRobertaModel LOAD REPORT[0m from: vinai/phobert-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
lm_head.dense.weight            | UNEXPECTED |  | 
lm_head.layer_norm.weight       | UNEXPECTED |  | 
lm_head.bias                    | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 
lm_head.layer_norm.bias         | UNEXPECTED |  | 
lm_head.dense.bias              | UNEXPECTED |  | 
lm_head.decoder.weight          | UNEXPECTED |  | 
lm_head.decoder.bias            | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Success!


In [5]:
# Aspect order must match the order of the label tensor produced by the dataset.
aspects = ['Price', 'Shipping', 'Outlook', 'Quality', 'Size', 'Shop_Service', 'General', 'Others']
# Display text for each class id.
label_map = {0: 'Ti√™u c·ª±c üî¥', 1: 'T√≠ch c·ª±c üü¢', 2: 'Trung t√≠nh üü°', 3: 'Kh√¥ng ƒë·ªÅ c·∫≠p ‚ö™'}
# Class id of the "aspect absent" label (the last entry of label_map). Rows
# where the ground truth and both models agree on it are not printed.
NOT_MENTIONED = 3


def compare_on_test_sample(sample_idx):
    """Print ground truth next to both models' predictions for one test item.

    Reads ``test_dataset``, ``bilstm_model``, ``phobert_model`` and ``device``
    from the notebook namespace. Only aspects where the ground truth or at
    least one prediction differs from ``NOT_MENTIONED`` are listed; if none
    qualifies a single notice line is printed instead.

    Args:
        sample_idx: Position of the item inside ``test_dataset``.
    """
    sample = test_dataset[sample_idx]
    review_text = sample['original_review']

    print('=' * 100)
    print(f"üìù C√¢u g·ªëc [{sample_idx}]: '{review_text}'")
    print('-' * 100)

    # Add a leading batch axis of size 1 for the models.
    input_ids = sample['input_ids'].unsqueeze(0).to(device)
    attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
    # Plain Python ints keep the comparisons and dict lookups below simple.
    true_labels = sample['labels'].tolist()

    with torch.no_grad():
        bilstm_outputs = bilstm_model(input_ids)
        phobert_outputs = phobert_model(input_ids=input_ids, attention_mask=attention_mask)

    print(f"{'Kh√≠a c·∫°nh':<15} | {'Th·ª±c t·∫ø (Ground Truth)':<22} | {'BiLSTM D·ª± ƒëo√°n':<25} | {'PhoBERT D·ª± ƒëo√°n':<25}")
    print("-" * 100)

    found_any = False
    for i, aspect in enumerate(aspects):
        true_l = true_labels[i]
        # Each model output is indexed by aspect position; argmax over dim=1
        # assumes per-aspect logits shaped (batch=1, num_classes).
        bilstm_pred = torch.argmax(bilstm_outputs[i], dim=1).item()
        phobert_pred = torch.argmax(phobert_outputs[i], dim=1).item()

        if true_l == NOT_MENTIONED and bilstm_pred == NOT_MENTIONED and phobert_pred == NOT_MENTIONED:
            continue

        bilstm_mark = "‚úÖ" if bilstm_pred == true_l else "‚ùå"
        phobert_mark = "‚úÖ" if phobert_pred == true_l else "‚ùå"

        # Pad the whole "label + mark" cell to the header's 25-character
        # column; padding only the mark left the separators misaligned.
        bilstm_cell = f"{label_map[bilstm_pred]} {bilstm_mark}"
        phobert_cell = f"{label_map[phobert_pred]} {phobert_mark}"

        print(f"{aspect:<15} | {label_map[true_l]:<22} | {bilstm_cell:<25} | {phobert_cell}")
        found_any = True

    if not found_any:
        print("(C√¢u n√†y ho√†n to√†n kh√¥ng ch·ª©a ƒë√°nh gi√° khen/ch√™ n√†o)")
    print(f"{'='*100}\n")

In [7]:
num_samples = 20

# Fixed seed so the same test items are drawn on every run.
random.seed(42)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the dataset size.
random_indices = random.sample(
    range(len(test_dataset)),
    min(num_samples, len(test_dataset)),
)

print("üöÄ B·∫ÆT ƒê·∫¶U SO S√ÅNH HAI M√î H√åNH TR√äN T·∫¨P TEST TH·ª∞C T·∫æ:\n")
for idx in random_indices:
    compare_on_test_sample(idx)

üöÄ B·∫ÆT ƒê·∫¶U SO S√ÅNH HAI M√î H√åNH TR√äN T·∫¨P TEST TH·ª∞C T·∫æ:

üìù C√¢u g·ªëc [456]: 'Ngo√†i vi·ªác m√†u b·ªã lem ra ƒë·∫ø c√≤n l·∫°i th√¨ oke'
----------------------------------------------------------------------------------------------------
Kh√≠a c·∫°nh       | Th·ª±c t·∫ø (Ground Truth) | BiLSTM D·ª± ƒëo√°n            | PhoBERT D·ª± ƒëo√°n          
----------------------------------------------------------------------------------------------------
Outlook         | Ti√™u c·ª±c üî¥             | Ti√™u c·ª±c üî¥ ‚úÖ        | Ti√™u c·ª±c üî¥ ‚úÖ
General         | Kh√¥ng ƒë·ªÅ c·∫≠p ‚ö™         | Trung t√≠nh üü° ‚ùå        | Trung t√≠nh üü° ‚ùå

üìù C√¢u g·ªëc [102]: 'Gi√†y m·ªõi th√¨ mang v√†i th·∫•y h∆°i ch·∫≠t th√¥i mang nhi·ªÅu gi√†y s·∫Ω gi√£n ra theo ch√¢n l√† oke ko c·∫ßn mua size to h∆°n l√†m j'
----------------------------------------------------------------------------------------------------
Kh√≠a c·∫°nh       | Th·ª±c t·∫ø (Ground Truth) | BiLSTM D·ª± ƒëo√