In [None]:
# Detect environment
# The notebook is treated as running on Google Colab when the `google.colab`
# module has already been loaded into this interpreter.
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Mount Google Drive at /content/drive so the project folder is reachable.
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Install required packages
    # NOTE(review): versions are unpinned; consider pinning them and using
    # `%pip` so the install always targets the kernel's environment.
    !pip install underthesea pyvi -q
    
    # Set paths for Colab
    BASE_PATH = '/content/drive/MyDrive/learniverse-ai'
else:
    # Local paths
    BASE_PATH = '..'  # Assuming notebook is in notebooks/

# Report which branch was taken and the resulting project root.
print(f"Running in: {'Google Colab' if IN_COLAB else 'Local Environment'}")
print(f"Base path: {BASE_PATH}")

In [None]:
# Import libraries
import os
import sys
import json
import pickle
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    f1_score, 
    precision_score, 
    recall_score,
    confusion_matrix,
    multilabel_confusion_matrix
)

# Make the project's `src/` directory importable (it takes priority over
# anything else on sys.path because it is inserted at position 0)
sys.path.insert(0, str(Path(BASE_PATH) / 'src'))

from preprocessing.text_preprocessor import VietnameseTextPreprocessor
from models.bilstm import BiLSTMClassifier, MultiLabelFocalLoss, get_model_config
from models.dataset import Vocabulary, CommentDataset, collate_fn

# Reproducibility: seed PyTorch and NumPy with the same fixed value
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"Using device: {device}")

if has_cuda:
    # Report the name and total memory of the first CUDA device
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")

## 1. Load and Explore Data

### Expected Data Format

**ViHSD:**
- Columns: `free_text`, `label_id` (0=CLEAN, 1=OFFENSIVE, 2=HATE)

**ViCTSD:**
- Columns: `Comment`, `Constructive`, `Toxic`

In [None]:
# Project data layout: everything lives under BASE_PATH/data
data_root = Path(BASE_PATH) / 'data'
DATA_RAW_PATH = data_root / 'raw'
DATA_PROCESSED_PATH = data_root / 'processed'
MODELS_PATH = data_root / 'models'

# Output folders are created on demand; the raw folder must be provided
for output_dir in (DATA_PROCESSED_PATH, MODELS_PATH):
    output_dir.mkdir(parents=True, exist_ok=True)

for description, location in (
    ("Raw data path", DATA_RAW_PATH),
    ("Processed data path", DATA_PROCESSED_PATH),
    ("Models path", MODELS_PATH),
):
    print(f"{description}: {location}")

In [None]:
# Show what is currently present in the raw data folder
print("Files in raw data folder:")
if not DATA_RAW_PATH.exists():
    print("  Raw data folder not found!")
    print(f"  Please create: {DATA_RAW_PATH}")
else:
    for entry in DATA_RAW_PATH.iterdir():
        print(f"  - {entry.name}")

In [None]:
def _load_splits(data_path: Path, prefix: str, required_columns) -> "pd.DataFrame | None":
    """
    Read `<prefix>_train.csv`, `<prefix>_dev.csv` and `<prefix>_test.csv`.

    Args:
        data_path: Folder that holds the CSV files.
        prefix: File-name prefix of the dataset (e.g. 'vihsd').
        required_columns: Column names every CSV file must contain.

    Returns:
        All files that were found, stacked into one frame with an added
        `split` column, or None when none of the files exist.

    Raises:
        KeyError: If a file that exists lacks one of `required_columns`.
    """
    frames = []

    for split in ('train', 'dev', 'test'):
        file_path = data_path / f'{prefix}_{split}.csv'
        if not file_path.exists():
            print(f"Warning: {file_path} not found")
            continue

        frame = pd.read_csv(file_path)
        # Fail here, naming the file, rather than later with a KeyError on a
        # renamed column that does not appear in the CSV at all.
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise KeyError(f"{file_path} is missing required column(s): {missing}")

        frame['split'] = split
        frames.append(frame)
        print(f"Loaded {split}: {len(frame)} samples")

    if not frames:
        return None

    return pd.concat(frames, ignore_index=True)


def load_vihsd(data_path: Path) -> "pd.DataFrame | None":
    """
    Load ViHSD dataset.
    Expected files: vihsd_train.csv, vihsd_dev.csv, vihsd_test.csv

    Args:
        data_path: Folder that holds the CSV files.

    Returns:
        A frame with the original columns renamed (`free_text` -> `text`,
        `label_id` -> `label`) plus `split`, `toxic_offensive`,
        `hate_speech` and `source`; None when no file was found.

    Raises:
        KeyError: If a file lacks the `free_text` or `label_id` column.
    """
    combined = _load_splits(data_path, 'vihsd', ('free_text', 'label_id'))
    if combined is None:
        return None

    # Rename columns to standard format
    combined = combined.rename(columns={
        'free_text': 'text',
        'label_id': 'label'
    })

    # Map labels: 0=CLEAN, 1=OFFENSIVE, 2=HATE
    # Our labels: toxic_offensive (1 or 2), hate_speech (2 only)
    combined['toxic_offensive'] = (combined['label'] >= 1).astype(int)
    combined['hate_speech'] = (combined['label'] == 2).astype(int)
    combined['source'] = 'vihsd'

    return combined


def load_victsd(data_path: Path) -> "pd.DataFrame | None":
    """
    Load UIT-ViCTSD dataset.
    Expected files: victsd_train.csv, victsd_dev.csv, victsd_test.csv

    Args:
        data_path: Folder that holds the CSV files.

    Returns:
        A frame with the original columns renamed (`Comment` -> `text`,
        `Toxic` -> `toxic_offensive`) plus `split`, `hate_speech` (always 0)
        and `source`; None when no file was found.

    Raises:
        KeyError: If a file lacks the `Comment` or `Toxic` column.
    """
    combined = _load_splits(data_path, 'victsd', ('Comment', 'Toxic'))
    if combined is None:
        return None

    # Rename columns to standard format
    combined = combined.rename(columns={
        'Comment': 'text',
        'Toxic': 'toxic_offensive'
    })

    # ViCTSD doesn't have hate_speech, set to 0
    combined['hate_speech'] = 0
    combined['source'] = 'victsd'

    return combined

In [None]:
# Load both corpora; each loader yields None when its files are absent
print("Loading ViHSD...")
vihsd_df = load_vihsd(DATA_RAW_PATH)

print("\nLoading ViCTSD...")
victsd_df = load_victsd(DATA_RAW_PATH)

# Keep only the shared schema from every corpus that was actually found
shared_columns = ['text', 'toxic_offensive', 'hate_speech', 'split', 'source']
dfs_to_combine = [
    corpus[shared_columns]
    for corpus in (vihsd_df, victsd_df)
    if corpus is not None
]

if not dfs_to_combine:
    print("\n⚠️ No datasets found! Please add data files to data/raw/")
    print("Expected files:")
    print("  - vihsd_train.csv, vihsd_dev.csv, vihsd_test.csv")
    print("  - victsd_train.csv, victsd_dev.csv, victsd_test.csv")
    df = None
else:
    df = pd.concat(dfs_to_combine, ignore_index=True)
    print(f"\nCombined dataset: {len(df)} samples")

In [None]:
# Explore data
if df is not None:
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()

    print("\nSample data:")
    display(df.head(10))

    print("\nLabel distribution:")
    print(f"toxic_offensive: {df['toxic_offensive'].value_counts().to_dict()}")
    print(f"hate_speech: {df['hate_speech'].value_counts().to_dict()}")

    # Number of positive examples of each label contributed by each corpus
    print("\nBy source:")
    print(df.groupby('source')[['toxic_offensive', 'hate_speech']].sum())

In [None]:
# Visualize label distribution
if df is not None:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # (column, panel title, tick labels for class 0 and class 1)
    binary_panels = [
        ('toxic_offensive', 'Toxic/Offensive Distribution',
         ['Clean (0)', 'Toxic/Offensive (1)']),
        ('hate_speech', 'Hate Speech Distribution',
         ['No (0)', 'Yes (1)']),
    ]

    for ax, (column, title, tick_labels) in zip(axes, binary_panels):
        # value_counts() orders by frequency and omits absent classes.
        # Reindexing pins the bars to [0, 1] so the fixed tick labels and
        # colours always line up, even when one class is missing or class 1
        # is the majority.
        counts = df[column].value_counts().reindex([0, 1], fill_value=0)
        counts.plot(kind='bar', ax=ax, color=['green', 'red'])
        ax.set_title(title)
        ax.set_xlabel('Label')
        ax.set_ylabel('Count')
        ax.set_xticklabels(tick_labels, rotation=0)

    # Source distribution
    ax3 = axes[2]
    df['source'].value_counts().plot(kind='bar', ax=ax3)
    ax3.set_title('Data Source Distribution')
    ax3.set_xlabel('Source')
    ax3.set_ylabel('Count')
    ax3.set_xticklabels(ax3.get_xticklabels(), rotation=0)

    plt.tight_layout()
    plt.show()

## 2. Preprocess Data

In [None]:
# Text-cleaning options handed to the Vietnamese preprocessor
preprocessor_options = dict(
    lowercase=True,
    remove_urls=True,
    remove_emails=True,
    remove_phones=True,
    remove_emojis=False,  # Keep emojis - they can indicate sentiment
    normalize_teencode=True,
    normalize_repeated_chars=True,
    word_segmentation=True,  # Vietnamese word segmentation
)
preprocessor = VietnameseTextPreprocessor(**preprocessor_options)

print("Preprocessor initialized")

In [None]:
# Run the preprocessor over every comment, with a progress bar
if df is not None:
    print("Preprocessing texts...")
    tqdm.pandas(desc="Preprocessing")

    df['text_processed'] = df['text'].progress_apply(preprocessor.preprocess)

    # Print the first few comments before and after cleaning (at most 5)
    print("\nPreprocessing examples:")
    preview = zip(df['text'].head(5), df['text_processed'].head(5))
    for raw_text, clean_text in preview:
        print(f"\nOriginal:  {raw_text[:100]}...")
        print(f"Processed: {clean_text[:100]}...")

In [None]:
# Drop rows whose text became empty during preprocessing
if df is not None:
    rows_before = len(df)
    has_content = df['text_processed'].str.len() > 0
    df = df[has_content].reset_index(drop=True)
    print(f"Removed {rows_before - len(df)} empty texts")
    print(f"Final dataset size: {len(df)}")

In [None]:
# Analyze text lengths
if df is not None:
    # Length threshold to inspect. It is defined once, up front, so the
    # plotted marker, its legend label and the coverage statistic cannot
    # drift apart.
    max_len = 128

    # Length = number of whitespace-separated tokens in the processed text
    df['text_length'] = df['text_processed'].str.split().str.len()

    print("Text length statistics:")
    print(df['text_length'].describe())

    # Plot distribution
    fig, ax = plt.subplots(figsize=(10, 4))
    df['text_length'].hist(bins=50, ax=ax)
    ax.axvline(x=max_len, color='r', linestyle='--', label=f'Max length ({max_len})')
    ax.set_xlabel('Text Length (tokens)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Text Lengths')
    ax.legend()
    plt.show()

    # Percentage within max length
    pct_within = (df['text_length'] <= max_len).mean() * 100
    print(f"\n{pct_within:.1f}% of texts are within {max_len} tokens")

## 3. Prepare Train/Val/Test Splits

In [None]:
# Split data
# Define the split frames even when no data was loaded, so later cells
# guarded by `if train_df is not None` skip cleanly instead of raising
# NameError.
train_df = val_df = test_df = None

if df is not None:
    # Use original splits if available, otherwise create new splits
    if 'split' in df.columns and df['split'].nunique() > 1:
        train_df = df[df['split'] == 'train'].reset_index(drop=True)
        val_df = df[df['split'] == 'dev'].reset_index(drop=True)
        test_df = df[df['split'] == 'test'].reset_index(drop=True)
        print("Using original splits")
    else:
        # Create new splits: 80% train, 10% val, 10% test
        train_df, temp_df = train_test_split(
            df, test_size=0.2, random_state=SEED,
            stratify=df['toxic_offensive']  # Stratify by main label
        )
        # Stratify the val/test split as well, so both held-out sets keep
        # the same class balance as the training set.
        val_df, test_df = train_test_split(
            temp_df, test_size=0.5, random_state=SEED,
            stratify=temp_df['toxic_offensive']
        )
        train_df = train_df.reset_index(drop=True)
        val_df = val_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)
        print("Created new splits")

    print(f"\nTrain: {len(train_df)} samples")
    print(f"Val:   {len(val_df)} samples")
    print(f"Test:  {len(test_df)} samples")

    # Show label distribution in each split
    print("\nLabel distribution:")
    for name, split_df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        toxic_pct = split_df['toxic_offensive'].mean() * 100
        hate_pct = split_df['hate_speech'].mean() * 100
        print(f"  {name}: toxic_offensive={toxic_pct:.1f}%, hate_speech={hate_pct:.1f}%")

## 4. Build Vocabulary and Create Datasets

In [None]:
# Vocabulary and batching hyper-parameters
MAX_VOCAB_SIZE = 30000
MIN_FREQ = 2
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 32

for setting_name, setting_value in (
    ("Max vocabulary size", MAX_VOCAB_SIZE),
    ("Min token frequency", MIN_FREQ),
    ("Max sequence length", MAX_SEQ_LENGTH),
    ("Batch size", BATCH_SIZE),
):
    print(f"{setting_name}: {setting_value}")

In [None]:
# Build vocabulary from training data only
# `vocab` stays None when there is no training data, so the later
# `if vocab is not None` guard works instead of raising NameError.
vocab = None

if train_df is not None:
    print("Building vocabulary...")
    vocab = Vocabulary(max_size=MAX_VOCAB_SIZE, min_freq=MIN_FREQ)
    vocab.build(train_df['text_processed'].tolist())

    print(f"Vocabulary size: {len(vocab)}")

    # Save vocabulary
    vocab_path = MODELS_PATH / 'vocab.json'
    vocab.save(vocab_path)
    print(f"Vocabulary saved to: {vocab_path}")

In [None]:
# Label columns, in the order the model's outputs are interpreted
LABEL_COLUMNS = ['toxic_offensive', 'hate_speech']


def _make_dataset(frame):
    """Wrap one split's processed texts and label columns in a CommentDataset."""
    return CommentDataset(
        texts=frame['text_processed'].tolist(),
        labels=frame[LABEL_COLUMNS].values.tolist(),
        vocab=vocab,
        max_length=MAX_SEQ_LENGTH,
    )


if train_df is not None:
    train_dataset = _make_dataset(train_df)
    val_dataset = _make_dataset(val_df)
    test_dataset = _make_dataset(test_df)

    for split_name, dataset in (
        ("Train", train_dataset),
        ("Val", val_dataset),
        ("Test", test_dataset),
    ):
        print(f"{split_name} dataset: {len(dataset)} samples")

In [None]:
# Create dataloaders
if train_df is not None:
    # Settings shared by all three loaders.
    loader_kwargs = dict(
        batch_size=BATCH_SIZE,
        collate_fn=collate_fn,
        num_workers=0 if IN_COLAB else 2,
        # Pinned host memory only helps host-to-GPU copies; requesting it on
        # a CPU-only machine just produces a warning.
        pin_memory=torch.cuda.is_available(),
    )

    # Only the training loader is shuffled; evaluation order stays fixed.
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")

## 5. Initialize Model

In [None]:
# Fetch the hyper-parameter preset for the chosen model size
model_config = get_model_config('base')  # 'small', 'base', or 'large'

print("Model configuration:")
for setting in model_config:
    print(f"  {setting}: {model_config[setting]}")

In [None]:
# Instantiate the classifier and move it to the selected device
if vocab is not None:
    model = BiLSTMClassifier(
        vocab_size=len(vocab),
        num_labels=len(LABEL_COLUMNS),
        padding_idx=vocab.pad_idx,
        **model_config
    ).to(device)

    # Architecture overview
    print(model)

    # Parameter statistics; the size estimate assumes 4 bytes per parameter
    param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
    total_params = sum(count for count, _ in param_info)
    trainable_params = sum(count for count, trainable in param_info if trainable)
    print(f"\nTotal parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Estimated size: {total_params * 4 / 1024**2:.1f} MB")

In [None]:
# Loss function and optimizer
# Using Focal Loss to handle class imbalance
criterion = MultiLabelFocalLoss(alpha=0.25, gamma=2.0)

# Optimizer with weight decay
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.01
)

# Learning rate scheduler
# The `verbose` argument is intentionally not passed: it is deprecated in
# recent PyTorch releases (the current rate can be read from
# optimizer.param_groups[0]['lr'] instead).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',  # Maximize F1 score
    factor=0.5,
    patience=2
)

print("Criterion: MultiLabelFocalLoss")
print("Optimizer: AdamW")
print("Scheduler: ReduceLROnPlateau")

## 6. Training

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """
    Train for one epoch.

    Args:
        model: Called as `model(input_ids, attention_mask)`; must return a
            mapping with 'logits' and 'probabilities'.
        dataloader: Yields batches with 'input_ids', 'attention_mask' and
            'labels' tensors.
        criterion: Loss called as `criterion(logits, labels)`.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple `(avg_loss, f1_micro, f1_macro)`, where avg_loss is the mean
        of the per-batch losses and the F1 scores use a 0.5 threshold.
    """
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    pbar = tqdm(dataloader, desc='Training')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs['logits'], labels)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        # Threshold at 0.5 and store as ints, the same representation
        # evaluate() uses for its predictions.
        preds = (outputs['probabilities'] > 0.5).cpu().numpy().astype(int)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        pbar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(dataloader)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate metrics
    # zero_division=0 scores a label with no positive predictions/targets as
    # 0 without emitting an UndefinedMetricWarning on every call.
    f1_micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return avg_loss, f1_micro, f1_macro


def evaluate(model, dataloader, criterion, device):
    """
    Evaluate the model.

    Args:
        model: Called as `model(input_ids, attention_mask)`; must return a
            mapping with 'logits' and 'probabilities'.
        dataloader: Yields batches with 'input_ids', 'attention_mask' and
            'labels' tensors.
        criterion: Loss called as `criterion(logits, labels)`.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple `(avg_loss, f1_micro, f1_macro, all_preds, all_probs,
        all_labels)`. avg_loss is the mean of the per-batch losses; the
        three arrays hold one row per sample, with predictions obtained by
        thresholding the probabilities at 0.5.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_probs = []
    all_labels = []

    # No gradients are needed while scoring
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs['logits'], labels)

            total_loss += loss.item()

            probs = outputs['probabilities'].cpu().numpy()
            preds = (probs > 0.5).astype(int)

            all_probs.extend(probs)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    all_preds = np.array(all_preds)
    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    # Calculate metrics
    # zero_division=0 scores a label with no positive predictions/targets as
    # 0 without emitting an UndefinedMetricWarning on every call.
    f1_micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return avg_loss, f1_micro, f1_macro, all_preds, all_probs, all_labels

In [None]:
# Training loop
NUM_EPOCHS = 10
PATIENCE = 3  # Early stopping patience

# Start below any achievable F1 so the first epoch always writes a
# checkpoint. Starting at 0 with a strict ">" would never save when F1 stays
# at 0, leaving a stale bilstm_best.pt from an earlier run to be loaded.
best_val_f1 = float('-inf')
patience_counter = 0
history = {
    'train_loss': [], 'train_f1_micro': [], 'train_f1_macro': [],
    'val_loss': [], 'val_f1_micro': [], 'val_f1_macro': []
}

print(f"Starting training for {NUM_EPOCHS} epochs...")
print(f"Early stopping patience: {PATIENCE}")
print("=" * 60)

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 40)

    # Train
    train_loss, train_f1_micro, train_f1_macro = train_epoch(
        model, train_loader, criterion, optimizer, device
    )

    # Validate
    val_loss, val_f1_micro, val_f1_macro, _, _, _ = evaluate(
        model, val_loader, criterion, device
    )

    # Update scheduler (driven by validation F1-macro) and report any
    # learning-rate change it makes.
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_f1_macro)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Learning rate changed: {lr_before:.2e} -> {lr_after:.2e}")

    # Log metrics
    history['train_loss'].append(train_loss)
    history['train_f1_micro'].append(train_f1_micro)
    history['train_f1_macro'].append(train_f1_macro)
    history['val_loss'].append(val_loss)
    history['val_f1_micro'].append(val_f1_micro)
    history['val_f1_macro'].append(val_f1_macro)

    print(f"Train - Loss: {train_loss:.4f}, F1-micro: {train_f1_micro:.4f}, F1-macro: {train_f1_macro:.4f}")
    print(f"Val   - Loss: {val_loss:.4f}, F1-micro: {val_f1_micro:.4f}, F1-macro: {val_f1_macro:.4f}")

    # Save best model
    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        patience_counter = 0

        # Save model
        model_path = MODELS_PATH / 'bilstm_best.pt'
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Store a plain Python float: a NumPy scalar in the checkpoint
            # can be rejected by torch.load() in weights-only mode.
            'val_f1_macro': float(val_f1_macro),
            'model_config': model_config,
            'vocab_size': len(vocab),
        }, model_path)
        print(f"✓ Saved best model (F1-macro: {val_f1_macro:.4f})")
    else:
        patience_counter += 1
        print(f"No improvement. Patience: {patience_counter}/{PATIENCE}")

    # Early stopping
    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping at epoch {epoch + 1}")
        break

print("\n" + "=" * 60)
print(f"Training completed! Best Val F1-macro: {best_val_f1:.4f}")

In [None]:
# Plot the per-epoch curves recorded during training
if history['train_loss']:
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))

    # (train key, train legend, val key, val legend, y-axis label, title)
    panels = [
        ('train_loss', 'Train', 'val_loss', 'Val',
         'Loss', 'Training and Validation Loss'),
        ('train_f1_macro', 'Train F1-macro', 'val_f1_macro', 'Val F1-macro',
         'F1 Score', 'Training and Validation F1-macro'),
    ]

    for ax, (train_key, train_label, val_key, val_label, y_label, title) in zip(axes, panels):
        ax.plot(history[train_key], label=train_label)
        ax.plot(history[val_key], label=val_label)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

## 7. Evaluate on Test Set

In [None]:
# Restore the weights of the best checkpoint, if one was written
model_path = MODELS_PATH / 'bilstm_best.pt'
if model_path.exists():
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # Epochs are stored zero-based; report them one-based
    best_epoch = checkpoint['epoch'] + 1
    print(f"Loaded best model from epoch {best_epoch}")
    print(f"Val F1-macro: {checkpoint['val_f1_macro']:.4f}")

In [None]:
# Score the model on the held-out test split
test_results = evaluate(model, test_loader, criterion, device)
(test_loss, test_f1_micro, test_f1_macro,
 test_preds, test_probs, test_labels) = test_results

banner = "=" * 60
print("\n" + banner)
print("TEST SET RESULTS")
print(banner)
print(f"Loss: {test_loss:.4f}")
print(f"F1-micro: {test_f1_micro:.4f}")
print(f"F1-macro: {test_f1_macro:.4f}")

In [None]:
# Detailed classification report for each label
print("\nDetailed Classification Report:")
print("=" * 60)

for i, label_name in enumerate(LABEL_COLUMNS):
    print(f"\n{label_name.upper()}:")
    # Column i of the label/prediction arrays belongs to LABEL_COLUMNS[i].
    # Passing labels=[0, 1] keeps the report at two rows even when only one
    # class occurs in the test data. Without it, the two target_names would
    # not match the number of classes found and the call would fail.
    print(classification_report(
        test_labels[:, i],
        test_preds[:, i],
        labels=[0, 1],
        target_names=['Negative', 'Positive'],
        zero_division=0
    ))

In [None]:
# Confusion matrices
# One panel per label. squeeze=False keeps `axes` two-dimensional so the
# loop also works when LABEL_COLUMNS holds a single label.
num_panels = len(LABEL_COLUMNS)
fig, axes = plt.subplots(1, num_panels, figsize=(6 * num_panels, 4), squeeze=False)

for i, (label_name, ax) in enumerate(zip(LABEL_COLUMNS, axes[0])):
    # labels=[0, 1] forces a 2x2 matrix even when a class is absent from the
    # test data, so the fixed tick labels below always match.
    cm = confusion_matrix(test_labels[:, i], test_preds[:, i], labels=[0, 1])
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues', ax=ax,
        xticklabels=['Predicted 0', 'Predicted 1'],
        yticklabels=['Actual 0', 'Actual 1']
    )
    ax.set_title(f'Confusion Matrix: {label_name}')

plt.tight_layout()
plt.show()

## 8. Save Model for Deployment

In [None]:
# Write everything the serving side needs into one folder
deployment_path = MODELS_PATH / 'deployment'
deployment_path.mkdir(parents=True, exist_ok=True)

# 1) Model weights (state dict only)
torch.save(model.state_dict(), deployment_path / 'model_weights.pt')

# 2) Model configuration; entries from model_config are merged in last, so
#    they take precedence over a same-named key set here
config = dict(
    model_type='BiLSTMClassifier',
    vocab_size=len(vocab),
    num_labels=len(LABEL_COLUMNS),
    label_names=LABEL_COLUMNS,
    max_seq_length=MAX_SEQ_LENGTH,
    padding_idx=vocab.pad_idx,
)
config.update(model_config)
(deployment_path / 'config.json').write_text(json.dumps(config, indent=2))

# 3) Vocabulary
vocab.save(deployment_path / 'vocab.json')

print("Saved deployment package:")
for artifact in deployment_path.iterdir():
    size_kb = artifact.stat().st_size / 1024  # KB
    print(f"  - {artifact.name}: {size_kb:.1f} KB")

In [None]:
# Test inference
print("Testing inference...")

test_comments = [
    "Sản phẩm này tốt lắm, mình rất thích!",
    "Đm thằng này ngu vl",
    "Mấy thằng người Bắc toàn lừa đảo",
    "Cảm ơn bạn đã chia sẻ thông tin hữu ích",
]

model.eval()
for comment in test_comments:
    # Preprocess
    processed = preprocessor.preprocess(comment)

    # Tokenize, truncating to the maximum sequence length
    tokens = vocab.encode(processed)[:MAX_SEQ_LENGTH]

    # Pad to MAX_SEQ_LENGTH; the mask is 1 for real tokens and 0 for padding
    attention_mask = [1] * len(tokens)
    padding = MAX_SEQ_LENGTH - len(tokens)
    tokens = tokens + [vocab.pad_idx] * padding
    attention_mask = attention_mask + [0] * padding

    # Predict (batch of one)
    input_ids = torch.tensor([tokens], dtype=torch.long).to(device)
    mask = torch.tensor([attention_mask], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(input_ids, mask)
        probs = outputs['probabilities'][0].cpu().numpy()

    print(f"\nComment: {comment}")
    # Pair each probability with its label name from LABEL_COLUMNS rather
    # than hard-coding names and indices, so the printout cannot drift from
    # the label order used in training.
    for label_name, prob in zip(LABEL_COLUMNS, probs):
        print(f"  {label_name}: {prob:.3f}")

## Done!

The trained model is saved in `data/models/deployment/`. Files:
- `model_weights.pt` - Model weights
- `config.json` - Model configuration
- `vocab.json` - Vocabulary

Next steps:
1. Download these files if running on Colab
2. Place them in your local `data/models/deployment/` folder
3. Run the FastAPI service