In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np



In [11]:
# --- Experiment configuration ---
MODEL_NAME = "AITeamVN/Vietnamese_Embedding"  # Hugging Face checkpoint id loaded by the tokenizer/encoder
MAX_LEN = 512        # token length passed to the dataset for padding/truncation
BATCH_SIZE = 16      # tune to the amount of VRAM you have available
EPOCHS = 10
LEARNING_RATE = 1e-4
NUM_CLASSES = 8      # size of the classifier's output layer
SEED = 42

# Seed the random number generators up front so that weight initialisation,
# dropout and DataLoader shuffling are repeatable across a Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {DEVICE}")

Using device: cuda


In [12]:
# Read the pipe-delimited crawl dump, then trim surrounding whitespace from the
# category column so the values can be matched against the label list.
df = (
    pd.read_csv('data/data_news_crawl_30k.csv', sep="|", encoding="utf-8")
    .assign(category=lambda frame: frame['category'].str.strip())
)

In [13]:
# The fixed set of news categories the classifier is trained to predict.
target_labels = [
    'giao-duc', 'the-thao', 'kinh-doanh', 'thoi-su',
    'suc-khoe', 'giai-tri', 'the-gioi', 'doi-song',
]

# fit() returns the encoder itself, so construction and fitting are chained.
label_encoder = LabelEncoder().fit(target_labels)

# Integer class id for every row, derived from its category string.
df['label'] = label_encoder.transform(df['category'])

In [14]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [15]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news item per lookup.

    Each sample's text is the row's ``title`` and ``description`` joined by a
    single space; the target is the row's integer ``label``.

    Args:
        df: DataFrame with ``title``, ``description`` and ``label`` columns.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        # A missing title or description would turn the whole concatenation
        # into NaN (later stringified as the literal text "nan"), so blank out
        # missing fields before joining them.
        titles = df['title'].fillna('').astype(str)
        descriptions = df['description'].fillna('').astype(str)
        # strip() removes the dangling separator when one of the two is empty.
        self.texts = (titles + " " + descriptions).str.strip().tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [16]:
class NewsClassifier(nn.Module):
    """Frozen pretrained encoder followed by a small trainable MLP head.

    The encoder loaded from ``MODEL_NAME`` is used purely as a feature
    extractor: its parameters have ``requires_grad=False`` and it always runs
    under ``torch.no_grad()``. Only ``self.classifier`` is trained.

    Args:
        n_classes: Number of output logits produced by the head.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Pretrained encoder (the original note refers to it as BGE-M3).
        self.bert = AutoModel.from_pretrained(MODEL_NAME)

        # Freeze the encoder so the optimizer never updates it.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert.eval()

        # Take the embedding width from the checkpoint's config rather than
        # hard-coding it; 1024 remains the fallback if the config lacks it.
        self.bert_hidden_size = getattr(self.bert.config, "hidden_size", 1024)

        self.classifier = nn.Sequential(
            nn.Linear(self.bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, n_classes)
        )

    def train(self, mode=True):
        """Set the training mode of the head while keeping the encoder in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would also
        activate any dropout layers inside the frozen encoder and make the
        extracted features noisy during training. The encoder is therefore
        switched back to eval mode after the recursive call.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Matching attention mask forwarded to the encoder.

        Returns:
            Output of the MLP head applied to the encoder's hidden state at
            sequence position 0.
        """
        # No autograd graph is needed for the frozen encoder.
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Hidden state of the first token is used as the sequence embedding.
        cls_embedding = outputs.last_hidden_state[:, 0, :]

        # Feed the embedding through the fully connected head.
        output = self.classifier(cls_embedding)
        return output

In [21]:
from tqdm.auto import tqdm 

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples, epoch_index):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss.
        optimizer: Optimizer that updates the trainable parameters.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.
        epoch_index: Epoch number shown in the progress-bar description.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-dim double tensor and
        mean_loss is the average of the per-batch losses (NaN when the loader
        yields no batches).
    """
    model.train()
    losses = []
    # Start from a tensor rather than the int 0 so that `.double()` below also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    progress_bar = tqdm(data_loader, desc=f"Training Epoch {epoch_index}", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        # Predicted class = index of the largest logit.
        preds = outputs.argmax(dim=1)
        correct_predictions += (preds == targets).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        losses.append(batch_loss)
        progress_bar.set_postfix(loss=batch_loss)

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [22]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-dim double tensor and
        mean_loss is the average of the per-batch losses (NaN when the loader
        yields no batches).
    """
    model.eval()
    losses = []
    # Start from a tensor rather than the int 0 so that `.double()` below also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    progress_bar = tqdm(data_loader, desc="Validating", leave=False)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)

            # Predicted class = index of the largest logit.
            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == targets).sum()
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [23]:
# Tokenizer is loaded from the same checkpoint id as the encoder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap both splits; tokenization happens lazily inside the dataset.
train_dataset = NewsDataset(train_df, tokenizer, MAX_LEN)
val_dataset = NewsDataset(val_df, tokenizer, MAX_LEN)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Build the classifier and move it to the selected device in one step.
model = NewsClassifier(n_classes=NUM_CLASSES).to(DEVICE)

loss_fn = nn.CrossEntropyLoss()
# The optimizer is given the head's parameters only; the encoder is not passed to it.
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LEARNING_RATE)

In [24]:
print("Start Training...")

# File that receives the trained head's weights.
checkpoint_path = 'fcn_classifier_head.pth'

for epoch in range(EPOCHS):

    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, DEVICE, len(train_dataset), epoch + 1
    )

    val_acc, val_loss = eval_model(
        model, val_loader, loss_fn, DEVICE, len(val_dataset)
    )

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')
    print(f'Val loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}')
    print('-' * 20)

    # Checkpoint after every epoch so an interrupted run (e.g. a
    # KeyboardInterrupt) still leaves the weights of the last completed epoch
    # on disk instead of losing all progress.
    torch.save(model.classifier.state_dict(), checkpoint_path)

# Save the model (final weights; only the classifier head is stored).
torch.save(model.classifier.state_dict(), checkpoint_path)
print("Model saved!")

Start Training...


Training Epoch 1:   0%|          | 0/1583 [00:00<?, ?it/s]

KeyboardInterrupt: 