In [43]:
import os
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from datasets import Dataset
from underthesea import word_tokenize
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import string
import pandas as pd


In [44]:
def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = get_device()
print(device)

cpu


In [45]:
import pandas as pd

file_path = 'data.csv'

df = pd.read_csv(file_path)
df = df[['content', 'label']]

labels_map = {
    "POS": 0,
    "NEU": 1,
    "NEG": 2
}

df['label'] = df['label'].map(labels_map)
df = df.dropna(subset=['content'])

In [46]:
def remove_emoji(text):
    for emoji in UNICODE_EMOJI.values():
        text = text.replace(emoji, "")
    for emoticon in EMOTICONS_EMO.values():
        text = text.replace(emoticon, "")
    return text

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', ' <num> ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r"[!@#$\[\]()']", "", text)

    with open('vietnamese-stopwords.txt', "r", encoding="utf-8") as f:
        stopwords = set(f.read().split("\n"))
    
    words = word_tokenize(text)
    text = " ".join(word for word in words if word not in stopwords)
    return text


In [47]:
df_test = df.sample(n=15000, random_state=42)
df_test

Unnamed: 0,content,label
2604,Vẫn tặng bạn 5* vì nhiệt tình,0
16959,Dù có 5k nhưg mất uy tín,1
19018,Hàng đẹp dã man chị trang ạ,0
20374,Và con nào ăn thì con đấy chết,2
11124,Chất lượng tốt có nhiều quà tặng kèm,0
...,...,...
9676,Có giảm nhưng uống vào người rất mệt và buồn nôn,1
622,Đóng gói sản phẩm rất đẹp và chắc chắn Shop ph...,0
6249,Đây chỉ là góp ý và không nỡ để shop bị rate t...,1
926,"Chất vải và kiểu áo đều ok, dễ mặc",0


In [48]:
df_test['content'] = df_test['content'].apply(remove_emoji)  
df_test['content'] = df_test['content'].apply(clean_text)  

In [49]:
train_df, val_df = train_test_split(df_test, test_size=0.2, random_state=42)

In [50]:
from datasets import Dataset
from transformers import DataCollatorWithPadding

checkpoint = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
    return tokenizer(examples["content"], truncation=True, padding=True, max_length=128)

In [51]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [52]:
# Loại bỏ các cột không cần thiết
train_dataset = train_dataset.remove_columns(["content", "__index_level_0__", "attention_mask"])
val_dataset = val_dataset.remove_columns(["content", "__index_level_0__", "attention_mask"])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=10, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=10, collate_fn=data_collator)

In [53]:
train_dataset

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 12000
})

In [54]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=10, collate_fn=data_collator)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=10, collate_fn=data_collator)

In [55]:
for batch in train_loader:
    print(batch.keys())
    break 

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [56]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, max_length):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, output_size)
        )
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Take the last output of the LSTM
        return out


In [57]:
# HYPER PARAMS

vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_size = 64
output_size = 3
max_length = 128
num_epochs = 10
device = device

print(f"Vocabulary size: {vocab_size}")
print(f"Embedding dimension: {embedding_dim}")
print(f"Hidden size: {hidden_size}")
print(f"Output size: {output_size}")
print(f"Max length: {max_length}")
print(f"Number of epochs: {num_epochs}")

Vocabulary size: 119547
Embedding dimension: 128
Hidden size: 64
Output size: 3
Max length: 128
Number of epochs: 10


In [60]:
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, best_model_path=None):
    best_val_accuracy = 0.0
    
    for epoch in range(num_epochs):
        model.train()
        running_train_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        
        # Training loop
        for batch in train_loader:
            inputs, labels = batch['input_ids'], batch['labels']
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
        
        train_loss = running_train_loss / len(train_loader)
        train_accuracy = correct_predictions / total_predictions
        val_loss, val_accuracy, val_f1 = evaluate(model, val_loader, criterion, device)  # Now unpack 3 values
        
        # Save the best model
        if val_accuracy > best_val_accuracy and best_model_path:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), best_model_path)
        
        print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} Train Accuracy: {train_accuracy:.4f} "
              f"Val Loss: {val_loss:.4f} Val Accuracy: {val_accuracy:.4f} Val F1: {val_f1:.4f}")
    
    print("Training complete")
from sklearn.metrics import f1_score

def evaluate(model, loader, criterion, device):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    running_val_loss = 0.0
    all_labels = []
    all_preds = []
    
    with torch.no_grad():
        for batch in loader:
            inputs, labels = batch['input_ids'], batch['labels']
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())
    
    val_loss = running_val_loss / len(loader)
    val_accuracy = correct_predictions / total_predictions
    f1 = f1_score(all_labels, all_preds, average='weighted')  # Calculate F1 score
    
    return val_loss, val_accuracy, f1  # Return 3 values now


In [61]:
model = LSTMModel(vocab_size, embedding_dim, hidden_size, output_size, max_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=20)

Epoch [1/20] Train Loss: 0.9109 Train Accuracy: 0.6410 Val Loss: 0.8998 Val Accuracy: 0.6367 Val F1: 0.4953
Epoch [2/20] Train Loss: 0.9047 Train Accuracy: 0.6418 Val Loss: 0.8989 Val Accuracy: 0.6367 Val F1: 0.4953
Epoch [3/20] Train Loss: 0.9041 Train Accuracy: 0.6419 Val Loss: 0.9010 Val Accuracy: 0.6367 Val F1: 0.4953
Epoch [4/20] Train Loss: 0.8498 Train Accuracy: 0.6509 Val Loss: 0.6958 Val Accuracy: 0.7333 Val F1: 0.6804
Epoch [5/20] Train Loss: 0.6532 Train Accuracy: 0.7404 Val Loss: 0.6345 Val Accuracy: 0.7647 Val F1: 0.7110
Epoch [6/20] Train Loss: 0.5880 Train Accuracy: 0.7719 Val Loss: 0.6211 Val Accuracy: 0.7703 Val F1: 0.7162
Epoch [7/20] Train Loss: 0.5449 Train Accuracy: 0.7900 Val Loss: 0.6291 Val Accuracy: 0.7687 Val F1: 0.7260
Epoch [8/20] Train Loss: 0.5045 Train Accuracy: 0.8048 Val Loss: 0.6426 Val Accuracy: 0.7630 Val F1: 0.7366
Epoch [9/20] Train Loss: 0.4676 Train Accuracy: 0.8196 Val Loss: 0.6737 Val Accuracy: 0.7610 Val F1: 0.7380
Epoch [10/20] Train Loss: 0.

In [66]:
# Hàm lưu mô hình
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Mô hình đã được lưu tại {path}")

# Hàm tải mô hình
def load_model(model, path, device):
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    print(f"Mô hình đã được nạp từ {path}")

# Hàm tiền xử lý và mã hóa câu đầu vào
def preprocess_input(sentence, tokenizer, max_length=128):
    sentence = remove_emoji(sentence)  # Loại bỏ emoji
    sentence = clean_text(sentence)   # Làm sạch văn bản
    tokens = tokenizer(sentence, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
    return tokens['input_ids']

# Hàm kiểm tra mô hình với một câu đầu vào
def test_model(model, tokenizer, sentence, device):
    model.eval()
    input_ids = preprocess_input(sentence, tokenizer).to(device)
    
    with torch.no_grad():
        outputs = model(input_ids)
        _, predicted_label = torch.max(outputs, 1)
    
    label_map = {0: "Tích cực (POS)", 1: "Trung tính (NEU)", 2: "Tiêu cực (NEG)"}
    return label_map[predicted_label.item()]

# Cập nhật hàm train để lưu mô hình tốt nhất và cuối cùng
best_model_path = "./model/best_lstm_model.pth"

train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10,  best_model_path=best_model_path)

# Nạp lại mô hình tốt nhất


Epoch [1/10] Train Loss: 0.2226 Train Accuracy: 0.9316 Val Loss: 0.9977 Val Accuracy: 0.7493 Val F1: 0.7437
Epoch [2/10] Train Loss: 0.2105 Train Accuracy: 0.9393 Val Loss: 1.0501 Val Accuracy: 0.7417 Val F1: 0.7377
Epoch [3/10] Train Loss: 0.2092 Train Accuracy: 0.9373 Val Loss: 1.0703 Val Accuracy: 0.7340 Val F1: 0.7277
Epoch [4/10] Train Loss: 0.2036 Train Accuracy: 0.9380 Val Loss: 1.0961 Val Accuracy: 0.7497 Val F1: 0.7400
Epoch [5/10] Train Loss: 0.1925 Train Accuracy: 0.9415 Val Loss: 1.0931 Val Accuracy: 0.7470 Val F1: 0.7411
Epoch [6/10] Train Loss: 0.1924 Train Accuracy: 0.9399 Val Loss: 1.0990 Val Accuracy: 0.7477 Val F1: 0.7404
Epoch [7/10] Train Loss: 0.1835 Train Accuracy: 0.9451 Val Loss: 1.1352 Val Accuracy: 0.7193 Val F1: 0.7182
Epoch [8/10] Train Loss: 0.1891 Train Accuracy: 0.9413 Val Loss: 1.1440 Val Accuracy: 0.7420 Val F1: 0.7358
Epoch [9/10] Train Loss: 0.1713 Train Accuracy: 0.9488 Val Loss: 1.1390 Val Accuracy: 0.7463 Val F1: 0.7401
Epoch [10/10] Train Loss: 0.

In [74]:
load_model(model, best_model_path, device)

# Ví dụ kiểm tra với các câu đầu vào
test_sentences = [
    '''Tối nay có bạn nào đi nhậu cho vui ko nào. Mình năm nhất nam nek, bn nào đi lên kèo nào '''
]

print("\nKết quả thử nghiệm:")
for sentence in test_sentences:
    prediction = test_model(model, tokenizer, sentence, device)
    print(f"Câu: \"{sentence}\" --> Dự đoán: {prediction}")


Mô hình đã được nạp từ ./model/best_lstm_model.pth

Kết quả thử nghiệm:
Câu: "Tối nay có bạn nào đi nhậu cho vui ko nào. Mình năm nhất nam nek, bn nào đi lên kèo nào " --> Dự đoán: Trung tính (NEU)


  model.load_state_dict(torch.load(path, map_location=device))
