In [None]:
# Install the third-party packages used by this notebook (pip output suppressed).
!pip install --quiet datasets torch tokenizers scikit-learn pandas

In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np
import re
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os

In [None]:
# Upgrade datasets, huggingface_hub and fsspec to their latest releases.
# NOTE(review): `datasets` has already been imported by the time this cell runs,
# so the upgraded version presumably only takes effect after a runtime restart —
# consider moving this install ahead of the imports.
!pip install --quiet -U datasets huggingface_hub fsspec

In [None]:

# Load the mteb/amazon_polarity dataset (all available splits).
dataset = load_dataset("mteb/amazon_polarity")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.71k [00:00<?, ?B/s]

data/train-00000-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

data/train-00001-of-00004.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

data/train-00002-of-00004.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

data/train-00003-of-00004.parquet:   0%|          | 0.00/250M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [None]:
def standardize_labels(label):
    """Map a raw label onto the binary scheme used by this notebook.

    Args:
        label: Raw label value; anything ``int()`` can convert (int, numeric
            string, float, ...).

    Returns:
        ``1`` or ``0`` when ``int(label)`` equals that value, otherwise ``-1``.
        ``-1`` is also returned when the label cannot be converted to an
        integer at all (``None``, non-numeric text, NaN/inf), so callers can
        filter such rows out instead of crashing.
    """
    try:
        value = int(label)
    except (TypeError, ValueError, OverflowError):
        # Unparseable labels are treated like any other invalid label.
        return -1
    return value if value in (0, 1) else -1

In [None]:
def preprocess_text(text):
    """Normalise raw text before tokenisation.

    Lower-cases the input, deletes every character that is not ``a-z``,
    ``0-9`` or whitespace, and collapses each whitespace run into a single
    space with no leading or trailing space.
    """
    lowered = text.lower()
    kept = re.sub(r'[^a-z0-9\s]', '', lowered)
    # Splitting on whitespace and re-joining collapses runs and trims both ends.
    return ' '.join(kept.split())

In [None]:
# Normalise the raw dataset into a flat table
def preprocess_dataset(dataset):
    """Flatten every split of ``dataset`` into one cleaned DataFrame.

    Each item's ``'text'`` is converted with ``str()`` and passed through
    ``preprocess_text``; its ``'label'`` is passed through
    ``standardize_labels``. Items whose standardised label is ``-1`` are
    dropped.

    Returns:
        A DataFrame with a ``'text'`` column and a ``'label'`` column.
    """
    texts, labels = [], []
    for split_name in dataset.keys():
        for row in dataset[split_name]:
            raw_text, raw_label = row['text'], row['label']
            cleaned = preprocess_text(str(raw_text))
            target = standardize_labels(raw_label)
            if target == -1:
                # Label could not be mapped to 0/1; skip this row.
                continue
            texts.append(cleaned)
            labels.append(target)
    return pd.DataFrame({'text': texts, 'label': labels})

In [None]:
# Clean the dataset, draw a reproducible 40,000-row sample, then keep only
# the rows labelled 0 or 1 (negative / positive).
df = (
    preprocess_dataset(dataset)
    .sample(n=40000, random_state=42)
    .loc[lambda frame: frame['label'].isin([0, 1])]
)

In [None]:
# Split the data: 70% train, 10% validation, 20% test.
# The test set is held out first; the validation set is then carved out of the remainder.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.125, random_state=42)  # 0.125 * 0.8 = 0.1 of the full data

In [None]:
# Initialise a word-level tokenizer that splits with the Whitespace pre-tokenizer.
vocab_size = 5000  # vocabulary size requested from the trainer
sequence_length = 128  # every encoded sentence is padded/truncated to this length
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
# NOTE(review): pad_id=1 assumes "<pad>" ends up with id 1 in the trained
# vocabulary (it is listed second in the trainer's special_tokens) — confirm.
tokenizer.enable_padding(pad_id=1, pad_token="<pad>", length=sequence_length)
tokenizer.enable_truncation(max_length=sequence_length)

In [None]:
# Build the vocabulary from the training texts only (validation/test texts are not used).
trainer = WordLevelTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
tokenizer.train_from_iterator(train_df['text'].values, trainer=trainer)


In [None]:
# Vectorisation
def vectorize(sentence, tokenizer):
    """Encode ``sentence`` with ``tokenizer`` and return its ids as a tensor.

    Returns:
        A ``torch.long`` tensor built from the ``ids`` attribute of
        ``tokenizer.encode(sentence)``.
    """
    token_ids = tokenizer.encode(sentence).ids
    return torch.tensor(token_ids, dtype=torch.long)


In [None]:
# Encode each split into per-sentence id tensors plus one label tensor.
def _encode_split(frame):
    """Return (list of id tensors, long label tensor) for one split DataFrame."""
    encoded = [vectorize(sentence, tokenizer) for sentence in frame['text']]
    targets = torch.tensor(frame['label'].values, dtype=torch.long)
    return encoded, targets

train_corpus_ids, train_labels = _encode_split(train_df)
val_corpus_ids, val_labels = _encode_split(val_df)
test_corpus_ids, test_labels = _encode_split(test_df)

In [None]:
# Custom dataset wrapper
class SentimentDataset(torch.utils.data.Dataset):
    """Index-aligned pairing of encoded sentences with their labels."""

    def __init__(self, corpus_ids, labels):
        # Both containers are stored as given and are indexed in parallel.
        self.corpus_ids, self.labels = corpus_ids, labels

    def __len__(self):
        """Number of samples, taken from ``corpus_ids``."""
        return len(self.corpus_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with 'input_ids' and 'labels'."""
        sample = dict(input_ids=self.corpus_ids[idx], labels=self.labels[idx])
        return sample

In [None]:

# Wrap each split's encoded sentences and labels in a SentimentDataset.
train_dataset = SentimentDataset(train_corpus_ids, train_labels)
val_dataset = SentimentDataset(val_corpus_ids, val_labels)
test_dataset = SentimentDataset(test_corpus_ids, test_labels)


In [None]:
# Bidirectional-LSTM sentiment classifier
class SentimentClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> BatchNorm -> Dropout -> Linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, dropout=0.5):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)  # *2 because the LSTM is bidirectional
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            input_ids: Long tensor of token ids, shape (batch_size, seq_len).
        """
        x = self.embedding(input_ids)  # (batch_size, seq_len, embedding_dim)
        _, (h_n, _) = self.lstm(x)  # h_n: (2, batch_size, hidden_dim)
        # Use each direction's final hidden state. Slicing the output sequence
        # at t = -1 is only correct for the forward direction: at that position
        # the backward direction has processed a single token, so half of the
        # features would carry almost no information about the sentence.
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (batch_size, hidden_dim * 2)
        x = self.batch_norm(x)
        x = self.dropout(x)
        x = self.fc(x)
        return x


In [None]:
# Instantiate the model
embedding_dim = 256  # size of each token embedding
hidden_dim = 512  # LSTM hidden size per direction
num_classes = 2  # number of output logits (labels are 0 and 1)
model = SentimentClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)


In [None]:

# Loss function, optimiser and learning-rate scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.1 when the value passed to scheduler.step()
# (the validation loss in the training loop) stops decreasing for more than
# `patience` epochs.
# `verbose=True` is deliberately not passed: the argument is deprecated in
# PyTorch and no longer accepted by recent releases. To see the current
# learning rate, read optimizer.param_groups[0]['lr'] or scheduler.get_last_lr().
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)



In [None]:
# Early stopping
class EarlyStopping:
    """Track validation loss and flag when it stops improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``early_stop`` is set to True.
        delta: Minimum amount by which the loss must drop below the best
            loss seen so far to count as an improvement.

    After each improving call, ``best_model_state`` holds an independent CPU
    copy of the model's ``state_dict`` and ``best_loss`` the matching loss.
    """

    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0  # consecutive calls without improvement
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, val_loss, model):
        """Record ``val_loss``; snapshot ``model`` on improvement, else count up."""
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
            # clone() is required: Tensor.cpu() returns the very same tensor
            # when it already lives on the CPU, so without a copy the snapshot
            # would alias the live parameters and be overwritten by later
            # optimiser steps.
            self.best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model to that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Create the DataLoaders (batch size 32); only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)


In [None]:
# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, early_stopping, num_epochs=10):
    """Train ``model`` with per-epoch validation, LR scheduling and early stopping.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        train_loader: Iterable of batches (dicts with 'input_ids' and
            'labels' tensors) that also supports ``len()``.
        val_loader: Same batch format, used for the validation loss.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser updating ``model``'s parameters.
        scheduler: Object whose ``step(val_loss)`` is called once per epoch.
        early_stopping: Callable ``early_stopping(val_loss, model)`` exposing
            ``early_stop`` and ``best_model_state`` attributes.
        num_epochs: Maximum number of epochs to run.

    Returns:
        None. On return the model holds the weights stored in
        ``early_stopping.best_model_state`` (when one was recorded) and is in
        eval mode.
    """
    # Run batches on the device the model already lives on, so the function
    # does not depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(run_device)
            labels = batch['labels'].to(run_device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        avg_train_loss = total_train_loss / len(train_loader)

        # Evaluate on the validation set
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(run_device)
                labels = batch['labels'].to(run_device)
                outputs = model(input_ids)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)

        print(f'Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

        # Update the scheduler and the early-stopping tracker
        scheduler.step(avg_val_loss)
        early_stopping(avg_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping triggered!")
            break

    # Always finish on the best checkpoint. Restoring it only when early
    # stopping fires would leave the last-epoch weights in place whenever the
    # epoch budget runs out first, even if an earlier epoch validated better.
    best_state = getattr(early_stopping, "best_model_state", None)
    if best_state is not None:
        model.load_state_dict(best_state)

In [None]:
# Evaluation helper
def evaluate_model(model, test_loader):
    """Score ``model`` on ``test_loader`` and return summary metrics.

    Predictions are the index of the largest logit along dim 1. F1,
    precision and recall use ``average='weighted'``.

    Returns:
        A dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            # torch.max over dim=1 returns (values, indices); keep the indices.
            preds = torch.max(model(input_ids), dim=1)[1]
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    weighted = {'average': 'weighted'}
    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'f1': f1_score(true_labels, predictions, **weighted),
        'precision': precision_score(true_labels, predictions, **weighted),
        'recall': recall_score(true_labels, predictions, **weighted),
    }

In [None]:
# Create the early-stopping tracker: stop after 3 epochs without the
# validation loss improving by more than 0.001.
early_stopping = EarlyStopping(patience=3, delta=0.001)

In [None]:
# Train the model for at most 10 epochs.
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, early_stopping, num_epochs=10)

Epoch 1, Train Loss: 0.6802, Val Loss: 0.6631
Epoch 2, Train Loss: 0.4232, Val Loss: 0.4496
Epoch 3, Train Loss: 0.3421, Val Loss: 0.3685
Epoch 4, Train Loss: 0.2995, Val Loss: 0.4744
Epoch 5, Train Loss: 0.2705, Val Loss: 0.3978
Epoch 6, Train Loss: 0.2290, Val Loss: 0.3339
Epoch 7, Train Loss: 0.2216, Val Loss: 0.3505
Epoch 8, Train Loss: 0.1960, Val Loss: 0.3228
Epoch 9, Train Loss: 0.1670, Val Loss: 0.3533
Epoch 10, Train Loss: 0.1385, Val Loss: 0.3854


In [None]:

# Evaluate the model on the test set
eval_results = evaluate_model(model, test_loader)
print("Evaluation results:", eval_results)

Evaluation results: {'accuracy': 0.870625, 'f1': 0.8705309667087047, 'precision': 0.8718674413966125, 'recall': 0.870625}


In [None]:
# Save the model weights (state_dict) and the tokenizer into ./sentiment_model
os.makedirs("./sentiment_model", exist_ok=True)
torch.save(model.state_dict(), "./sentiment_model/model.pt")
tokenizer.save("./sentiment_model/tokenizer.json")


In [None]:
# Zip the model directory so it can be downloaded as a single file
!zip -r sentiment_model.zip ./sentiment_model


  adding: sentiment_model/ (stored 0%)
  adding: sentiment_model/model.pt (deflated 7%)
  adding: sentiment_model/tokenizer.json (deflated 70%)


In [None]:
# Offer the archive as a download via the Colab `files` helper.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab;
# this cell will fail in other Jupyter environments.
from google.colab import files
files.download('sentiment_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>