In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib
import sys
import os

# Th√™m th∆∞ m·ª•c cha c·ªßa notebooks v√†o sys.path
sys.path.append(os.path.abspath('..'))

from utils.embedding import get_phobert_embedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω
df = pd.read_csv('../data/processed/cleaned_reviews.csv')

# Load PhoBERT
phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
phobert.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [3]:
# T·∫°o embeddings
embeddings = np.array([get_phobert_embedding(text, phobert, tokenizer, device) for text in df['review_cleaned']])
np.save('../data/processed/embeddings.npy', embeddings)
np.save('../data/processed/labels.npy', df['label'].values)

In [4]:
# Chia train/test
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']
)

# Hu·∫•n luy·ªán c√°c model
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(kernel='rbf')
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results[name] = acc
    print(f"{name}: {100 * acc:.4f}%")

Logistic Regression: 93.9467%
Random Forest: 89.3462%
SVM: 92.2518%


In [5]:
# 03_training.ipynb

# Ch·ªçn m√¥ h√¨nh t·ªët nh·∫•t
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
joblib.dump(best_model, '../models/best_sentiment_model.pkl')
print(f"üèÜ Best model: {best_model_name}")


üèÜ Best model: Logistic Regression
