In [19]:
# Detect whether the notebook runs on Google Colab and, if so, mount Google Drive.
# Only the imports are guarded: a failure while mounting the drive must surface
# as an error instead of being mistaken for "not running on Colab".
try:
    import google.colab  # noqa: F401  (presence check)
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

In [20]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Paths
# DIR_PATH is resolved before branching so that it also exists on Colab;
# it is printed below regardless of the environment.
DIR_PATH = os.getcwd()
if IN_COLAB:
    # Project folder on the mounted Google Drive.
    PROJECT_ROOT = "/content/drive/MyDrive/projectStuff"
else:
    # Local run: the project root is the parent of the current working directory.
    PROJECT_ROOT = os.path.dirname(DIR_PATH)

print(f"Current directory: {DIR_PATH}")
print(f"Project root: {PROJECT_ROOT}")

# Input data, pretrained checkpoint name, download cache and output location.
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "processed", "reviews_clean.csv")
MODEL_NAME = "vinai/phobert-base"
# On Colab the cache lives under /content (outside the Drive mount point).
CACHE_DIR = "/content/cache" if IN_COLAB else os.path.join(PROJECT_ROOT, "cache")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "models", "sentiment")

Current directory: D:\Project\MajorProject\notebooks
Project root: D:\Project\MajorProject


In [12]:
# Read the cleaned reviews CSV, then show its dimensions and first rows.
df = pd.read_csv(DATA_PATH)

print(f"Shape: {df.shape}")
print(df.head())

# Label vocabulary: string label -> integer id, and the inverse lookup.
label2id = {"NEG": 0, "NEU": 1, "POS": 2}
id2label = {v: k for k, v in label2id.items()}

# Map string labels -> integers.
# Series.map yields NaN for every value that is not a key of label2id (missing
# or misspelled labels, or labels that were already converted). Fail fast here
# rather than letting NaN/float labels reach the split and the model.
mapped_labels = df["label"].map(label2id)
unmapped = df.loc[mapped_labels.isna(), "label"]
if not unmapped.empty:
    raise ValueError(
        "Unexpected values in 'label' column (expected one of "
        f"{sorted(label2id)}): {sorted(unmapped.astype(str).unique())}"
    )
df["label"] = mapped_labels.astype("int64")

print(df["label"].value_counts())

# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Wrap each part in a Dataset, with the row index reset first.
train_ds, val_ds = (
    Dataset.from_pandas(part.reset_index(drop=True))
    for part in (train_df, val_df)
)

Shape: (31460, 4)
                      comment label  rate Unnamed: 3
0              Áo bao đẹp ạ!!   POS     5        NaN
1                 Tuyệt vời !   POS     5        NaN
2  2day ao khong giong trong.   NEG     1        NaN
3     Mùi thơm,bôi da mềm da.   POS     5        NaN
4           Vải đẹp, dày dặn.   POS     5        NaN
label
2    20093
0     6669
1     4698
Name: count, dtype: int64


In [17]:
# Print the total row count and the per-class counts with their share of the data.
total_samples = len(df)
print("Dataset Info:")
print(f"Total samples: {total_samples}")
print("Label distribution:")
label_counts = df["label"].value_counts().sort_index()
for label_id, n_rows in label_counts.items():
    share = n_rows / total_samples * 100
    print(f"  {id2label[label_id]}: {n_rows} ({share:.1f}%)")

# Show the first three comments (cut to 100 characters) with their label name.
print("\nSample comments:")
for row_pos in range(3):
    row = df.iloc[row_pos]
    text = row["comment"]
    print(f"  [{id2label[row['label']]}]: {text[:100]}...")

Dataset Info:
Total samples: 31460
Label distribution:
  NEG: 6669 (21.2%)
  NEU: 4698 (14.9%)
  POS: 20093 (63.9%)

Sample comments:
  [POS]: Áo bao đẹp ạ!!...
  [POS]: Tuyệt vời !...
  [NEG]: 2day ao khong giong trong....


In [14]:
# Load the tokenizer for MODEL_NAME (slow implementation, use_fast=False).
# Downloads are cached in CACHE_DIR, the cache location defined in the config cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR, use_fast=False)

def tokenize_fn(batch):
    """Tokenize the ``comment`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"comment"`` key holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    comments = batch["comment"]
    return tokenizer(comments, **encode_options)

print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

# Tokenize both splits batch-wise.
train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds = val_ds.map(tokenize_fn, batched=True)

# Set format for torch: expose only these columns, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for split_ds in (train_ds, val_ds):
    split_ds.set_format(type="torch", columns=torch_columns)

for split_name, split_ds in (("Training", train_ds), ("Validation", val_ds)):
    print(f"{split_name} samples: {len(split_ds)}")

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer loaded. Vocab size: 64000


Map:   0%|          | 0/25168 [00:00<?, ? examples/s]

Map:   0%|          | 0/6292 [00:00<?, ? examples/s]

Training samples: 25168
Validation samples: 6292


In [15]:
# Add device setup: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Take the class count from the label mapping rather than from the data, so the
# classifier head always agrees with id2label/label2id passed below.
num_labels = len(label2id)
print(f"Number of labels: {num_labels}")

# Load the pretrained checkpoint with a sequence-classification head.
# Downloads are cached in CACHE_DIR, the cache location defined in the config cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

model.to(device)

Using device: cuda
Number of labels: 3


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the argmax of ``logits`` along the last axis.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``,
        each rounded to 4 decimal places.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {"accuracy": round(accuracy_score(labels, predicted), 4)}

    # The remaining scores all use support-weighted averaging across classes.
    weighted_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = round(scorer(labels, predicted, average="weighted"), 4)

    return metrics


In [None]:
# Mixed precision: bf16 is only valid on GPUs that support it, so merely having
# CUDA is not enough. Fall back to fp16 on other CUDA devices and to full
# precision on CPU.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
use_fp16 = use_cuda and not use_bf16

# Evaluate and checkpoint once per epoch, keep at most two checkpoints, and
# reload the checkpoint with the best F1 when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=4e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    # Anchored at the project root so the location does not depend on the
    # current working directory.
    logging_dir=os.path.join(PROJECT_ROOT, "logs"),
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none"
)

# Assemble the Trainer: model and hyper-parameters first, then the tokenizer and
# metric callback, then the tokenized train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

In [None]:
# Run fine-tuning with the Trainer built above. As the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5966,0.550766,0.7791,0.744,0.7442,0.7791
2,0.5507,0.5503,0.7846,0.7526,0.7624,0.7846
3,0.4703,0.538621,0.7864,0.7701,0.7626,0.7864


TrainOutput(global_step=1182, training_loss=0.5499175010397148, metrics={'train_runtime': 2306.8807, 'train_samples_per_second': 32.73, 'train_steps_per_second': 0.512, 'total_flos': 4966528873033728.0, 'train_loss': 0.5499175010397148, 'epoch': 3.0})

In [None]:
# Write the trained model and the tokenizer files to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
# As the last expression of the cell, save_pretrained's return value (the saved
# tokenizer file paths) is displayed as the cell output.
tokenizer.save_pretrained(OUTPUT_DIR)

('/content/drive/MyDrive/projectStuff/models/sentiment/tokenizer_config.json',
 '/content/drive/MyDrive/projectStuff/models/sentiment/special_tokens_map.json',
 '/content/drive/MyDrive/projectStuff/models/sentiment/vocab.txt',
 '/content/drive/MyDrive/projectStuff/models/sentiment/bpe.codes',
 '/content/drive/MyDrive/projectStuff/models/sentiment/added_tokens.json')