<a href="https://colab.research.google.com/github/nguyenduongtri5703/sentiment_analysis/blob/main/Fine_Tuning_PHOBERT_From_HANDMADE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the dataset folder on Drive the working directory, so that relative
# paths used afterwards are resolved against it.
%cd '/content/gdrive/MyDrive/dataset'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/dataset


In [None]:
!pip install transformers datasets scikit-learn



In [None]:
import unicodedata

import pandas as pd

data = pd.read_csv('sentiment_data.csv')

# Map the Vietnamese sentiment labels to integer class ids
label_map = {'tiêu cực': 0, 'trung lập': 1, 'tích cực': 2}


def _normalize_label(value):
    """Return a canonical form of a raw label (NFC, trimmed, lower-cased).

    Non-string values (e.g. NaN for an empty cell) are returned unchanged so
    that they stay unmapped and are removed by the dropna() below.
    """
    if not isinstance(value, str):
        return value
    return unicodedata.normalize('NFC', value).strip().lower()


# Normalise both the lookup keys and the raw column, so that differences in
# Unicode composition, surrounding whitespace or letter case do not turn a
# valid label into NaN.
_label_lookup = {_normalize_label(name): class_id for name, class_id in label_map.items()}
data['label'] = data['sentiment'].map(_normalize_label).map(_label_lookup)

# Report rows that are about to be discarded instead of dropping them silently.
_unmapped_rows = int(data['label'].isna().sum())
if _unmapped_rows:
    print(f"Warning: dropping {_unmapped_rows} row(s) whose sentiment is missing or not in label_map")

# A single NaN makes the whole column float64; cast back to integers after
# dropping incomplete rows so the labels are integer class ids downstream.
data = (
    data[['comment', 'label']]
    .dropna()
    .astype({'label': 'int64'})
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation.
# `stratify` keeps the class proportions equal in both splits, and the fixed
# random_state makes the split repeatable across runs.
# Note: scikit-learn raises ValueError here if a class has fewer than 2 rows.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['comment'].tolist(),
    data['label'].tolist(),
    test_size=0.3,
    random_state=42,
    stratify=data['label'],
)

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Both splits are encoded with the same options: sequences longer than 128
# tokens are truncated and shorter ones are padded.
# NOTE(review): PhoBERT is documented as expecting word-segmented input; no
# segmentation step is visible in this notebook — confirm the CSV text is
# already segmented.
tokenize_options = {"truncation": True, "padding": True, "max_length": 128}

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [None]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-sample sequence of
            values; every sequence must have one entry per label.
        labels: Sequence of class ids, one per sample.

    Raises:
        ValueError: If any feature in ``encodings`` does not have exactly
            ``len(labels)`` entries.
    """

    def __init__(self, encodings, labels):
        # Fail early on misaligned inputs instead of silently ignoring the
        # surplus samples or hitting an IndexError in the middle of training.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"Feature '{key}' has {len(values)} entries but there are {len(labels)} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` as tensors, plus its label under "labels"."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an integer (long) tensor: a label stored as a float such as 2.0
        # would otherwise produce a float tensor rather than a class index.
        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset, val_dataset = (
    SentimentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [None]:
import os

# Turn off the Weights & Biases integration through its environment variable.
# The training output in this notebook reports this variable as deprecated in
# favour of the `report_to` setting.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Store the label names in the model config so that the saved model reports
# the sentiment names rather than generic LABEL_0/1/2 identifiers.
id2label = {class_id: name for name, class_id in label_map.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=dict(label_map),
)

output_path = "/content/gdrive/MyDrive/dataset/results"
log_path = "/content/gdrive/MyDrive/dataset/logs"


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Object supplied by the Trainer exposing ``predictions``
            (the model logits) and ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {
        "accuracy": accuracy_score(eval_pred.label_ids, predicted),
        "f1_macro": f1_score(eval_pred.label_ids, predicted, average="macro"),
    }


training_args = TrainingArguments(
    output_dir=output_path,
    eval_strategy="epoch",
    # Log once per epoch so the training loss appears next to every
    # validation result instead of "No log".
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    run_name="phobert_sentiment_run1",
    weight_decay=0.01,
    logging_dir=log_path,
    save_total_limit=1,
    # Supported way to switch off external experiment trackers; the
    # WANDB_DISABLED environment variable is deprecated.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this only the validation loss is reported.
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,0.005598
2,No log,0.002489
3,0.101000,0.00201


TrainOutput(global_step=525, training_loss=0.09631392853600639, metrics={'train_runtime': 68.6463, 'train_samples_per_second': 61.183, 'train_steps_per_second': 7.648, 'total_flos': 34533636076800.0, 'train_loss': 0.09631392853600639, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model and its tokenizer to the same Drive folder.
model_export_dir = "/content/gdrive/MyDrive/model_history"
model.save_pretrained(model_export_dir)
tokenizer.save_pretrained(model_export_dir)

('/content/gdrive/MyDrive/model_history/tokenizer_config.json',
 '/content/gdrive/MyDrive/model_history/special_tokens_map.json',
 '/content/gdrive/MyDrive/model_history/vocab.txt',
 '/content/gdrive/MyDrive/model_history/bpe.codes',
 '/content/gdrive/MyDrive/model_history/added_tokens.json')