# Install and import

In [1]:
# Refresh the apt package index, then install a JDK (the VnCoreNLP download below is a .jar).
# stdout is discarded; apt warnings written to stderr still appear in the cell output.
!apt-get update > /dev/null
!apt-get install -y openjdk-11-jdk > /dev/null

W: https://packages.cloud.google.com/apt/dists/gcsfuse-focal/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W: https://packages.cloud.google.com/apt/dists/google-fast-socket/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.


In [2]:
!pip install -q py_vncorenlp

In [3]:
import numpy as np
import pandas as pd
import py_vncorenlp
import contextlib
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader

# Set up VnCoreNLP

In [4]:
!mkdir /kaggle/working/vncorenlp
py_vncorenlp.download_model(save_dir='./vncorenlp/')

--2024-10-31 03:39:43--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.2.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412703 (26M) [application/octet-stream]
Saving to: 'VnCoreNLP-1.2.jar'

     0K .......... .......... .......... .......... ..........  0% 4.01M 7s
    50K .......... .......... .......... .......... ..........  0% 19.9M 4s
   100K .......... .......... .......... .......... ..........  0% 10.6M 3s
   150K .......... .......... .......... .......... ..........  0% 43.4M 3s
   200K .......... .......... .......... .......... ..........  0% 52.0M 2s
   250K .......... .......... .......... .......... ..........  1% 55.7M 2s
   300K .......... .......... .......... .......... ..........  1% 13.7M 2s
   350K ..

In [5]:
# Start VnCoreNLP with only the word-segmentation ("wseg") annotator, loading
# the files from the directory populated by the download cell.
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/kaggle/working/vncorenlp')

2024-10-31 03:39:50 INFO  WordSegmenter:24 - Loading Word Segmentation model


# Preprocess data

In [6]:
# Load the training set. Column names are supplied explicitly via `names`,
# giving an integer `label` column and a free-text `comment` column.
df = pd.read_csv('/kaggle/input/vietnamese-text-classification-dataset/train.csv', names=['label', 'comment'])
df.head()

Unnamed: 0,label,comment
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây cáp nguồn không có adapter sao sử d...
2,0,Chất lượng quá kém Mới dùng được 2 ngày loa ba...
3,0,Usb tôi vừa mới nhận usb này Rất bực bội vì cá...
4,2,Tuyệt vời. Hàng FPT cửa hàng


In [7]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    3040 non-null   int64 
 1   comment  3040 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.6+ KB


In [8]:
# Count missing values per column.
df.isnull().sum()

label      0
comment    0
dtype: int64

In [9]:
# Same check through the `isna` spelling (pandas treats `isnull` and `isna` as aliases).
df.isna().sum()

label      0
comment    0
dtype: int64

In [10]:
# Keep the first occurrence of every distinct comment and renumber the rows from 0.
df = df.drop_duplicates(subset='comment').reset_index(drop=True)
df.shape

(2998, 2)

In [11]:
def wseg(text):
    """Word-segment ``text`` with the notebook-level ``rdrsegmenter``.

    The pieces returned by ``rdrsegmenter.word_segment`` are joined back into a
    single string separated by one space.
    """
    segments = rdrsegmenter.word_segment(text)
    return ' '.join(segments)

In [12]:
# Replace every comment with its word-segmented form.
# NOTE: this overwrites the column in place, so re-running the cell feeds
# already-segmented text through the segmenter again.
df['comment'] = df['comment'].apply(wseg)

In [13]:
# Inspect the frame after segmentation (multi-syllable words are now joined with "_").
df

Unnamed: 0,label,comment
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây_cáp nguồn không có adapter sao sử_d...
2,0,Chất_lượng quá kém Mới dùng được 2 ngày loa ba...
3,0,Usb tôi vừa_mới nhận usb này Rất bực_bội vì cá...
4,2,Tuyệt_vời . Hàng FPT cửa_hàng
...,...,...
2993,1,Nhanh hết pin Không biết phải lỗi hay không Mà...
2994,0,không có bộ thu đi kèm Vừa nhận hàng xong tức_...
2995,0,Cũng bình_thường Không mạnh như kỹ vọng Chắc t...
2996,2,tốt tốt đang tiền khi bỏ ra để mua giao hàng n...


In [14]:
from statistics import mode

# Whitespace-token count of every segmented comment; the most common and the
# largest counts are printed for reference.
length = [len(cmt.split()) for cmt in df['comment']]
print(mode(length))
print(max(length))

15
319


# Model

In [15]:
# Load the pretrained PhoBERT tokenizer and weights. num_labels=3 requests a
# 3-way sequence-classification head, which is newly initialised and must be
# fine-tuned (see the warning printed by this cell).
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base-v2", num_labels=3)

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def preprocess_data(texts, labels, tokenizer, max_length=128):
    """Tokenize ``texts`` and pack the encodings and ``labels`` into tensors.

    Args:
        texts: Batch of input strings passed straight to ``tokenizer``.
        labels: Label values, one per text, convertible by ``torch.tensor``.
        tokenizer: Callable invoked with ``truncation=True``, ``padding=True``
            and ``max_length``; its result must be indexable by
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of ``torch.Tensor``.
    """
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    input_ids, attention_mask = (
        torch.tensor(encoded[field]) for field in ('input_ids', 'attention_mask')
    )
    return input_ids, attention_mask, torch.tensor(labels)

In [17]:
def train(model, dataloader, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a scalar ``.loss`` tensor.
        dataloader: Sized iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    # Batches are moved to wherever the model's weights live, so the loop works
    # unchanged whether the model sits on the CPU or has been moved to a GPU.
    device = next(model.parameters()).device
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [18]:
def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one row of class scores per example.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.

    Returns:
        Tuple ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    # Inputs follow the model's device so evaluation also works on a GPU model.
    device = next(model.parameters()).device
    preds, true_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
            # The highest-scoring class per row is the prediction.
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
            true_labels.extend(labels.tolist())
    accuracy = accuracy_score(true_labels, preds)
    report = classification_report(true_labels, preds)
    return accuracy, report

In [19]:
# Hold out 10% of the comments for validation. The split is seeded so the
# reported metrics can be reproduced, and stratified so the three labels keep
# the same proportions in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(df['comment']),
    list(df['label']),
    test_size=0.1,
    random_state=42,
    stratify=list(df['label']),
)

In [20]:
# Hyperparameters of the fine-tuning run.
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 10

# Convert data into PyTorch Dataset
# NOTE: train_labels / val_labels are rebound here from Python lists to tensors.
train_inputs, train_masks, train_labels = preprocess_data(train_texts, train_labels, tokenizer)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

val_inputs, val_masks, val_labels = preprocess_data(val_texts, val_labels, tokenizer)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
# The evaluation metrics do not depend on batch order, so no shuffling is needed.
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(NUM_EPOCHS):
    avg_loss = train(model, train_loader, optimizer)
    print(f"Epoch {epoch+1} | Average Loss: {avg_loss}")

# Evaluation (once, after the final epoch)
eval_accuracy, eval_report = evaluate(model, val_loader)
print(f"Evaluation Accuracy: {eval_accuracy}")
print(eval_report)

100%|██████████| 169/169 [15:03<00:00,  5.35s/it]


Epoch 1 | Average Loss: 0.6430620402097702


100%|██████████| 169/169 [14:35<00:00,  5.18s/it]


Epoch 2 | Average Loss: 0.3652138863442212


100%|██████████| 169/169 [14:52<00:00,  5.28s/it]


Epoch 3 | Average Loss: 0.2621481552411466


100%|██████████| 169/169 [14:51<00:00,  5.28s/it]


Epoch 4 | Average Loss: 0.19272826826916292


100%|██████████| 169/169 [15:11<00:00,  5.39s/it]


Epoch 5 | Average Loss: 0.1279099588365068


100%|██████████| 169/169 [15:01<00:00,  5.33s/it]


Epoch 6 | Average Loss: 0.10284336677272997


100%|██████████| 169/169 [15:14<00:00,  5.41s/it]


Epoch 7 | Average Loss: 0.0663914659539976


100%|██████████| 169/169 [14:47<00:00,  5.25s/it]


Epoch 8 | Average Loss: 0.07445729993140468


100%|██████████| 169/169 [14:47<00:00,  5.25s/it]


Epoch 9 | Average Loss: 0.05535789325932515


100%|██████████| 169/169 [14:58<00:00,  5.32s/it]


Epoch 10 | Average Loss: 0.06087296278237988
Evaluation Accuracy: 0.8266666666666667
              precision    recall  f1-score   support

           0       0.89      0.83      0.86       102
           1       0.71      0.65      0.68        78
           2       0.84      0.93      0.89       120

    accuracy                           0.83       300
   macro avg       0.82      0.81      0.81       300
weighted avg       0.83      0.83      0.82       300



In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# After the model has been fine-tuned: write the model weights and the tokenizer
# files to the "phobert_sa" directory.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — confirm that is the intended location.
model.save_pretrained("phobert_sa")
tokenizer.save_pretrained("phobert_sa")

('phobert_sa/tokenizer_config.json',
 'phobert_sa/special_tokens_map.json',
 'phobert_sa/vocab.txt',
 'phobert_sa/bpe.codes',
 'phobert_sa/added_tokens.json')

In [22]:
# The model was fine-tuned on VnCoreNLP word-segmented comments, so the raw
# sentence goes through the same segmenter before it is tokenized.
text = wseg("Tôi rất hài lòng với dịch vụ")

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
# Keep the encoded batch on the same device as the model weights.
inputs = inputs.to(model.device)

model.eval()

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Index of the highest-scoring class for the single input sentence.
predicted_class = torch.argmax(logits, dim=1).item()

print(f"Predicted class: {predicted_class}")

Predicted class: 2
