# Install and import

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/train.csv


In [2]:
!apt-get update > /dev/null
!apt-get install -y openjdk-11-jdk > /dev/null

W: https://packages.cloud.google.com/apt/dists/gcsfuse-focal/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W: https://packages.cloud.google.com/apt/dists/google-fast-socket/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.


In [3]:
!pip install -q py_vncorenlp

In [4]:
import numpy as np
import pandas as pd
import py_vncorenlp
import contextlib
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader

# Set up VNCoreNLP

In [5]:
!mkdir /kaggle/working/vncorenlp
py_vncorenlp.download_model(save_dir='./vncorenlp/')

mkdir: cannot create directory '/kaggle/working/vncorenlp': File exists
VnCoreNLP model folder ./vncorenlp already exists! Please load VnCoreNLP from this folder!


In [6]:
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/kaggle/working/vncorenlp')

2024-11-03 04:16:36 INFO  WordSegmenter:24 - Loading Word Segmentation model


# Preprocess data

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv('/kaggle/input/train.csv', names=['label', 'comment'])
df.head()

Unnamed: 0,label,comment
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây cáp nguồn không có adapter sao sử d...
2,0,Chất lượng quá kém Mới dùng được 2 ngày loa ba...
3,0,Usb tôi vừa mới nhận usb này Rất bực bội vì cá...
4,2,Tuyệt vời. Hàng FPT cửa hàng


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    3040 non-null   int64 
 1   comment  3040 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.6+ KB


In [10]:
df.isnull().sum()

label      0
comment    0
dtype: int64

In [11]:
df.isna().sum()

label      0
comment    0
dtype: int64

In [12]:
df = df.drop_duplicates('comment')
df = df.reset_index(drop=True)
df.shape

(2998, 2)

In [13]:
def wseg(text):
    return ' '.join(rdrsegmenter.word_segment(text))

In [14]:
df['comment'] = df['comment'].apply(wseg)

In [15]:
df

Unnamed: 0,label,comment
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây_cáp nguồn không có adapter sao sử_d...
2,0,Chất_lượng quá kém Mới dùng được 2 ngày loa ba...
3,0,Usb tôi vừa_mới nhận usb này Rất bực_bội vì cá...
4,2,Tuyệt_vời . Hàng FPT cửa_hàng
...,...,...
2993,1,Nhanh hết pin Không biết phải lỗi hay không Mà...
2994,0,không có bộ thu đi kèm Vừa nhận hàng xong tức_...
2995,0,Cũng bình_thường Không mạnh như kỹ vọng Chắc t...
2996,2,tốt tốt đang tiền khi bỏ ra để mua giao hàng n...


In [16]:
from statistics import mode
length = []
for cmt in df['comment']:
    length.append(len(cmt.split()))
print(mode(length))
print(max(length))

15
319


# Model

In [33]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

In [35]:
def preprocess_data(texts, labels, tokenizer, max_length=128):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    inputs = torch.tensor(encodings['input_ids'])
    attention_masks = torch.tensor(encodings['attention_mask'])
    labels = torch.tensor(labels)
    return inputs, attention_masks, labels

In [37]:
def train(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [38]:
def evaluate(model, dataloader):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, axis=1).tolist())
            true_labels.extend(labels.tolist())
    accuracy = accuracy_score(true_labels, preds)
    report = classification_report(true_labels, preds)
    return accuracy, report

In [21]:
train_texts = list(df['comment'])
train_labels = list(df['label'])
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1)

In [39]:
# Convert data into PyTorch Dataset
train_inputs, train_masks, train_labels = preprocess_data(train_texts, train_labels, tokenizer)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

val_inputs, val_masks, val_labels = preprocess_data(val_texts, val_labels, tokenizer)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_loader = DataLoader(val_data, batch_size=16, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
for epoch in range(20):  # Train for 20 epochs
    avg_loss = train(model, train_loader, optimizer)
    print(f"Epoch {epoch+1} | Average Loss: {avg_loss}")

# Evaluation
eval_accuracy, eval_report = evaluate(model, val_loader)
print(f"Evaluation Accuracy: {eval_accuracy}")
print(eval_report)

  labels = torch.tensor(labels)
100%|██████████| 169/169 [14:44<00:00,  5.24s/it]


Epoch 1 | Average Loss: 0.6327200009096303


100%|██████████| 169/169 [14:33<00:00,  5.17s/it]


Epoch 2 | Average Loss: 0.3481613379522894


100%|██████████| 169/169 [14:52<00:00,  5.28s/it]


Epoch 3 | Average Loss: 0.24750570150522086


100%|██████████| 169/169 [14:29<00:00,  5.15s/it]


Epoch 4 | Average Loss: 0.1682036109378881


100%|██████████| 169/169 [15:08<00:00,  5.38s/it]


Epoch 5 | Average Loss: 0.11600415728236796


100%|██████████| 169/169 [15:05<00:00,  5.36s/it]


Epoch 6 | Average Loss: 0.0876800640811317


100%|██████████| 169/169 [15:08<00:00,  5.37s/it]


Epoch 7 | Average Loss: 0.06996427767099977


100%|██████████| 169/169 [15:07<00:00,  5.37s/it]


Epoch 8 | Average Loss: 0.061687500877360975


100%|██████████| 169/169 [15:01<00:00,  5.33s/it]


Epoch 9 | Average Loss: 0.060441892891681406


100%|██████████| 169/169 [14:58<00:00,  5.32s/it]


Epoch 10 | Average Loss: 0.05259761822646892


100%|██████████| 169/169 [14:42<00:00,  5.22s/it]


Epoch 11 | Average Loss: 0.03802355587173849


100%|██████████| 169/169 [14:59<00:00,  5.32s/it]


Epoch 12 | Average Loss: 0.050278023399203224


100%|██████████| 169/169 [14:44<00:00,  5.23s/it]


Epoch 13 | Average Loss: 0.024576406868053433


100%|██████████| 169/169 [14:41<00:00,  5.21s/it]


Epoch 14 | Average Loss: 0.038275990540095596


100%|██████████| 169/169 [15:04<00:00,  5.35s/it]


Epoch 15 | Average Loss: 0.027150426015875043


100%|██████████| 169/169 [15:11<00:00,  5.39s/it]


Epoch 16 | Average Loss: 0.021618822041882434


100%|██████████| 169/169 [14:55<00:00,  5.30s/it]


Epoch 17 | Average Loss: 0.02129587347925084


100%|██████████| 169/169 [14:42<00:00,  5.22s/it]


Epoch 18 | Average Loss: 0.03385663366485277


100%|██████████| 169/169 [14:54<00:00,  5.29s/it]


Epoch 19 | Average Loss: 0.03577738093176427


100%|██████████| 169/169 [15:05<00:00,  5.36s/it]


Epoch 20 | Average Loss: 0.019808668634969984
Evaluation Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.89      0.82      0.85       114
           1       0.72      0.74      0.73        86
           2       0.90      0.95      0.92       100

    accuracy                           0.84       300
   macro avg       0.83      0.84      0.83       300
weighted avg       0.84      0.84      0.84       300



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sau khi đã fine-tune mô hình
model.save_pretrained("phobert_sa")
tokenizer.save_pretrained("phobert_sa")