# Features extraction using PhoBERT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import re

import torch
import torch.nn as nn
from torchinfo import summary
from torch.utils.data import TensorDataset, DataLoader

from transformers import AutoModel, AutoTokenizer
import underthesea

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_bert():
    phobert = AutoModel.from_pretrained('vinai/phobert-base')
    phobert_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base', use_fast=False)
    return phobert, phobert_tokenizer

In [3]:
def preprocess(s):
    s = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})', '', s)
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r'\d', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s

In [4]:
with open('stopwords/vietnamese-stopwords.txt', encoding='utf-8') as f:
    stopwords = f.readlines()
    stopwords = [word.rstrip() for word in stopwords]

In [5]:
df = pd.read_csv('./data/vn_news_223_tdlfr.csv')
df.drop(columns=['domain'],inplace=True)
df

Unnamed: 0,text,label
0,Thủ tướng Abe cúi đầu xin lỗi vì hành động phi...,1
1,Thủ tướng Nhật cúi đầu xin lỗi vì tinh thần ph...,1
2,Choáng! Cơ trưởng đeo khăn quàng quẩy banh nóc...,1
3,Chưa bao giờ nhạc Kpop lại dễ hát đến thế!!!\r...,1
4,"Đại học Hutech sẽ áp dụng cải cách ""Tiếq Việt""...",1
...,...,...
218,“Siêu máy bay” A350 sẽ chở CĐV Việt Nam đi Mal...,0
219,Thưởng 20.000 USD cho đội tuyển cờ vua Việt Na...,0
220,Trường Sơn giành HCV tại giải cờ vua đồng đội ...,0
221,Chuyện về chàng sinh viên Luật - Kiện tướng Lê...,0


In [6]:
X_df = df.drop('label', axis=1)
y_df = df.label

In [7]:
phobert, phobert_tokenizer = load_bert()

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def make_bert_features(data, stop_words, bert, tokenizer):
    tokenized_list = list()
    max_len = 200
    for sequence in data.values:
        tokens = underthesea.word_tokenize(sequence[0])
        token_list = list()
        for token in tokens:
            token = preprocess(token)
            if token not in stop_words and token !='':
                token_list.append(token)
        sequence = ' '.join(token_list)
        sequence = underthesea.word_tokenize(sequence, format='text')
        tokenized = tokenizer.encode(sequence[:max_len])
        tokenized_list.append(tokenized)

    padding = np.array([i + [1]*(max_len - len(i)) for i in tokenized_list])
    print('Shape after padding: ', padding.shape)

    attention_mask = np.where(padding == 1, 0, 1)
    print('Attention mask shape: ', attention_mask.shape)

    padded = torch.tensor(padding).to(torch.long)
    print('Pad: ', padded.shape)
    attention_mask = torch.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = bert(input_ids=padded, attention_mask=attention_mask)
    features = last_hidden_states[0][:, 0, :].numpy()
    print(features.shape)
    return features

In [9]:
features = make_bert_features(X_df, stopwords, phobert, phobert_tokenizer)

Shape after padding:  (223, 200)
Attention mask shape:  (223, 200)
Pad:  torch.Size([223, 200])
(223, 768)


In [10]:
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    y_df,
                                                    test_size=0.2,
                                                    stratify=y_df,
                                                    random_state=42)

In [11]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(178, 768)
(178,)
(45, 768)
(45,)


In [12]:
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 2, 4], 'gamma': [0.125, 0.25, 0.5, 1, 2, 4]}
clf = GridSearchCV(SVC(random_state=42), param_grid=parameters)
grid_search = clf.fit(x_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)

# best prarams
print('best prarams:', clf.best_params_)

Best score: 0.820
SVC(C=1, gamma=0.125, kernel='linear', random_state=42)
best prarams: {'C': 1, 'gamma': 0.125, 'kernel': 'linear'}


In [13]:
clf.best_estimator_.score(x_test, y_test)

0.8222222222222222

In [14]:
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(1000),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=10000
)

In [15]:
mlp_classifier.fit(x_train, y_train)

MLPClassifier(hidden_layer_sizes=1000, max_iter=10000, random_state=42)

In [16]:
mlp_classifier.score(x_test, y_test)

0.8666666666666667