In [1]:
# BATCH_SIZE = 100
# Single seed value reused for the Python, NumPy and TensorFlow RNGs seeded below.
SEED = 61

import re
import time
import string
import pickle
import random
import numpy as np
import pandas as pd
import tensorflow as tf
# import tensorflow_datasets as tfds
# Seed the Python, NumPy and TensorFlow random generators (in that order) from
# the shared SEED so that repeated runs draw the same random numbers.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED)
# tf.keras.utils.set_random_seed(SEED)

# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Flatten
# from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras import layers, Sequential

# from gensim.models.fasttext import FastText
# from gensim.models import Word2Vec

# import fasttext
# import fasttext.util

import torch
from transformers import AutoModel, AutoTokenizer

# from tqdm.notebook import tqdm_notebook

# from tqdm import tqdm

from tqdm import tqdm, notebook



# tqdm_notebook.pandas()

In [2]:
# Directory holding the cleaned CSV splits. On Kaggle this is the attached
# dataset folder; for a local run point it at 'data/cleaned' instead.
DATA_DIR = '../input/cleaned-sentiment-text'

# One CSV per split; only the directory differs between environments.
train_path = f'{DATA_DIR}/train.csv'
validation_path = f'{DATA_DIR}/validation.csv'
test_path = f'{DATA_DIR}/test.csv'

In [3]:
# The data is already cleaned and split into three parts: train, validation and test.
# Column 0 of each CSV is used as the DataFrame index.
train_df = pd.read_csv(train_path, index_col=0)
validation_df = pd.read_csv(validation_path, index_col=0)
test_df = pd.read_csv(test_path, index_col=0)

# Label vocabulary: string class name <-> integer id.
class2index = {'neg': 0, 'pos': 1}
index2class = {0: 'neg', 1: 'pos'}
NUM_CLASSES = len(train_df['label'].unique())

# X: first remaining column of every split, as NumPy arrays.
train_data = train_df.iloc[:, 0].to_numpy()
validation_data = validation_df.iloc[:, 0].to_numpy()
test_data = test_df.iloc[:, 0].to_numpy()
whole_data = np.concatenate((train_data, validation_data, test_data), dtype=object)

# y: second remaining column of every split, mapped to integer class ids.
# An unknown label raises KeyError.
label_train = np.array([class2index[name] for name in train_df.iloc[:, 1]])
label_validation = np.array([class2index[name] for name in validation_df.iloc[:, 1]])
label_test = np.array([class2index[name] for name in test_df.iloc[:, 1]])
label_whole = np.concatenate((label_train, label_validation, label_test))

# Quick summary of the split sizes and the classes seen in the training set.
print('Number of train data:', len(train_df))
print('Number of validation data:', len(validation_df))
print('Number of test data:', len(test_df))
print('Number of classes:', NUM_CLASSES, '->', train_df['label'].unique())

# MAXLEN = max([len(x.split()) for x in whole_data]) # = 492
# EMBEDDING_DIM = 100

train_df.head()

Number of train data: 29744
Number of validation data: 9920
Number of test data: 9920
Number of classes: 2 -> ['neg' 'pos']


Unnamed: 0,text,label
0,hôm_nay đi ngang quán quyết_định ghé mua quán ...,neg
1,đến_súp lơ trưa nắng gắt chủ_nhật hy_vọng súp ...,pos
2,hôm_qua xe đông khiếp lo bàn may_sao bàn lầu l...,pos
3,món ăn_ở ngon món bò kho chủ quán quán phục_vụ...,pos
4,đẹp trời đi mỳ mỳ vô_cùng mặn sợi bở bàn phục_...,neg


In [4]:
def load_bert():
    """Load the pretrained ``vinai/phobert-base`` model and its tokenizer.

    The tokenizer is requested with ``use_fast=False``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by ``AutoModel`` and
        ``AutoTokenizer``.
    """
    checkpoint = "vinai/phobert-base"
    return (
        AutoModel.from_pretrained(checkpoint),
        AutoTokenizer.from_pretrained(checkpoint, use_fast=False),
    )


phobert, tokenizer = load_bert()

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def phobert_embed_sentence(padded, mask, model=phobert):
    """Embed a single tokenized, padded sentence with the given model.

    Args:
        padded: Token ids of one tokenized and padded sentence.
        mask: Attention mask matching ``padded``.
        model: Encoder to run; defaults to the PhoBERT model loaded above.

    Returns:
        numpy.ndarray: Flattened hidden state of the first output token.
    """
    input_ids = torch.tensor(padded).to(torch.long)
    attention_mask = torch.tensor(mask)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)[0]

    # Position 0 along the sequence axis is the first output token ([CLS]).
    first_token = hidden_states[:, 0, :]
    return first_token.numpy().flatten()

def phobert_embed_data(data, tokenizer=tokenizer):
    """Embed every sentence of a dataset with PhoBERT, one sentence at a time.

    Each sentence is tokenized (truncated to the tokenizer's maximum length),
    post-padded to that same length with the padding id, and passed together
    with its attention mask to ``phobert_embed_sentence``.

    Args:
        data: Iterable of raw sentences to embed.
        tokenizer: Tokenizer providing ``encode`` and ``model_max_length``;
            defaults to the PhoBERT tokenizer loaded above.

    Returns:
        numpy.ndarray: 2-D array with one embedding row per input sentence, in
        input order (at least float32). For empty ``data`` an empty 1-D array
        is returned.
    """
    MAX_LENGTH = tokenizer.model_max_length  # phobert default max sequence length = 256
    PAD_TOKEN_ID = 1  # 1 is the pre-defined padding value of PhoBERT

    # Collect rows in a list and stack once at the end. Concatenating onto a
    # growing array inside the loop copies every previous row on each
    # iteration, which is quadratic in the number of sentences.
    embedded_rows = []
    for line in tqdm(data):
        tokenized_line = tokenizer.encode(line, max_length=MAX_LENGTH, truncation=True)
        # Pad to the fixed max length; nothing is truncated here because the
        # tokenizer already truncated to MAX_LENGTH.
        padded_line = pad_sequences(
            [tokenized_line], maxlen=MAX_LENGTH, padding='post', value=PAD_TOKEN_ID
        )
        # Attention mask: 0 on padding positions, 1 elsewhere, so PhoBERT
        # attends to the non-padded tokens only.
        mask = np.where(padded_line == PAD_TOKEN_ID, 0, 1)

        embedded_rows.append(phobert_embed_sentence(padded_line, mask))

    if not embedded_rows:
        # Nothing to embed: keep the historical empty 1-D result.
        return np.array([])

    stacked = np.stack(embedded_rows)
    # The previous implementation accumulated into a float32 array, so keep
    # the same dtype promotion (a no-op for float32 embeddings).
    return stacked.astype(np.result_type(np.float32, stacked.dtype), copy=False)

In [6]:
# Embed the training split and persist it as plain text so it can be reloaded
# later without recomputation (the recorded run below took about 3.5 hours).
phobert_train = phobert_embed_data(data=train_data)
np.savetxt(fname='./PhoBertEmbeddingTrain.txt', X=phobert_train)

100%|██████████| 29744/29744 [3:26:57<00:00,  2.40it/s]


In [7]:
# phobert_validation = phobert_embed_data(validation_data)
# np.savetxt('./PhoBertEmbeddingValidation.txt', phobert_validation)

In [8]:
# phobert_test = phobert_embed_data(test_data)
# np.savetxt('./PhoBertEmbeddingTest.txt', phobert_test)

In [9]:
# # load data
# phobert_train = np.loadtxt('./PhoBertEmbeddingTrain.txt')
# phobert_validation = np.loadtxt('./PhoBertEmbeddingValidation.txt')
# phobert_test = np.loadtxt('./PhoBertEmbeddingTest.txt')

In [10]:
# ref: https://miai.vn/2020/12/29/bert-series-chuong-3-thu-nhan-dien-cam-xuc-van-ban-tieng-viet-voi-phobert-cach-1/

# Drafts

In [11]:
# # Convert to tensors
# temp_padded = torch.tensor(padded_test).to(torch.long)
# print("Pad = ",temp_padded.size())
# temp_mask = torch.tensor(mask_test)

# # Extract the output features from BERT
# with torch.no_grad():
# #     last_hidden_states = phobert(input_ids=temp_padded[:2], attention_mask=temp_mask[:2])
#     last_hidden_states = phobert(input_ids=temp_padded, attention_mask=temp_mask)

# v_features = last_hidden_states[0][:, 0, :].numpy()
# # emb_vecs = features[0]#.cpu().numpy()[0] #[1:-1]