In [None]:
!pip install underthesea

In [2]:
import time

import numpy as np
import regex
from sklearn.model_selection import train_test_split
from underthesea import word_tokenize
import gensim
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

In [3]:
with open(f"/content/stop_word_vn.txt", encoding="utf8") as f:
    stop_word_pre = f.read().splitlines()


def no_accent_vietnamese(s):
    s = regex.sub(r'[àáạảãâầấậẩẫăằắặẳẵ]', 'a', s)
    s = regex.sub(r'[ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ]', 'A', s)
    s = regex.sub(r'[èéẹẻẽêềếệểễ]', 'e', s)
    s = regex.sub(r'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E', s)
    s = regex.sub(r'[òóọỏõôồốộổỗơờớợởỡ]', 'o', s)
    s = regex.sub(r'[ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ]', 'O', s)
    s = regex.sub(r'[ìíịỉĩ]', 'i', s)
    s = regex.sub(r'[ÌÍỊỈĨ]', 'I', s)
    s = regex.sub(r'[ùúụủũưừứựửữ]', 'u', s)
    s = regex.sub(r'[ƯỪỨỰỬỮÙÚỤỦŨ]', 'U', s)
    s = regex.sub(r'[ỳýỵỷỹ]', 'y', s)
    s = regex.sub(r'[ỲÝỴỶỸ]', 'Y', s)
    s = regex.sub(r'[Đ]', 'D', s)
    s = regex.sub(r'[đ]', 'd', s)
    return s

stop_word = set()
for a in stop_word_pre:
  stop_word.add(a)
  stop_word.add(no_accent_vietnamese(a))

In [5]:
VN_DATA_DIRECT = '/content/sentiment_analysis_train.v1.0.txt'

# with open(f"/content/stop_word_vn.txt", encoding="utf8") as f:
#     stop_word = f.read().splitlines()


def remove_character_not_ness(document):
    document = regex.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]', ' ', document)
    document = regex.sub(r'[^\D]', ' ', document)
    document = regex.sub(r'\s+', ' ', document).strip()
    return document


def remove_stopwords(document):
    words = []
    for word in document.strip().split():
        if word not in stop_word:
            words.append(word)
    return ' '.join(words)


def text_preprocess(document):
    document = document.lower()
    document = word_tokenize(document, format="text")
    document = remove_character_not_ness(document)
    # document = remove_stopwords(document)
    return document


text = []
label = []


def load_training_data(
        vn_directory
):
    with open(f"{vn_directory}", encoding="utf8") as f:
        data_set = f.read().splitlines()
        for data in data_set:
            document = data
            tmp = document.split(" ", 1)
            tmp[1] = text_preprocess(tmp[1])
            label.append(tmp[0])
            text.append(tmp[1])
            s = no_accent_vietnamese(tmp[1]).replace('_', ' ')
            if tmp[1] != s:
                label.append(tmp[0])
                text.append(s)


load_training_data(VN_DATA_DIRECT)

In [6]:
count = 0
for a in text:
  tmp = a.split(" ");
  count = max(count, len(tmp))
print(count)

1283


In [7]:
def txtTokenizer(texts):
    tokenizer = Tokenizer(num_words = count,lower=True)
    tokenizer.fit_on_texts(texts)

    word_index = tokenizer.word_index
    return tokenizer, word_index

tokenizer, word_index = txtTokenizer(text)
X = tokenizer.texts_to_sequences(text)
X = pad_sequences(X, count)
y = pd.get_dummies(label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

In [8]:
sentences = [[item.lower() for item in doc.split()] for doc in text]
word_model = gensim.models.Word2Vec(sentences, size=300, min_count=1, iter=10)
embedding_matrix = np.zeros((len(word_model.wv.vocab) + 1, 300))
for i, vec in enumerate(word_model.wv.vectors):
  embedding_matrix[i] = vec

In [15]:
print(word_model.wv.most_similar('xấu'))

[('hẹp', 0.8238487243652344), ('bẩn', 0.7613018751144409), ('chật_chội', 0.7332699298858643), ('chật', 0.730948805809021), ('dơ', 0.7281186580657959), ('củ', 0.7233487367630005), ('bí', 0.7185693383216858), ('cũ_kĩ', 0.7178505659103394), ('tuy_vậy', 0.7023990154266357), ('có_vẻ', 0.6997995376586914)]


In [9]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
model = Sequential()
model.add(Embedding(len(word_model.wv.vocab)+1,300,input_length=X.shape[1],weights=[embedding_matrix],trainable=False))
model.add(LSTM(300,return_sequences=False))
model.add(Dense(y.shape[1],activation="softmax"))
model.summary()
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['acc'])

batch = 128
epochs = 14
model.fit(X_train,y_train,batch,epochs)
model.evaluate(X_test,y_test)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1283, 300)         10902900  
                                                                 
 lstm (LSTM)                 (None, 300)               721200    
                                                                 
 dense (Dense)               (None, 5)                 1505      
                                                                 
Total params: 11,625,605
Trainable params: 722,705
Non-trainable params: 10,902,900
_________________________________________________________________
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


[1.3098299503326416, 0.6887682676315308]

In [10]:
test = []
with open(f"/content/sentiment_analysis_test_unlabel.v1.0.txt", encoding="utf8") as f:
  data_set = f.read().splitlines()
  for data in data_set:
    data = text_preprocess(data)
    test.append(data)

In [13]:
seq = tokenizer.texts_to_sequences(test)
padded = pad_sequences(seq, count)
pred = model.predict(padded)
labels = ['__label__kem', '__label__rat_kem', '__label__tot', '__label__trung_binh', '__label__xuat_sac']
y = pd.get_dummies(label)
print(labels[np.argmax(pred[1])])

__label__trung_binh


In [16]:
for p in pred:
    ff = open(f"/content/result-multi.txt", "a", encoding="utf8")
    ff.write(labels[np.argmax(p)] + '\n')
    ff.close()