In [1]:
!pip install underthesea
!pip install imbalanced-learn



In [2]:
!pip install unidecode



In [3]:
import time
import unicodedata

import numpy as np
import regex
import unidecode
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from underthesea import word_tokenize


In [4]:
# Read the stop-word file into a list, one entry per line.
with open("/stop_word_vn.txt", encoding="utf8") as stop_word_file:
    stop_word_pre = stop_word_file.read().splitlines()

# Base letter -> every accented Vietnamese variant that collapses onto it.
_ACCENT_GROUPS = {
    'a': 'àáạảãâầấậẩẫăằắặẳẵ',
    'A': 'ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ',
    'e': 'èéẹẻẽêềếệểễ',
    'E': 'ÈÉẸẺẼÊỀẾỆỂỄ',
    'o': 'òóọỏõôồốộổỗơờớợởỡ',
    'O': 'ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ',
    'i': 'ìíịỉĩ',
    'I': 'ÌÍỊỈĨ',
    'u': 'ùúụủũưừứựửữ',
    'U': 'ƯỪỨỰỬỮÙÚỤỦŨ',
    'y': 'ỳýỵỷỹ',
    'Y': 'ỲÝỴỶỸ',
    'D': 'Đ',
    'd': 'đ',
}

# Single-pass translation table. The variants are NFC-normalised while the
# table is built so that every key is exactly one precomposed code point,
# whatever normalisation form this source file happens to be saved in.
_ACCENT_TABLE = str.maketrans({
    accented: base
    for base, variants in _ACCENT_GROUPS.items()
    for accented in unicodedata.normalize('NFC', variants)
})


def no_accent(s):
    """Return ``s`` with Vietnamese diacritics removed.

    Every accented Vietnamese vowel is replaced by its bare ASCII vowel and
    ``Đ``/``đ`` by ``D``/``d``; letter case is preserved and all other
    characters are left as they are.

    The input is first normalised to NFC so that decomposed text (a base
    letter followed by combining marks) is folded the same way as precomposed
    text. Without that step the combining marks would survive untouched.

    Parameters
    ----------
    s : str
        Text to strip.

    Returns
    -------
    str
        The accent-free text.
    """
    return unicodedata.normalize('NFC', s).translate(_ACCENT_TABLE)


# Stop-word lookup set: every entry from the file plus its accent-stripped
# spelling, so both accented and unaccented tokens can be matched.
stop_word = set(stop_word_pre)
stop_word.update(no_accent(entry) for entry in stop_word_pre)

In [5]:
# Training corpus paths; each one is passed to load_training_data() as `directory`.
MULTI_DIRECT = '/sentiment_analysis_train.v1.0.txt'
# load_training_data() additionally runs specify_preprocess() on text read from this file.
VN_DATA_DIRECT = '/data_vn.txt'


def remove_character_not_ness(document):
    """Blank out unwanted characters and tidy the whitespace of ``document``.

    Two substitutions replace matches with a single space:

    1. any character that is not whitespace, not ``\\w``, not one of the
       listed lower-case Vietnamese letters and not an underscore;
    2. any character matching ``[^\\D]`` (a negated "non-digit" class, i.e.
       digits).

    Runs of whitespace are then collapsed to one space and the ends trimmed.
    """
    blanked_patterns = (
        r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',
        r'[^\D]',
    )
    for pattern in blanked_patterns:
        document = regex.sub(pattern, ' ', document)
    return regex.sub(r'\s+', ' ', document).strip()

def remove_stopwords(document):
    """Drop every whitespace-separated token of ``document`` that is in ``stop_word``.

    The surviving tokens are returned joined by single spaces, so leading,
    trailing and repeated whitespace in the input does not survive.
    """
    kept_tokens = (token for token in document.split() if token not in stop_word)
    return ' '.join(kept_tokens)

def specify_preprocess(document):
    """Lower-case ``document`` and fuse star-rating mentions into single tokens.

    Ratings from one to five written as ``N*``, ``N *``, ``N sao`` or ``Nsao``
    become ``' <word>_sao '`` (padded with spaces), and the spelled-out form
    ``<word> sao`` becomes ``<word>_sao``. Replacements are plain substring
    replacements applied in a fixed order: all digit forms first, grouped by
    suffix, then the spelled-out forms.
    """
    star_words = (
        ('1', 'một'),
        ('2', 'hai'),
        ('3', 'ba'),
        ('4', 'bốn'),
        ('5', 'năm'),
    )

    document = document.lower()

    # Digit forms, one suffix at a time so the replacement order is
    # "N*", "N *", "N sao", "Nsao", each for N = 1..5.
    for suffix in ('*', ' *', ' sao', 'sao'):
        for digit, word in star_words:
            document = document.replace(digit + suffix, ' ' + word + '_sao ')

    # Spelled-out forms last; these are joined without extra padding.
    for _, word in star_words:
        document = document.replace(word + ' sao', word + '_sao')

    return document

def text_preprocess(document):
    """Lower-case, clean and word-segment ``document``.

    The text is lower-cased, passed through remove_character_not_ness() and
    then through ``word_tokenize(..., format="text")``, whose result is
    returned. remove_stopwords() is not applied here.
    """
    cleaned = remove_character_not_ness(document.lower())
    return word_tokenize(cleaned, format="text")


def load_training_data(
        directory
):
    """Read a labelled corpus and return parallel ``(text, label)`` lists.

    Each line of the file is expected to be ``<label> <document>``: everything
    up to the first space is the label and the remainder is the document.
    Lines without a space (blank or label-only lines) carry no document and
    are skipped instead of raising ``IndexError``.

    Every document goes through text_preprocess(); when ``directory`` equals
    ``VN_DATA_DIRECT`` it goes through specify_preprocess() first. For each
    processed document an extra copy with accents removed and underscores
    turned into spaces is appended under the same label, unless that copy is
    identical to the processed document.

    Parameters
    ----------
    directory : str
        Path of the corpus file (UTF-8).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(text, label)``, index-aligned.
    """
    text = []
    label = []

    with open(directory, encoding="utf8") as f:
        data_set = f.read().splitlines()

    for data in data_set:
        # partition() never raises; an empty separator means the line had no
        # space and therefore no document part.
        tag, separator, document = data.partition(" ")
        if not separator:
            continue

        if directory == VN_DATA_DIRECT:
            document = specify_preprocess(document)
        document = text_preprocess(document)
        label.append(tag)
        text.append(document)

        # Second training sample: the same document without accents and
        # without the underscores that join multi-syllable tokens.
        unaccented = no_accent(document).replace('_', ' ')
        if unaccented != document:
            label.append(tag)
            text.append(unaccented)

    print("Text after processing:")
    print(*text[:10], sep='\n')
    return text, label

In [6]:
def train(text, label, predict_direct, test_direct):
    """Fit a TF-IDF + LinearSVC classifier, report hold-out metrics and write test predictions.

    Steps:

    1. Split ``text``/``label`` 90/10 with a fixed random state.
    2. Fit a ``CountVectorizer`` (1- to 5-grams, ``max_df=0.8``) ->
       ``TfidfTransformer`` -> ``LinearSVC`` pipeline on the training part.
    3. Print the training time, hold-out accuracy and a classification report.
    4. Read ``test_direct`` (one document per line), preprocess each line the
       same way as the training text, predict, print the first ten predicted
       labels and write all of them to ``predict_direct``, one per line.

    Parameters
    ----------
    text : list[str]
        Preprocessed training documents.
    label : list[str]
        Labels aligned with ``text``.
    predict_direct : str
        Output path for the predicted labels. The file is overwritten, so
        re-running the notebook does not pile new predictions onto old ones.
        When it equals ``VN_DIRECT_PREDICT`` the test documents also go
        through specify_preprocess().
    test_direct : str
        Path of the unlabelled test file (UTF-8).

    Notes
    -----
    ``VN_DIRECT_PREDICT`` is a notebook-level global that is looked up when
    this function runs, so it must be defined before the first call.

    NOTE(review): when ``text`` comes from load_training_data(), each document
    may be followed by an accent-stripped copy; the random split can put the
    two on opposite sides, which would inflate the hold-out score. Confirm
    whether that is acceptable.
    """
    X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.1, random_state=101)

    # The encoder is fitted on the training labels only.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    start_time = time.time()
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 5),
                                                  max_df=0.8,
                                                  )),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC())
                         ])
    text_clf.fit(X_train, y_train)

    train_time = time.time() - start_time
    print('Done training SVM in', train_time, 'seconds.')

    # One prediction pass feeds both the accuracy and the per-class report.
    y_pred = text_clf.predict(X_test)
    print('SVM, Accuracy =', np.mean(y_pred == y_test))
    print(classification_report(y_test, y_pred, target_names=list(label_encoder.classes_)))

    with open(test_direct, encoding="utf8") as f:
        data_set = f.read().splitlines()

    test = []
    for data in data_set:
        if predict_direct == VN_DIRECT_PREDICT:
            data = specify_preprocess(data)
        test.append(text_preprocess(data))

    predicted_codes = text_clf.predict(test)
    predict = list(label_encoder.inverse_transform(predicted_codes))
    print(predict[0:10])

    # Single open in "w" mode: one file handle for the whole write, and a
    # re-run replaces the previous results instead of appending to them.
    with open(predict_direct, "w", encoding="utf8") as ff:
        ff.writelines(predicted_label + '\n' for predicted_label in predict)

In [7]:
# Unlabelled test files; passed to train() as `test_direct`.
MULTI_DIRECT_TEST = '/sentiment_analysis_test_unlabel.v1.0.txt'
VN_DIRECT_TEST = '/test_vn.txt'
# Output files; passed to train() as `predict_direct`, which writes one predicted label per line.
MULTI_DIRECT_PREDICT = '/result_multi.txt'
# train() compares `predict_direct` against this constant to decide whether the
# test documents also go through specify_preprocess().
VN_DIRECT_PREDICT = '/result_vn.txt'

In [8]:
# Load the MULTI_DIRECT corpus, train/evaluate on it and write predictions for
# MULTI_DIRECT_TEST to MULTI_DIRECT_PREDICT.
train_text, train_label = load_training_data(MULTI_DIRECT)
print("Multilingual test")
train(train_text, train_label, MULTI_DIRECT_PREDICT, MULTI_DIRECT_TEST)

Text after processing:
good ratio price service good advices for the national_park and_village tốt
good ratio price service good advices for the national park and village tot
trang_thiết_bị vệ_sinh hơi cũ vệ_sinh sạch_sẽ nhân_viên thân_thiện địa_điểm tốt một khách_sạn sạch_sẽ thoải_mái dễ_chịu địa_điểm tốt
trang thiet bi ve sinh hoi cu ve sinh sach se nhan vien than thien dia diem tot mot khach san sach se thoai mai de chiu dia diem tot
friendly staff helpful for booking tours room was_clean and_quiet perfect location for ke_bang national park caves
friendly staff helpful for booking tours room was clean and quiet perfect location for ke bang national park caves
quá ồn_ào thất_vọng
qua on ao that vong
nice place to be to get around staff really takes care of_you perfect stay super staff
nice place to be to get around staff really takes care of you perfect stay super staff
Multilingual test
Done training SVM in 52.40573000907898 seconds.
SVM, Accuracy = 0.7350101055468223
              

In [9]:
# Same experiment on the VN_DATA_DIRECT corpus (presumably the reviews translated
# to Vietnamese, going by the label printed below; confirm). This overwrites
# train_text / train_label from the previous cell.
train_text, train_label = load_training_data(VN_DATA_DIRECT)
print("After translate")
train(train_text, train_label, VN_DIRECT_PREDICT, VN_DIRECT_TEST)

Text after processing:
tỷ_lệ tốt giá dịch_vụ lời khuyên tốt cho vườn_quốc_gia và làng tốt
ty le tot gia dich vu loi khuyen tot cho vuon quoc gia va lang tot
trang_thiết_bị vệ_sinh hơi cũ vệ_sinh sạch_sẽ nhân_viên thân_thiện địa_điểm tốt một khách_sạn sạch_sẽ thoải_mái dễ_chịu địa_điểm tốt
trang thiet bi ve sinh hoi cu ve sinh sach se nhan vien than thien dia diem tot mot khach san sach se thoai mai de chiu dia diem tot
nhân_viên thân_thiện hữu_ích để đặt tour du_lịch phòng sạch_sẽ và yên_tĩnh vị_trí hoàn_hảo cho hang_động công_viên quốc_gia ke_bang
nhan vien than thien huu ich de dat tour du lich phong sach se va yen tinh vi tri hoan hao cho hang dong cong vien quoc gia ke bang
quá ồn_ào thất_vọng
qua on ao that vong
nơi tốt_đẹp để có được xung_quanh nhân_viên thực_sự chăm_sóc bạn ở lại hoàn_hảo siêu_nhân_viên
noi tot dep de co duoc xung quanh nhan vien thuc su cham soc ban o lai hoan hao sieu nhan vien
After translate
Done training SVM in 57.52921533584595 seconds.
SVM, Accuracy = 0.6