In [376]:
# Thống kê số lượng data theo nhãn
count = {}
for line in open('texts_categories.txt', encoding="utf8"):
    key = line.split()[0]
    count[key] = count.get(key, 0) + 1

for key in count:
    print(key, count[key])

__label__thể_thao 200
__label__chấn_thương 200


In [377]:
# Thống kê các word xuất hiện ở tất cả các nhãn
total_label = 3
vocab = {}
label_vocab = {}
for line in open('texts_categories.txt', encoding="utf8"):
    words = line.split()
    # lưu ý từ đầu tiên là nhãn
    label = words[0]
    if label not in label_vocab:
        label_vocab[label] = {}
    for word in words[1:]:
        label_vocab[label][word] = label_vocab[label].get(word, 0) + 1
        if word not in vocab:
            vocab[word] = set()
        vocab[word].add(label)

count = {}
for word in vocab:
    if len(vocab[word]) == total_label:
        count[word] = min([label_vocab[x][word] for x in label_vocab])
        
sorted_count = sorted(count, key=count.get, reverse=True)
for word in sorted_count[:100]:
    print(word, count[word])

In [378]:
# loại stopword khỏi dữ liệu
# lưu file dùng về sau
stopword = set()
with open('stopwords.txt', 'w', encoding="utf8") as fp:
    for word in sorted_count[:100]:
        stopword.add(word)
        fp.write(word + '\n')
    
def remove_stopwords(line):
    words = []
    for word in line.strip().split():
        if word not in stopword:
            words.append(word)
    return ' '.join(words)
    
    
with open('texts_categories.prep', 'w', encoding="utf8") as fp:
    for line in open('texts_categories.txt', encoding="utf8"):
        line = remove_stopwords(line)
        fp.write(line + '\n')

In [379]:
# Chia tập train/test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
test_percent = 0.2

text = []
label = []

for line in open('texts_categories.prep', encoding="utf8"):
    words = line.strip().split()
    label.append(words[0])
    text.append(' '.join(words[1:]))

X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=test_percent, random_state=42)

# Lưu train/test data
# Giữ nguyên train/test để về sau so sánh các mô hình cho công bằng
with open('train.txt', 'w', encoding="utf8") as fp:
    for x, y in zip(X_train, y_train):
        fp.write('{} {}\n'.format(y, x))

with open('test.txt', 'w', encoding="utf8") as fp:
    for x, y in zip(X_test, y_test):
        fp.write('{} {}\n'.format(y, x))

# encode label
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print(list(label_encoder.classes_), '\n')
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

print(X_train[1], y_train[1], '\n')
print(X_test[0], y_test[0])

['__label__chấn_thương', '__label__thể_thao'] 

chị hằng nga mẹ bé cho biết trong quá_trình tập_luyện và thi_đấu không ít lần quang_minh gặp chấn_thương trên sân_cỏ tuy rất xót con nhưng khi bé khẳng_định mình không sao cả chị thấy vững_tin về ý_chí con đã trui_rèn được từ thể_thao tôi biết con đá mệt đôi lúc té trên sân rồi bước ra ngoài còn rất đau nhưng tôi vẫn phải vững_lòng để tiếp động_lực giúp con lấy lại tinh_thần thi_đấu chị cũng cảm_thấy vui khi so với tuổi quang_minh khá tự_lập trưởng_thành biết đặt mục_tiêu và kiên_trì chinh_phục ước_mơ theo bóng_đá chuyên_nghiệp trong tương_lai 1 

dù khi bị đứt dây_chằng đầu_gối người_bệnh vẫn có_thể đi_lại tuy_nhiên chân bị chấn_thương sẽ khó di_chuyển hơn không_thể uốn_cong và gập đầu_gối như bình_thường một_số người có_thể cảm_nhận tình_trạng lỏng_lẻo ở khớp gối ngoài_ra người_bệnh có_thể nghe thấy tiếng lách_cách hoặc lạo_xạo ở khu_vực bị_thương khớp gối lõm vào hoặc co_thắt cơ 0


In [380]:
MODEL_PATH = "models"

import os
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

In [381]:
import pickle
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(solver='lbfgs', 
                                                multi_class='auto',
                                                max_iter=10000))
                    ])
text_clf = text_clf.fit(X_train, y_train)
 
train_time = time.time() - start_time
print('Done training Logistic Regression in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "lr.pkl"), 'wb'))

Done training Logistic Regression in 0.2136993408203125 seconds.


In [382]:
from sklearn.neighbors import KNeighborsClassifier

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', KNeighborsClassifier())
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training K-Nearest Neighbors in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "knn.pkl"), 'wb'))

Done training K-Nearest Neighbors in 0.053968191146850586 seconds.


In [383]:
from sklearn.tree import DecisionTreeClassifier

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', DecisionTreeClassifier())
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training Decision Tree Classifier in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "dtc.pkl"), 'wb'))

Done training Decision Tree Classifier in 0.07795834541320801 seconds.


In [384]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', RandomForestClassifier())
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training Random Forest Classifier in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "rfc.pkl"), 'wb'))

Done training Random Forest Classifier in 0.44156670570373535 seconds.


In [385]:
from sklearn.ensemble import AdaBoostClassifier

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', AdaBoostClassifier())
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training AdaBoost Classifier in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "abc.pkl"), 'wb'))

Done training AdaBoost Classifier in 0.39243125915527344 seconds.


In [386]:
from sklearn.naive_bayes import MultinomialNB

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training Naive Bayes in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "nb.pkl"), 'wb'))

Done training Naive Bayes in 0.15761470794677734 seconds.


In [387]:
from sklearn.svm import SVC

start_time = time.time()
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(gamma='scale'))
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training SVM in', train_time, 'seconds.')

# Save model
pickle.dump(text_clf, open(os.path.join(MODEL_PATH, "svm.pkl"), 'wb'))

Done training SVM in 0.14736008644104004 seconds.


In [388]:
import numpy as np
# SVM
model = pickle.load(open(os.path.join(MODEL_PATH,"svm.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('SVM, Accuracy =', np.mean(y_pred == y_test))

# Logistic Regression
model = pickle.load(open(os.path.join(MODEL_PATH,"lr.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('Logistic Regression, Accuracy =', np.mean(y_pred == y_test))

# Random Forest Classifier
model = pickle.load(open(os.path.join(MODEL_PATH,"rfc.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('Random Forest Classifier, Accuracy =', np.mean(y_pred == y_test))

# K-Nearest Neighbors
model = pickle.load(open(os.path.join(MODEL_PATH,"knn.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('K-Nearest Neighbors, Accuracy =', np.mean(y_pred == y_test))

# Naive Bayes
model = pickle.load(open(os.path.join(MODEL_PATH,"nb.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('Naive Bayes, Accuracy =', np.mean(y_pred == y_test))

# AdaBoost Classifier
model = pickle.load(open(os.path.join(MODEL_PATH,"abc.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('AdaBoost Classifier, Accuracy =', np.mean(y_pred == y_test))

# Decision Tree Classifier
model = pickle.load(open(os.path.join(MODEL_PATH,"dtc.pkl"), 'rb'))
y_pred = model.predict(X_test)
print('Decision Tree Classifier, Accuracy =', np.mean(y_pred == y_test))

SVM, Accuracy = 0.9875
Logistic Regression, Accuracy = 0.975
Random Forest Classifier, Accuracy = 0.975
K-Nearest Neighbors, Accuracy = 0.9625
Naive Bayes, Accuracy = 0.9625
AdaBoost Classifier, Accuracy = 0.9125
Decision Tree Classifier, Accuracy = 0.8875


In [389]:
# Xem kết quả trên từng nhãn
from sklearn.metrics import classification_report

print("Classification Report of SVM:")
svm_model = pickle.load(open(os.path.join(MODEL_PATH,"svm.pkl"), 'rb'))
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm, target_names=list(label_encoder.classes_)))

print("Classification Report of LR:")
lr_model = pickle.load(open(os.path.join(MODEL_PATH,"lr.pkl"), 'rb'))
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_lr, target_names=list(label_encoder.classes_)))

print("Classification Report of RFC:")
rfc_model = pickle.load(open(os.path.join(MODEL_PATH,"rfc.pkl"), 'rb'))
y_pred_rfc = rfc_model.predict(X_test)
print(classification_report(y_test, y_pred_rfc, target_names=list(label_encoder.classes_)))

print("Classification Report of KNN:")
knn_model = pickle.load(open(os.path.join(MODEL_PATH,"knn.pkl"), 'rb'))
y_pred_knn = knn_model.predict(X_test)
print(classification_report(y_test, y_pred_knn, target_names=list(label_encoder.classes_)))

print("Classification Report of NB:")
nb_model = pickle.load(open(os.path.join(MODEL_PATH,"nb.pkl"), 'rb'))
y_pred_nb = nb_model.predict(X_test)
print(classification_report(y_test, y_pred_nb, target_names=list(label_encoder.classes_)))

print("Classification Report of ABC:")
abc_model = pickle.load(open(os.path.join(MODEL_PATH,"abc.pkl"), 'rb'))
y_pred_abc = abc_model.predict(X_test)
print(classification_report(y_test, y_pred_abc, target_names=list(label_encoder.classes_)))

print("Classification Report of DTC:")
dtc_model = pickle.load(open(os.path.join(MODEL_PATH,"dtc.pkl"), 'rb'))
y_pred_dtc = dtc_model.predict(X_test)
print(classification_report(y_test, y_pred_dtc, target_names=list(label_encoder.classes_)))

Classification Report of SVM:
                      precision    recall  f1-score   support

__label__chấn_thương       0.97      1.00      0.99        36
   __label__thể_thao       1.00      0.98      0.99        44

            accuracy                           0.99        80
           macro avg       0.99      0.99      0.99        80
        weighted avg       0.99      0.99      0.99        80

Classification Report of LR:
                      precision    recall  f1-score   support

__label__chấn_thương       0.97      0.97      0.97        36
   __label__thể_thao       0.98      0.98      0.98        44

            accuracy                           0.97        80
           macro avg       0.97      0.97      0.97        80
        weighted avg       0.97      0.97      0.97        80

Classification Report of RFC:
                      precision    recall  f1-score   support

__label__chấn_thương       0.95      1.00      0.97        36
   __label__thể_thao       1.00     

In [390]:
def text_preprocess(document):
    # xóa html code
    document = remove_html(document)
    # chuẩn hóa unicode
    document = convert_unicode(document)
    # chuẩn hóa cách gõ dấu tiếng Việt
    document = chuan_hoa_dau_cau_tieng_viet(document)
    # tách từ
    document = word_tokenize(document, format="text")
    # đưa về lower
    document = document.lower()
    # xóa các ký tự không cần thiết
    document = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',' ',document)
    # xóa khoảng trắng thừa
    document = re.sub(r'\s+', ' ', document).strip()
    return document

In [391]:
# Cài đặt một số hàm tiền xử lý văn bản cần thiết
!pip3 install --user underthesea
import regex as re
from underthesea import word_tokenize
 
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"
 
def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic
dicchar = loaddicchar()

# Hàm chuyển Unicode dựng sẵn về Unicde tổ hợp (phổ biến hơn)
def convert_unicode(txt):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

nguyen_am_to_ids = {}

for i in range(len(bang_nguyen_am)):
    for j in range(len(bang_nguyen_am[i]) - 1):
        nguyen_am_to_ids[bang_nguyen_am[i][j]] = (i, j)

def chuan_hoa_dau_tu_tieng_viet(word):
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0
    nguyen_am_index = []
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        return word

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            # for index2 in nguyen_am_index:
            #     if index2 != index:
            #         x, y = nguyen_am_to_ids[chars[index]]
            #         chars[index2] = bang_nguyen_am[x][0]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
        else:
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
    return ''.join(chars)


def is_valid_vietnam_word(word):
    chars = list(word)
    nguyen_am_index = -1
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x != -1:
            if nguyen_am_index == -1:
                nguyen_am_index = index
            else:
                if index - nguyen_am_index != 1:
                    return False
                nguyen_am_index = index
    return True


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """
        Chuyển câu tiếng việt về chuẩn gõ dấu kiểu cũ.
        :param sentence:
        :return:
        """
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        cw = re.sub(r'(^\p{P}*)([p{L}.]*\p{L}+)(\p{P}*$)', r'\1/\2/\3', word).split('/')
        # print(cw)
        if len(cw) == 3:
            cw[1] = chuan_hoa_dau_tu_tieng_viet(cw[1])
        words[index] = ''.join(cw)
    return ' '.join(words)

def remove_html(txt):
    return re.sub(r'<[^>]*>', '', txt)



In [392]:
def preprocessing_before_predict(document):
    document = text_preprocess(document)
    document = remove_stopwords(document)
    return document

def predict_label_lr(document,pred_document):
    label = lr_model.predict([pred_document])
    print('With LR:', label_encoder.inverse_transform(label))

def predict_label_knn(document,pred_document):
    label = knn_model.predict([pred_document])
    print('With KNN:', label_encoder.inverse_transform(label))

def predict_label_dtc(document,pred_document):
    label = dtc_model.predict([pred_document])
    print('With DTC:', label_encoder.inverse_transform(label))

def predict_label_rfc(document,pred_document):
    label = rfc_model.predict([pred_document])
    print('With RFC:', label_encoder.inverse_transform(label))

def predict_label_abc(document,pred_document):
    label = abc_model.predict([pred_document])
    print('With ABC:', label_encoder.inverse_transform(label))

def predict_label_nb(document,pred_document):
    label = nb_model.predict([pred_document])
    print('With NB:', label_encoder.inverse_transform(label))

def predict_label_svm(document,pred_document):
    label = svm_model.predict([pred_document])
    print('With SVM:', label_encoder.inverse_transform(label))

documents = []
documents.append("Trong thể thao, chấn thương là chuyện không thể tránh khỏi của vận động viên")
documents.append("Nghĩa đá bóng và chạy bộ rất thường xuyên")
documents.append("Nghĩa bị chấn thương khi tập luyện thể thao quá mức")
for i in documents:
    print('Predict label for "',i,'":')
    j = preprocessing_before_predict(i)
    predict_label_svm(i,j)
    predict_label_lr(i,j)
    predict_label_rfc(i,j)
    predict_label_knn(i,j)
    predict_label_nb(i,j)
    predict_label_abc(i,j)
    predict_label_dtc(i,j)
    print('\n')

Predict label for " Trong thể thao, chấn thương là chuyện không thể tránh khỏi của vận động viên ":
With SVM: ['__label__chấn_thương']
With LR: ['__label__chấn_thương']
With RFC: ['__label__chấn_thương']
With KNN: ['__label__chấn_thương']
With NB: ['__label__chấn_thương']
With ABC: ['__label__chấn_thương']
With DTC: ['__label__chấn_thương']


Predict label for " Nghĩa đá bóng và chạy bộ rất thường xuyên ":
With SVM: ['__label__thể_thao']
With LR: ['__label__thể_thao']
With RFC: ['__label__thể_thao']
With KNN: ['__label__chấn_thương']
With NB: ['__label__thể_thao']
With ABC: ['__label__thể_thao']
With DTC: ['__label__thể_thao']


Predict label for " Nghĩa bị chấn thương khi tập luyện thể thao quá mức ":
With SVM: ['__label__chấn_thương']
With LR: ['__label__chấn_thương']
With RFC: ['__label__chấn_thương']
With KNN: ['__label__chấn_thương']
With NB: ['__label__chấn_thương']
With ABC: ['__label__chấn_thương']
With DTC: ['__label__chấn_thương']


