# 1. Load the data preprocessed 

Some useable techniques for Vietnamese:
- lower casing
- punctuation removal
- stopwords removal
- frequent words removal

In [2]:
# convert data preprocessed to scr_matrix scipy
# each feature have the same labels file y_data, y_test
import numpy as np
from scipy.sparse import save_npz, load_npz

from sklearn.model_selection import train_test_split

# save and load model
from joblib import dump, load
# if-idf features
import sklearn.naive_bayes as naive_bayes
import sklearn.metrics as metrics
import pickle

from tqdm.notebook import tqdm 

In [3]:
def saveData(data_name):
    # data_name: a string
    data = np.load('%s.npy' %data_name)
    save_npz('%s.npz'%data_name, data.item())

# training data
    # count features
saveData('X_data_count')
    # tf-idf features
saveData('X_data_tfidf')
saveData('X_data_tfidf_ngram')
saveData('X_data_tfidf_ngram_char')

# testing data
    # count features
saveData('X_test_count')
    # tf-idf features
saveData('X_test_tfidf')
saveData('X_test_tfidf_ngram')
saveData('X_test_tfidf_ngram_char')

FileNotFoundError: [Errno 2] No such file or directory: 'X_data_count.npy'

In [None]:
# tf-idf with reduced dimensions is numpy type so don't need to convert to scipy data type

# 2. Model

In [4]:
# training the model
def trainModel(classifier, X_data, y_data, X_test, y_test, is_neuralnet=False, n_epochs=5):       
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.3, random_state=42)
    
    if is_neuralnet:
        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512)
        
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(X_train, y_train)
    
        train_predictions = classifier.predict(X_train)
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        
    print("Validation accuracy: ", metrics.accuracy_score(val_predictions, y_val))
    print("Test accuracy: ", metrics.accuracy_score(test_predictions, y_test))
    return (classifier, X_train, X_val, y_train, y_val)

In [49]:
# saving trained model
def saveModel(model, model_name):
    dump(model, '%s.joblib' %model_name)

In [7]:
# training data
X_data_tfidf = load_npz("data_compressed/X_data_tfidf.npz")
y_data = pickle.load(open('VNTC_data/y_data.pkl', 'rb'))
# testing data
X_test_tfidf = load_npz("data_compressed/X_test_tfidf.npz")
y_test = pickle.load(open('VNTC_data/y_test.pkl', 'rb'))

In [8]:
# train the model
(tfidf_MultinomialNB, X_train, X_val, y_train, y_val) = trainModel(naive_bayes.MultinomialNB(), X_data_tfidf, y_data, X_test_tfidf, y_test, is_neuralnet=False)

Validation accuracy:  0.8531793048973144
Test accuracy:  0.8509121950251126


In [52]:
dump(tfidf_MultinomialNB, 'tfidf_MultinomialNB.joblib')

['tfidf_MultinomialNB.joblib']

In [53]:
tfidf_MultinomialNB = load('tfidf_MultinomialNB.joblib')

In [54]:
# stats about data used for training
num_classes = 10
for i in range(num_classes):
    print(tfidf_MultinomialNB.classes_[i], ":", tfidf_MultinomialNB.class_count_[i])
print("Total:", sum(tfidf_MultinomialNB.class_count_))

Chinh tri Xa hoi : 3671.0
Doi song : 2185.0
Khoa hoc : 1278.0
Kinh doanh : 1753.0
Phap luat : 2731.0
Suc khoe : 2368.0
The gioi : 2044.0
The thao : 3724.0
Van hoa : 2117.0
Vi tinh : 1760.0
Total: 23631.0


In [55]:
y_train_labels = tfidf_MultinomialNB.predict(X_train)

In [56]:
for i in set(sth):
    print(i, sth.tolist().count(i))

The thao 4206
Doi song 2682
The gioi 2066
Vi tinh 1813
Kinh doanh 1573
Khoa hoc 708
Suc khoe 2936
Chinh tri Xa hoi 5662
Van hoa 2367
Phap luat 2994


In [57]:
y_data_predicted = tfidf_MultinomialNB.predict(X_train)
print(metrics.classification_report(y_train, y_data_predicted))

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.67      0.94      0.78      3671
        Doi song       0.84      0.89      0.86      2185
        Khoa hoc       0.99      0.45      0.62      1278
      Kinh doanh       0.95      0.70      0.80      1753
       Phap luat       0.94      0.90      0.92      2731
        Suc khoe       0.87      0.93      0.90      2368
        The gioi       0.96      0.85      0.90      2044
        The thao       0.99      0.98      0.98      3724
         Van hoa       0.94      0.91      0.92      2117
         Vi tinh       0.96      0.87      0.91      1760

       micro avg       0.87      0.87      0.87     23631
       macro avg       0.91      0.84      0.86     23631
    weighted avg       0.90      0.87      0.87     23631



In [58]:
y_data_predicted = tfidf_MultinomialNB.predict(X_test_tfidf)
print(metrics.classification_report(y_test, y_data_predicted))

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.62      0.96      0.75      7567
        Doi song       0.68      0.68      0.68      2036
        Khoa hoc       0.98      0.33      0.49      2096
      Kinh doanh       0.96      0.65      0.78      5276
       Phap luat       0.89      0.87      0.88      3788
        Suc khoe       0.88      0.93      0.90      5417
        The gioi       0.96      0.85      0.90      6716
        The thao       0.97      0.97      0.97      6667
         Van hoa       0.92      0.91      0.92      6250
         Vi tinh       0.95      0.85      0.90      4560

       micro avg       0.85      0.85      0.85     50373
       macro avg       0.88      0.80      0.82     50373
    weighted avg       0.88      0.85      0.85     50373



In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the data (python list)
X_data = pickle.load(open('VNTC_data/X_data.pkl', 'rb'))
y_data = pickle.load(open('VNTC_data/y_data.pkl', 'rb'))

X_test = pickle.load(open('VNTC_data/X_test.pkl', 'rb'))
y_test = pickle.load(open('VNTC_data/y_test.pkl', 'rb'))

# word level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data) # learn vocabulary and idf from training set
X_data_tfidf =  tfidf_vect.transform(X_data)
# assume that we don't have test set before
X_test_tfidf =  tfidf_vect.transform(X_test)

In [64]:
with open('tfidf_vect_feature_names.txt', 'w', encoding='utf-8') as feature_file:
    for i in tfidf_vect.get_feature_names():
        feature_file.write('%s \n' %i)
feature_file.close()

In [2]:
with open('vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as stop_words_file:
    stop_words = stop_words_file.readlines()
    stop_words_file.close()
with open('tfidf_vect_feature_names.txt', 'r', encoding='utf-8') as feature_file:
    feature_names = feature_file.readlines()
    feature_file.close()

In [11]:
count = 0
for i in stop_words:
    for j in feature_names:
        if i.rstrip().strip() == j.rstrip().strip():
            count += 1
print(count)

670


In [12]:
print(len(feature_names))

30000


In [13]:
print(len(stop_words))

1942


In [15]:

# Load the data (python list)
X_data = pickle.load(open('VNTC_data/X_data.pkl', 'rb'))
y_data = pickle.load(open('VNTC_data/y_data.pkl', 'rb'))

X_test = pickle.load(open('VNTC_data/X_test.pkl', 'rb'))
y_test = pickle.load(open('VNTC_data/y_test.pkl', 'rb'))

In [21]:
# remove stopwords

def removeStopwords(data):
    # import stopwords
    with open('vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as stop_words_file:
        stop_words = stop_words_file.readlines()
        stop_words_file.close()
    # remove words
    new_data = []
    for text in tqdm(data):
        temp_data = text
        for word in stop_words:
            temp_data = temp_data.replace(word.rstrip().strip(), '')
        new_data.append(temp_data)
    return new_data

In [22]:
X_data_stopwords = removeStopwords(X_data)

HBox(children=(FloatProgress(value=0.0, max=33759.0), HTML(value='')))




In [23]:
X_test_stopwords = removeStopwords(X_test)

HBox(children=(FloatProgress(value=0.0, max=50373.0), HTML(value='')))




In [25]:
stop_words = []
with open('vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as stop_words_file:
    words = stop_words_file.readlines()
    for word in words:
        stop_words.append(word.rstrip().strip())
    stop_words_file.close()
print(stop_words)

['a_lô', 'a_ha', 'ai', 'ai_ai', 'ai_nấy', 'ai_đó', 'alô', 'amen', 'anh', 'anh_ấy', 'ba', 'ba_ba', 'ba_bản', 'ba_cùng', 'ba_họ', 'ba_ngày', 'ba_ngôi', 'ba_tăng', 'bao_giờ', 'bao_lâu', 'bao_nhiêu', 'bao_nả', 'bay_biến', 'biết', 'biết_bao', 'biết_bao_nhiêu', 'biết_chắc', 'biết_chừng_nào', 'biết_mình', 'biết_mấy', 'biết_thế', 'biết_trước', 'biết_việc', 'biết_đâu', 'biết_đâu_chừng', 'biết_đâu_đấy', 'biết_được', 'buổi', 'buổi_làm', 'buổi_mới', 'buổi_ngày', 'buổi_sớm', 'bà', 'bà_ấy', 'bài', 'bài_bác', 'bài_bỏ', 'bài_cái', 'bác', 'bán', 'bán_cấp', 'bán_dạ', 'bán_thế', 'bây_bẩy', 'bây_chừ', 'bây_giờ', 'bây_nhiêu', 'bèn', 'béng', 'bên', 'bên_bị', 'bên_có', 'bên_cạnh', 'bông', 'bước', 'bước_khỏi', 'bước_tới', 'bước_đi', 'bạn', 'bản', 'bản_bộ', 'bản_riêng', 'bản_thân', 'bản_ý', 'bất_chợt', 'bất_cứ', 'bất_giác', 'bất_kì', 'bất_kể', 'bất_kỳ', 'bất_luận', 'bất_ngờ', 'bất_nhược', 'bất_quá', 'bất_quá_chỉ', 'bất_thình_lình', 'bất_tử', 'bất_đồ', 'bấy', 'bấy_chầy', 'bấy_chừ', 'bấy_giờ', 'bấy_lâu', 'bấy_lâ

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(lowercase=False, analyzer='word', 
                             stop_words=stop_words, max_features=10000)

In [49]:
tfidf_vect.fit(X_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['a_lô', 'a_ha', 'ai', 'ai_ai', 'ai_nấy', 'ai_đó', 'alô', 'amen', 'anh', 'anh_ấy', 'ba', 'ba_ba', 'ba_bản', 'ba_cùng', 'ba_họ', 'ba_ngày', 'ba_ngôi', 'ba_tăng', 'bao_giờ', 'bao_lâu', 'bao_nhiêu', 'bao_nả', 'bay_biến', 'biết', 'biết_bao', 'biết_bao_nhiêu', 'biết_chắc', 'biết_chừng_nào', 'b... 'ở_vào', 'ở_đây', 'ở_đó', 'ở_được', 'ủa', 'ứ_hự', 'ứ_ừ', 'ừ', 'ừ_nhé', 'ừ_thì', 'ừ_ào', 'ừ_ừ', 'ử'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [51]:
X_data_tfidf = tfidf_vect.transform(X_data)
X_test_tfidf = tfidf_vect.transform(X_test)

In [52]:
# train the model
(tfidf_MultinomialNB, X_train, X_val, y_train, y_val) = trainModel(naive_bayes.MultinomialNB(), X_data_tfidf, y_data, X_test_tfidf, y_test, is_neuralnet=False)

Validation accuracy:  0.8864533965244866
Test accuracy:  0.8881940722212296


In [66]:
def investigateModel(max_features):
    print('Max Features:', max_features)
    tfidf_vect = TfidfVectorizer(lowercase=False, analyzer='word', 
                                 stop_words=stop_words, max_features=max_features)
    tfidf_vect.fit(X_data)

    X_data_tfidf = tfidf_vect.transform(X_data)
    X_test_tfidf = tfidf_vect.transform(X_test)
    # train the model
    (tfidf_MultinomialNB, X_train, X_val, y_train, y_val) = trainModel(naive_bayes.MultinomialNB(), 
                                                                       X_data_tfidf, y_data, X_test_tfidf, y_test, 
                                                                       is_neuralnet=False)
    return (tfidf_MultinomialNB, X_data_tfidf, X_test_tfidf)

In [58]:
for i in [1000, 2000, 5000, 10000, 15000, 20000, 30000]:
    investigateModel(i)
    print('\n')

Max Features: 1000
Validation accuracy:  0.8548578199052133
Test accuracy:  0.8630020050423838


Max Features: 2000
Validation accuracy:  0.8746050552922591
Test accuracy:  0.8802930141147043


Max Features: 5000
Validation accuracy:  0.8863546603475514
Test accuracy:  0.8895042979373871


Max Features: 10000
Validation accuracy:  0.8864533965244866
Test accuracy:  0.8881940722212296


Max Features: 15000
Validation accuracy:  0.8815165876777251
Test accuracy:  0.8817223512596034


Max Features: 20000
Validation accuracy:  0.8749012638230648
Test accuracy:  0.8753895936315089


Max Features: 30000
Validation accuracy:  0.8636453396524486
Test accuracy:  0.8640144521866873




In [59]:
for i in [3000, 4000, 6000, 7000, 8000, 9000]:
    investigateModel(i)
    print('\n')

Max Features: 3000
Validation accuracy:  0.8814178515007899
Test accuracy:  0.8855934726937049


Max Features: 4000
Validation accuracy:  0.8843799368088467
Test accuracy:  0.8879161455541659


Max Features: 6000
Validation accuracy:  0.8878357030015798
Test accuracy:  0.8899013360331924


Max Features: 7000
Validation accuracy:  0.8867496050552922
Test accuracy:  0.8902388184146269


Max Features: 8000
Validation accuracy:  0.8874407582938388
Test accuracy:  0.889802076509241


Max Features: 9000
Validation accuracy:  0.8872432859399684
Test accuracy:  0.8889087407936792




In [67]:
(tfidf_MultinomialNB_6000, X_data_tfidf, X_test_tfidf) = investigateModel(6000)

Max Features: 6000
Validation accuracy:  0.8878357030015798
Test accuracy:  0.8899013360331924


In [69]:
# training data
y_data_predicted = tfidf_MultinomialNB_6000.predict(X_data_tfidf)
print(metrics.classification_report(y_data, y_data_predicted))

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.80      0.86      0.83      5219
        Doi song       0.85      0.88      0.86      3159
        Khoa hoc       0.92      0.69      0.79      1820
      Kinh doanh       0.87      0.86      0.87      2552
       Phap luat       0.91      0.92      0.91      3868
        Suc khoe       0.88      0.94      0.91      3384
        The gioi       0.93      0.88      0.91      2898
        The thao       0.99      0.96      0.98      5298
         Van hoa       0.91      0.92      0.92      3080
         Vi tinh       0.93      0.93      0.93      2481

       micro avg       0.90      0.90      0.90     33759
       macro avg       0.90      0.88      0.89     33759
    weighted avg       0.90      0.90      0.90     33759



In [70]:
# validation data
y_test_predicted = tfidf_MultinomialNB_6000.predict(X_test_tfidf)
print(metrics.classification_report(y_test, y_test_predicted))

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.79      0.89      0.84      7567
        Doi song       0.71      0.71      0.71      2036
        Khoa hoc       0.88      0.58      0.70      2096
      Kinh doanh       0.92      0.84      0.88      5276
       Phap luat       0.85      0.90      0.88      3788
        Suc khoe       0.89      0.95      0.92      5417
        The gioi       0.94      0.89      0.92      6716
        The thao       0.99      0.96      0.97      6667
         Van hoa       0.91      0.93      0.92      6250
         Vi tinh       0.92      0.93      0.92      4560

       micro avg       0.89      0.89      0.89     50373
       macro avg       0.88      0.86      0.87     50373
    weighted avg       0.89      0.89      0.89     50373



In [81]:
import sklearn.linear_model as linear_model

def investigateModel(max_features, classifier):
    print('Max Features:', max_features)
    tfidf_vect = TfidfVectorizer(lowercase=False, analyzer='word', 
                                 stop_words=stop_words, max_features=max_features)
    tfidf_vect.fit(X_data)

    X_data_tfidf = tfidf_vect.transform(X_data)
    X_test_tfidf = tfidf_vect.transform(X_test)
    # train the model
    (trained_model, X_train, X_val, y_train, y_val) = trainModel(classifier, 
                                                                       X_data_tfidf, y_data, X_test_tfidf, y_test, 
                                                                       is_neuralnet=False)
    # statistical report 
        # training data
    y_data_predicted = trained_model.predict(X_data_tfidf)
    print(metrics.classification_report(y_data, y_data_predicted))
        # validation data
    y_test_predicted = trained_model.predict(X_test_tfidf)
    print(metrics.classification_report(y_test, y_test_predicted)) 
    return (tfidf_vect, classifier)

In [79]:
for i in [3000, 5000, 7000, 9000]:
    investigateModel(max_features=i, classifier=linear_model.LogisticRegression())

Max Features: 3000




Validation accuracy:  0.9031398104265402
Test accuracy:  0.9078673098683818
                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.85      0.90      0.88      5219
        Doi song       0.89      0.89      0.89      3159
        Khoa hoc       0.92      0.82      0.87      1820
      Kinh doanh       0.91      0.90      0.91      2552
       Phap luat       0.94      0.92      0.93      3868
        Suc khoe       0.91      0.95      0.93      3384
        The gioi       0.94      0.91      0.93      2898
        The thao       0.99      0.98      0.99      5298
         Van hoa       0.95      0.93      0.94      3080
         Vi tinh       0.94      0.95      0.95      2481

       micro avg       0.92      0.92      0.92     33759
       macro avg       0.92      0.92      0.92     33759
    weighted avg       0.92      0.92      0.92     33759

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.83      0.91      0.87   

In [80]:
for i in [10000, 15000, 20000]:
    investigateModel(max_features=i, classifier=linear_model.LogisticRegression())

Max Features: 10000
Validation accuracy:  0.9084715639810427
Test accuracy:  0.9136640660671391
                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.86      0.91      0.89      5219
        Doi song       0.90      0.90      0.90      3159
        Khoa hoc       0.94      0.84      0.89      1820
      Kinh doanh       0.93      0.91      0.92      2552
       Phap luat       0.95      0.93      0.94      3868
        Suc khoe       0.92      0.96      0.94      3384
        The gioi       0.95      0.92      0.94      2898
        The thao       0.99      0.99      0.99      5298
         Van hoa       0.95      0.95      0.95      3080
         Vi tinh       0.95      0.96      0.96      2481

       micro avg       0.93      0.93      0.93     33759
       macro avg       0.94      0.93      0.93     33759
    weighted avg       0.93      0.93      0.93     33759

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.83   

In [82]:
tfidf_vect, logistic_regression_15000 = investigateModel(15000, classifier=linear_model.LogisticRegression())

Max Features: 15000




Validation accuracy:  0.9092614533965245
Test accuracy:  0.9136640660671391
                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.87      0.91      0.89      5219
        Doi song       0.90      0.90      0.90      3159
        Khoa hoc       0.94      0.84      0.89      1820
      Kinh doanh       0.93      0.91      0.92      2552
       Phap luat       0.95      0.93      0.94      3868
        Suc khoe       0.92      0.96      0.94      3384
        The gioi       0.96      0.93      0.94      2898
        The thao       0.99      0.99      0.99      5298
         Van hoa       0.96      0.95      0.95      3080
         Vi tinh       0.95      0.96      0.96      2481

       micro avg       0.93      0.93      0.93     33759
       macro avg       0.94      0.93      0.93     33759
    weighted avg       0.94      0.93      0.93     33759

                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.83      0.92      0.87   

In [84]:
print(tfidf_vect.get_feature_names())

['aa', 'aaron', 'ab', 'aba', 'abbas', 'abbey', 'abc', 'abd', 'abdel', 'abdul', 'abdullah', 'abramovich', 'abtc', 'abu', 'ac', 'acb', 'access', 'account', 'ace', 'aceh', 'acer', 'achilefu', 'acid', 'acid_amin', 'acid_béo', 'activex', 'ad', 'adam', 'adams', 'adb', 'add', 'addvote', 'adelaide', 'adidas', 'adn', 'adobe', 'adrian', 'adriano', 'adsl', 'adu', 'advanced', 'advocaat', 'ae', 'afc', 'aff', 'affleck', 'afghanistan', 'afp', 'afta', 'ag', 'agassi', 'agf', 'agnelli', 'agresto', 'agribank', 'agricole', 'agu', 'ah', 'ahmad', 'ahmadinejad', 'ahmed', 'ahn', 'ai_bảo', 'ai_lại', 'ai_ngờ', 'aids', 'aimar', 'air', 'airbus', 'aires', 'airlines', 'airways', 'ajax', 'ak', 'al', 'alain', 'alam', 'alan', 'alaska', 'alaves', 'albacete', 'albania', 'albelda', 'albert', 'alberto', 'album', 'alessandro', 'alessio', 'alex', 'alexander', 'alexandria', 'alexei', 'alfred', 'algeria', 'ali', 'alice', 'alkmaar', 'all', 'allah', 'allardyce', 'allawi', 'allen', 'almeida', 'almunia', 'alonso', 'alpha', 'alpi'