In [1]:
# using VNTC-data
# https://github.com/duyvuleo/VNTC

In [2]:
# tokenizer and POS tagger
from pyvi import ViTokenizer, ViPosTagger 
from underthesea import word_tokenize, pos_tag
# progress bar
from tqdm.notebook import tqdm 
# save and load sklearn models
from joblib import dump, load 
# simple text preprocessing: remove special characters and numeric characters
import gensim 
# path
import os 
# save raw data text file 
import pickle  
import numpy as np
import time

In [3]:
# tfidf features
from sklearn.feature_extraction.text import TfidfVectorizer
# data manipulation and models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# metrics
from sklearn import metrics
# save and load model
from joblib import dump, load

In [4]:
# Candidate classifiers (library-default hyper-parameters), grouped by family.
# The per-family lists are what the training cells further down iterate over;
# defining them here lets the notebook run top-to-bottom on a fresh kernel.
linear_model = [RidgeClassifier(), SGDClassifier(), Perceptron(),
                PassiveAggressiveClassifier(), LogisticRegression()]   # linear
naive_bayes_model = [BernoulliNB(), ComplementNB(), MultinomialNB()]   # naive bayes
SVM_model = [LinearSVC()]                                              # SVM
ensemble_model = [RandomForestClassifier()]                            # ensemble
xgboost_model = [XGBClassifier()]                                      # boosting

# Display names are the estimator class names, so they cannot drift out of
# sync with the estimator lists above.
linear_model_names = [type(clf).__name__ for clf in linear_model]
naive_bayes_model_names = [type(clf).__name__ for clf in naive_bayes_model]
SVM_model_names = [type(clf).__name__ for clf in SVM_model]
ensemble_model_names = [type(clf).__name__ for clf in ensemble_model]
xgboost_model_names = [type(clf).__name__ for clf in xgboost_model]

# Flat views over every family, in the same order as before.
models = linear_model + naive_bayes_model + SVM_model + ensemble_model + xgboost_model
model_names = (linear_model_names + naive_bayes_model_names + SVM_model_names
               + ensemble_model_names + xgboost_model_names)

# Data Preprocessing

In [31]:
dir_path = 'VNTC_data'


def get_data(folder_path):
    """Read one VNTC split laid out as ``<folder_path>/<label>/<document>``.

    Every document is read as UTF-16, passed through
    ``gensim.utils.simple_preprocess`` (to drop special characters), re-joined
    with single spaces and then word-segmented with ``ViTokenizer.tokenize``.

    Args:
        folder_path: directory whose sub-directories are the class labels.

    Returns:
        ``(X, y)``: two parallel lists holding the processed document strings
        and, for each document, the name of the sub-directory it came from.
    """
    X = []
    y = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later seeded split) reproducible.
    for label in tqdm(sorted(os.listdir(folder_path))):
        label_dir = os.path.join(folder_path, label)
        if not os.path.isdir(label_dir):
            # stray files next to the label folders would crash the inner listdir
            continue
        for file_name in tqdm(sorted(os.listdir(label_dir))):
            with open(os.path.join(label_dir, file_name), 'r', encoding="utf-16") as f:
                text = ' '.join(f.readlines())
            # remove some special characters
            text = ' '.join(gensim.utils.simple_preprocess(text))
            # tokenizer
            text = ViTokenizer.tokenize(text)
            # text data
            X.append(text)
            # labels
            y.append(label)
    return X, y

In [32]:
train_path = os.path.join(dir_path, 'Train_Full')
X_data, y_data = get_data(train_path)
# save raw training data; the context managers flush and close each file
with open('VNTC_data/X_data.pkl', 'wb') as x_data_file:
    pickle.dump(X_data, x_data_file)
with open('VNTC_data/y_data.pkl', 'wb') as y_data_file:
    pickle.dump(y_data, y_data_file)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5219.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3159.0), HTML(value='')))

KeyboardInterrupt: 

In [5]:
test_path = os.path.join(dir_path, 'Test_Full')
X_test, y_test = get_data(test_path)
# save raw testing data; the context managers flush and close each file
with open('VNTC_data/X_test.pkl', 'wb') as x_test_file:
    pickle.dump(X_test, x_test_file)
with open('VNTC_data/y_test.pkl', 'wb') as y_test_file:
    pickle.dump(y_test, y_test_file)

NameError: name 'dir_path' is not defined

# Feature Engineering

In [5]:
# Load the data (python list)
X_data = pickle.load(open('VNTC_data/X_data.pkl', 'rb'))
y_data = pickle.load(open('VNTC_data/y_data.pkl', 'rb'))
X_test = pickle.load(open('VNTC_data/X_test.pkl', 'rb'))
y_test = pickle.load(open('VNTC_data/y_test.pkl', 'rb'))

In [6]:
# Document counts: training split, testing split, then both together.
n_train_docs = len(X_data)
n_test_docs = len(X_test)
print(n_train_docs)
print(n_test_docs)
print(n_train_docs + n_test_docs)

33759
50373
84132


In [71]:
# import list of stopwords
stop_words = []
with open('vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as file:
    for line in file.readlines():
        stop_words.append(line.rstrip().strip())
print(len(stop_words))

1942


In [72]:
# Append the second stop-word file to the list built above.
# Re-running this cell appends the same entries again.
with open('stopwords_tfidf_50000.txt', 'r', encoding='utf-8') as extra_stopword_file:
    stop_words.extend(entry.strip() for entry in extra_stopword_file)
print(len(stop_words))

4044


In [73]:
def createVectorizerData(X_data, X_test, stop_words, ngram_range, max_features):
    """Fit a TF-IDF vectorizer on the training texts and vectorize both splits.

    Args:
        X_data: list of training documents (already tokenized strings).
        X_test: list of testing documents.
        stop_words: list of tokens passed to ``TfidfVectorizer(stop_words=...)``.
        ngram_range: ``(min_n, max_n)`` tuple passed to the vectorizer.
        max_features: vocabulary size cap passed to the vectorizer (``None``
            means no cap).

    Returns:
        ``(tfidf_vect, X_data_tfidf, X_test_tfidf, X)`` where ``tfidf_vect`` is
        the fitted vectorizer, the next two items are the sparse TF-IDF
        matrices of the training and testing documents, and ``X`` is the
        matrix of ``X_data + X_test`` (training rows first).
    """
    # local import: scipy is only needed here, to stack the two matrices
    from scipy import sparse

    tfidf_vect = TfidfVectorizer(lowercase=False, analyzer='word',
                                 stop_words=stop_words, ngram_range=ngram_range,
                                 max_features=max_features)
    # TODO (original note): experiment with `norm` and `sublinear_tf` later.

    # fit_transform learns the vocabulary/IDF and vectorizes the training
    # texts in a single pass over the corpus
    X_data_tfidf = tfidf_vect.fit_transform(X_data)
    X_test_tfidf = tfidf_vect.transform(X_test)
    # Each row depends only on its own document and the fitted vocabulary, so
    # stacking the two matrices equals transforming X_data + X_test, without
    # tokenizing every document a second time.
    X = sparse.vstack([X_data_tfidf, X_test_tfidf], format='csr')
    return (tfidf_vect, X_data_tfidf, X_test_tfidf, X)

# Models

In [74]:
def _scoreSplit(classifier, features, true_labels):
    """Predict one split; return (predictions, classification report, confusion matrix)."""
    predicted = classifier.predict(features)
    report = metrics.classification_report(true_labels, predicted)
    matrix = metrics.confusion_matrix(true_labels, predicted, labels=classifier.classes_)
    return predicted, report, matrix


def trainModel(classifier, data, labels, train_data, train_labels, test_data, test_labels, stats_name, model_name):
    """Fit ``classifier``, write an evaluation report, and persist the model.

    The classifier is fitted on a fixed 80% subset of ``train_data``
    (``random_state=21``). A text report with the classification report and
    confusion matrix for the training, testing and whole data is written to
    ``report/<stats_name>.txt``; the fitted classifier is dumped to
    ``models/<model_name>.joblib``; the test accuracy is printed.

    Args:
        classifier: unfitted estimator exposing ``fit``, ``predict`` and
            ``classes_``; it is fitted in place.
        data, labels: features and labels of the whole data set.
        train_data, train_labels: features and labels of the training split.
        test_data, test_labels: features and labels of the testing split.
        stats_name: file stem of the report.
        model_name: file stem of the saved model.

    Returns:
        None.
    """
    # Only the 80% part is used for fitting.
    # NOTE(review): the held-out 20% is never scored on its own, and it is part
    # of ``train_data`` in the "Training data" section below.
    X_train, _, y_train, _ = train_test_split(train_data, train_labels, test_size=0.2, random_state=21)

    # train the model
    classifier.fit(X_train, y_train)

    # build the stats report: training data, testing data, whole data
    _, train_clf_rp, train_cfs_rp = _scoreSplit(classifier, train_data, train_labels)
    test_pred, test_clf_rp, test_cfs_rp = _scoreSplit(classifier, test_data, test_labels)
    _, clf_rp, cfs_rp = _scoreSplit(classifier, data, labels)

    # save the report (create the output folder on first use)
    os.makedirs('report', exist_ok=True)
    with open(os.path.join('report', '%s.txt' % stats_name), 'w', encoding='utf-8') as report_file:
        report_file.write('Training data:\n')
        report_file.write(train_clf_rp)
        report_file.write(np.array2string(train_cfs_rp, separator = ','))

        report_file.write('\n\nTesting data:\n')
        report_file.write(test_clf_rp)
        report_file.write(np.array2string(test_cfs_rp, separator = ','))

        report_file.write('\n\nWhole data:\n')
        report_file.write(clf_rp)
        report_file.write(np.array2string(cfs_rp, separator = ','))
        # the with-block closes the file; no explicit close() is needed

    # save the model
    os.makedirs('models', exist_ok=True)
    dump(classifier, os.path.join('models', '%s.joblib' % model_name))
    # print the accuracy (for choosing model); arguments are (y_true, y_pred)
    print('Testing data accuracy: ', metrics.accuracy_score(test_labels, test_pred))

Using tfidf unigram to choose the best model

In [13]:
# Unigram TF-IDF matrices with max_features=10000; the fitted vectorizer
# itself is discarded.
_, X_data_tfidf_11, X_test_tfidf_11, X_tfidf_11 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(1, 1),
    max_features=10000,
)

## Linear Models

In [47]:
# Train every linear classifier on the unigram TF-IDF matrices; trainModel
# writes the report, saves the model and prints the test accuracy.
for idx, clf in enumerate(linear_model):
    clf_name = linear_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_11' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_11,
        labels=y_data + y_test,
        train_data=X_data_tfidf_11,
        train_labels=y_data,
        test_data=X_test_tfidf_11,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

RidgeClassifier
Testing data accuracy:  0.9112421336827269
SGDClassifier




Testing data accuracy:  0.9147162170210231
Perceptron




Testing data accuracy:  0.8914299327020427
PassiveAggressiveClassifier




Testing data accuracy:  0.9066166398665952
LogisticRegression




Testing data accuracy:  0.9152720703551506


## Naive Bayes

In [49]:
# Train every naive Bayes classifier on the unigram TF-IDF matrices.
for idx, clf in enumerate(naive_bayes_model):
    clf_name = naive_bayes_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_11' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_11,
        labels=y_data + y_test,
        train_data=X_data_tfidf_11,
        train_labels=y_data,
        test_data=X_test_tfidf_11,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

BernoulliNB
Testing data accuracy:  0.8687789093363508
ComplementNB
Testing data accuracy:  0.8810870903063149
MultinomialNB
Testing data accuracy:  0.8906358565104322


## Support Vector Machine

In [14]:
# Train every SVM classifier on the unigram TF-IDF matrices.
for idx, clf in enumerate(SVM_model):
    clf_name = SVM_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_11' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_11,
        labels=y_data + y_test,
        train_data=X_data_tfidf_11,
        train_labels=y_data,
        test_data=X_test_tfidf_11,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LinearSVC
Testing data accuracy:  0.9160264427371806


## Ensemble

In [16]:
# Train every ensemble classifier on the unigram TF-IDF matrices.
for idx, clf in enumerate(ensemble_model):
    clf_name = ensemble_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_11' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_11,
        labels=y_data + y_test,
        train_data=X_data_tfidf_11,
        train_labels=y_data,
        test_data=X_test_tfidf_11,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

RandomForestClassifier




Testing data accuracy:  0.8214519683163599


## Boosting Models

In [22]:
# Train every boosting classifier on the unigram TF-IDF matrices.
for idx, clf in enumerate(xgboost_model):
    clf_name = xgboost_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_11' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_11,
        labels=y_data + y_test,
        train_data=X_data_tfidf_11,
        train_labels=y_data,
        test_data=X_test_tfidf_11,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

XGBClassifier
Testing data accuracy:  0.8963532050900284


# Finding features

Choose 2 models: LogisticRegression (0.9152720703551506) and LinearSVC (0.9160264427371806). Test these 2 models with bigrams.

In [10]:
# Fresh default-configured instances of the two classifiers kept for the
# feature search; their display names are the estimator class names.
choosed_model = [LogisticRegression(), LinearSVC()]
choosed_model_names = [type(clf).__name__ for clf in choosed_model]

In [14]:
# Bigram-only TF-IDF matrices with max_features=10000.
_, X_data_tfidf_22, X_test_tfidf_22, X_tfidf_22 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(2, 2),
    max_features=10000,
)

In [16]:
# Train the two chosen classifiers on the current bigram matrices.
# NOTE: every bigram run uses the same '<name>_tfidf_22' tag, so the report
# and saved model from a previous max_features setting are overwritten.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_22' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_22,
        labels=y_data + y_test,
        train_data=X_data_tfidf_22,
        train_labels=y_data,
        test_data=X_test_tfidf_22,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression




Testing data accuracy:  0.8516268635975622
LinearSVC
Testing data accuracy:  0.8472792964484942


More features are needed because bigrams have many more possible combinations.

In [17]:
# Bigram-only TF-IDF matrices with max_features=20000.
_, X_data_tfidf_22, X_test_tfidf_22, X_tfidf_22 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(2, 2),
    max_features=20000,
)

In [19]:
# Train the two chosen classifiers on the current bigram matrices.
# NOTE: every bigram run uses the same '<name>_tfidf_22' tag, so the report
# and saved model from a previous max_features setting are overwritten.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_22' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_22,
        labels=y_data + y_test,
        train_data=X_data_tfidf_22,
        train_labels=y_data,
        test_data=X_test_tfidf_22,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression
Testing data accuracy:  0.8626843745657395
LinearSVC
Testing data accuracy:  0.8625255593274175


In [20]:
# Bigram-only TF-IDF matrices with max_features=50000.
_, X_data_tfidf_22, X_test_tfidf_22, X_tfidf_22 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(2, 2),
    max_features=50000,
)

In [21]:
# Train the two chosen classifiers on the current bigram matrices.
# NOTE: every bigram run uses the same '<name>_tfidf_22' tag, so the report
# and saved model from a previous max_features setting are overwritten.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_22' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_22,
        labels=y_data + y_test,
        train_data=X_data_tfidf_22,
        train_labels=y_data,
        test_data=X_test_tfidf_22,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression
Testing data accuracy:  0.869612689337542
LinearSVC
Testing data accuracy:  0.8785063426835805


In [22]:
# Bigram-only TF-IDF matrices with max_features=70000.
_, X_data_tfidf_22, X_test_tfidf_22, X_tfidf_22 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(2, 2),
    max_features=70000,
)

In [23]:
# Train the two chosen classifiers on the current bigram matrices.
# NOTE: every bigram run uses the same '<name>_tfidf_22' tag, so the report
# and saved model from a previous max_features setting are overwritten.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_22' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_22,
        labels=y_data + y_test,
        train_data=X_data_tfidf_22,
        train_labels=y_data,
        test_data=X_test_tfidf_22,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression
Testing data accuracy:  0.8704067655291525
LinearSVC
Testing data accuracy:  0.8825561312607945


In [24]:
# Bigram-only TF-IDF matrices with max_features=100000.
_, X_data_tfidf_22, X_test_tfidf_22, X_tfidf_22 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(2, 2),
    max_features=100000,
)

In [25]:
# Train the two chosen classifiers on the current bigram matrices.
# NOTE: every bigram run uses the same '<name>_tfidf_22' tag, so the report
# and saved model from a previous max_features setting are overwritten.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_22' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_22,
        labels=y_data + y_test,
        train_data=X_data_tfidf_22,
        train_labels=y_data,
        test_data=X_test_tfidf_22,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression
Testing data accuracy:  0.8705457288626843
LinearSVC
Testing data accuracy:  0.8864272526948961


Combining unigram and bigram

In [26]:
# Combined unigram + bigram TF-IDF matrices with max_features=20000.
_, X_data_tfidf_12, X_test_tfidf_12, X_tfidf_12 = createVectorizerData(
    X_data,
    X_test,
    stop_words,
    ngram_range=(1, 2),
    max_features=20000,
)

In [27]:
# Train the two chosen classifiers on the unigram + bigram matrices.
for idx, clf in enumerate(choosed_model):
    clf_name = choosed_model_names[idx]
    print(clf_name)
    run_tag = '%s_tfidf_12' % clf_name
    trainModel(
        classifier=clf,
        data=X_tfidf_12,
        labels=y_data + y_test,
        train_data=X_data_tfidf_12,
        train_labels=y_data,
        test_data=X_test_tfidf_12,
        test_labels=y_test,
        stats_name=run_tag,
        model_name=run_tag,
    )

LogisticRegression
Testing data accuracy:  0.9155698489270045
LinearSVC
Testing data accuracy:  0.9178131141683045


Slight improvement.

In [62]:
def findMaxFeatures(max_features):
    """Evaluate the chosen models on unigram+bigram TF-IDF for one vocabulary size.

    Builds the TF-IDF matrices with ``ngram_range=(1, 2)``, pickles the fitted
    vectorizer to ``features/vectorizer_tfidf_12_<max_features>``, then runs
    ``trainModel`` for every entry of ``choosed_model``.

    Args:
        max_features: vocabulary size cap passed to the vectorizer; an int, or
            ``None`` for no cap.

    Returns:
        None.
    """
    print('\n Max Features: ', max_features)
    # build vectorizer object for tfidf features
    (vectorizer, X_data_tfidf_12, X_test_tfidf_12, X_tfidf_12) = createVectorizerData(
        X_data, X_test, stop_words, ngram_range=(1, 2), max_features=max_features)

    # save information about features for future optimization
    os.makedirs('features', exist_ok=True)
    # '%s' renders integers exactly like '%d' did, and also accepts
    # max_features=None, which '%d' rejects with a TypeError.
    with open("features/vectorizer_tfidf_12_%s" % max_features, "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # train chosen models and print accuracy on test set
    for i, model in enumerate(choosed_model):
        print(choosed_model_names[i])
        run_tag = '%s_tfidf_12_%s' % (choosed_model_names[i], max_features)
        trainModel(classifier = model
                   , data=X_tfidf_12, labels = y_data + y_test
                   , train_data = X_data_tfidf_12, train_labels = y_data
                   , test_data = X_test_tfidf_12, test_labels = y_test
                   , stats_name = run_tag
                   , model_name = run_tag)

In [12]:
# Sweep the vocabulary cap from 10k to 90k in steps of 20k.
for feature_cap in [10000, 30000, 50000, 70000, 90000]:
    findMaxFeatures(feature_cap)


 Max Features:  10000
LogisticRegression




Testing data accuracy:  0.915450737498263
LinearSVC
Testing data accuracy:  0.9159271832132293

 Max Features:  30000
LogisticRegression
Testing data accuracy:  0.9151728108311993
LinearSVC
Testing data accuracy:  0.9187461536934469

 Max Features:  50000
LogisticRegression
Testing data accuracy:  0.9153713298791019
LinearSVC
Testing data accuracy:  0.9190836360748814

 Max Features:  70000
LogisticRegression
Testing data accuracy:  0.9150934032120382
LinearSVC
Testing data accuracy:  0.9193814146467354

 Max Features:  90000
LogisticRegression
Testing data accuracy:  0.9149742917832966
LinearSVC
Testing data accuracy:  0.9197387489329601


    Choosing LinearSVC as the main model.

In [14]:
# Larger vocabulary caps.
# NOTE(review): 30000 already appears in the earlier 10k-90k sweep; 300000 may
# have been intended here — confirm.
for feature_cap in [100000, 200000, 30000]:
    findMaxFeatures(feature_cap)


 Max Features:  100000
Testing data accuracy:  0.9197784527425407

 Max Features:  200000
Testing data accuracy:  0.9201357870287654

 Max Features:  30000
Testing data accuracy:  0.9187461536934469


In [15]:
# Very large vocabulary caps: 300k, 500k and 1M features.
for feature_cap in [300000, 500000, 1000000]:
    findMaxFeatures(feature_cap)


 Max Features:  300000
Testing data accuracy:  0.9204931213149902

 Max Features:  500000
Testing data accuracy:  0.9205328251245707

 Max Features:  1000000
Testing data accuracy:  0.9210291227443274


In [23]:
# max_features=None: no cap, i.e. the maximum vocabulary of the corpus
findMaxFeatures(max_features=None)


 Max Features:  None
Testing data accuracy:  0.9208703075060052


    Some optimizations: collect more data, and find the optimal value for the number of features.

In [106]:
def findMaxFeatures(max_features):
    """Evaluate a tuned LinearSVC on unigram+bigram TF-IDF for one vocabulary size.

    NOTE: this redefinition replaces the earlier ``findMaxFeatures`` (which
    looped over ``choosed_model``) for every cell executed after it.

    Builds the TF-IDF matrices with ``ngram_range=(1, 2)``, pickles the fitted
    vectorizer to ``features/vectorizer_tfidf_12___<max_features>``, then
    trains and reports a single L1-penalised ``LinearSVC`` via ``trainModel``.

    Args:
        max_features: vocabulary size cap passed to the vectorizer; an int, or
            ``None`` for no cap.

    Returns:
        None.
    """
    print('\n Max Features: ', max_features)
    # build vectorizer object for tfidf features
    (vectorizer, X_data_tfidf_12, X_test_tfidf_12, X_tfidf_12) = createVectorizerData(
        X_data, X_test, stop_words=stop_words, ngram_range=(1, 2), max_features=max_features)

    # save information about features for future optimization
    os.makedirs('features', exist_ok=True)
    with open("features/vectorizer_tfidf_12___{}".format(max_features), "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Original note: L1 works better for sparse data, L2 for non-sparse cases.
    # 'squared_hinge' is the current name of the loss formerly spelled 'l2';
    # the legacy alias is rejected by current scikit-learn releases.
    run_tag = 'LinearSVC_tfidf_12___{}'.format(max_features)
    trainModel(classifier = LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                                      max_iter=2000, tol=1e-05, class_weight='balanced')
                , data=X_tfidf_12, labels = y_data + y_test
                , train_data = X_data_tfidf_12, train_labels = y_data
                , test_data = X_test_tfidf_12, test_labels = y_test
                , stats_name = run_tag
                , model_name = run_tag)

In [102]:
# L1-penalised LinearSVC with balanced class weights. 'squared_hinge' is the
# current name of the loss formerly spelled 'l2'; the legacy alias is rejected
# by current scikit-learn releases.
classifier = LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                       max_iter=2000, tol=1e-05, class_weight='balanced')

In [None]:
# Evaluate with the vocabulary capped at 50,000 features.
findMaxFeatures(max_features=50000)


 Max Features:  50000




In [43]:
# Evaluate with the vocabulary capped at 100,000 features.
findMaxFeatures(max_features=100000)


 Max Features:  100000
Testing data accuracy:  0.9198578603617017


In [44]:
# Evaluate with the vocabulary capped at 200,000 features.
findMaxFeatures(max_features=200000)


 Max Features:  200000
Testing data accuracy:  0.9201754908383459
