In [1]:
import py_vncorenlp
import torch
from transformers import AutoModel, AutoTokenizer

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.layers import *

from tqdm import tqdm
import numpy as np
import os

phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
# Automatically download VnCoreNLP components from the original repository
# and save them in some local machine folder
py_vncorenlp.download_model(save_dir='/home/tlukay/thuattoanthongminh/source/vncorenlp')
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/tlukay/thuattoanthongminh/source/vncorenlp')

AlreadyExistsError: Another metric with the same name already exists.

In [2]:
class FileReader(object):
    def __init__(self, filePath, encoder = None):
        self.filePath = filePath
        self.encoder = encoder if encoder != None else 'utf-16le'

    def read_stopwords(self):
        with open(self.filePath, 'r') as f:
            stopwords = set([w.strip().replace(' ', '_') for w in f.readlines()])
        return stopwords


<h2> Data Preprocessing </h2>

In [3]:
import settings
import re
class NLP(object):
    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        self.stopwords = FileReader(settings.STOP_WORDS).read_stopwords()

    def segmentation(self):
        return rdrsegmenter.word_segment(self.text)

    def split_words(self):
        text = self.segmentation()
        text = ' '.join(text)
        try:
            return [x.strip(settings.SPECIAL_CHARACTER).lower() for x in text.split()]
        except TypeError:
            return []

    def standardize_data(self):
        self.text = self.text.replace('\n',' ').lower().strip()
        self.text = re.sub(r"[\,\?]+$-()!*=._", "", self.text)
        self.text = self.text.replace(",", " ") \
            .replace(";", " ").replace("“", " ") \
            .replace(":", " ").replace("”", " ") \
            .replace('"', " ").replace("'", " ") \
            .replace("!", " ").replace("?", " ") \
            .replace("-", " ").replace("*", " ")\
            .replace("=", " ").replace("(", " ")\
            .replace(")", " ").replace("_", " ").replace(".", " ")
        self.text = self.text.strip().lower()
        return self.text      

    def get_words_feature(self):
        split_words = self.standardize_data()
        split_words = self.split_words()
        
        return [word for word in split_words if word not in self.stopwords]

In [4]:
dir_path = os.path.dirname(os.path.realpath(os.getcwd()))
dir_path = os.path.join(dir_path, 'Data')

# Load data from dataset folder
def get_data(folder_path):
    X = []
    y = []
    dirs = os.listdir(folder_path)
    for path in dirs:
        file_paths = os.listdir(os.path.join(folder_path, path))
        for file_path in tqdm(file_paths):
            with open(os.path.join(folder_path, path, file_path), 'r', encoding="utf-16") as f:
                lines = f.readlines()
                lines = ' '.join(lines)
                lines = NLP(text = lines).get_words_feature()
                lines = ' '.join(lines)
                X.append(lines)
                y.append(path)
                
    return X, y

train_path = os.path.join(dir_path, '/home/tlukay/thuattoanthongminh/source/data/10_cate/train')
X_data, y_data = get_data(train_path)


 19%|█▉        | 983/5219 [00:10<00:36, 117.31it/s]

KeyboardInterrupt: 

In [None]:
import pickle

pickle.dump(X_data, open('/home/tlukay/thuattoanthongminh/source/data/X_data.pkl', 'wb'))
pickle.dump(y_data, open('/home/tlukay/thuattoanthongminh/source/data/y_data.pkl', 'wb'))


In [None]:
test_path = os.path.join(dir_path, '/home/tlukay/thuattoanthongminh/source/data/10_cate/test')
X_test, y_test = get_data(test_path)

100%|██████████| 7567/7567 [00:55<00:00, 137.35it/s]
100%|██████████| 4560/4560 [00:30<00:00, 149.28it/s]
100%|██████████| 6716/6716 [00:41<00:00, 162.05it/s]
100%|██████████| 5417/5417 [00:43<00:00, 123.68it/s]
100%|██████████| 2096/2096 [00:16<00:00, 124.13it/s]
100%|██████████| 2036/2036 [00:21<00:00, 93.25it/s] 
100%|██████████| 6667/6667 [01:10<00:00, 94.03it/s] 
100%|██████████| 6250/6250 [00:50<00:00, 122.84it/s]
100%|██████████| 5276/5276 [00:42<00:00, 124.72it/s]
100%|██████████| 3788/3788 [00:29<00:00, 127.77it/s]


In [8]:
pickle.dump(X_test, open('/home/tlukay/thuattoanthongminh/source/data/X_test.pkl', 'wb'))
pickle.dump(y_test, open('/home/tlukay/thuattoanthongminh/source/data/y_test.pkl', 'wb'))



<h2> Feature Engineering </h2>

In this step, raw text data will be transformed into eature vectors and new features will be created using the existing dataset. We will implement some idea as follows:

1. Count Vectors as features
2. TF-IDF Vectors as features

    2.1 Word level

    2.2. N-Gram level

    2.3. Character level
    
3. Word Embeddings as features
4. Text / NLP based features
5. Topic Models as features

In [5]:
import pickle

X_data = pickle.load(open('/home/tlukay/thuattoanthongminh/source/data/X_data.pkl', 'rb'))
y_data = pickle.load(open('/home/tlukay/thuattoanthongminh/source/data/y_data.pkl', 'rb'))

X_test = pickle.load(open('/home/tlukay/thuattoanthongminh/source/data/X_test.pkl', 'rb'))
y_test = pickle.load(open('/home/tlukay/thuattoanthongminh/source/data/y_test.pkl', 'rb'))


<h3>Count Vectors as features</h3>

Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.


In [7]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_data)

# transform the training and validation data using count vectorizer object
X_data_count = count_vect.transform(X_data)
X_test_count = count_vect.transform(X_test)


<h3>TF-IDF Vectors</h3>

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)

a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents

b. N-gram Level TF-IDF : N-grams are the combination of N terms together. This Matrix representing tf-idf scores of N-grams

c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus


In [8]:
# word level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data) # learn vocabulary and idf from training set
X_data_tfidf =  tfidf_vect.transform(X_data)
# assume that we don't have test set before
X_test_tfidf =  tfidf_vect.transform(X_test)

In [9]:
tfidf_vect.get_feature_names()

['_kiều',
 '_mai_công',
 '_sỹ',
 'a_dua',
 'a_gia',
 'a_giao',
 'a_vương',
 'aa',
 'aaa',
 'aac',
 'aachen',
 'aaron',
 'aas',
 'ab',
 'aba',
 'abashidze',
 'abba',
 'abbas',
 'abbey',
 'abbiati',
 'abbondanzieri',
 'abbott',
 'abc',
 'abd',
 'abdel',
 'abdelrahim',
 'abdoulaye',
 'abdul',
 'abdulaziz',
 'abdullah',
 'abe',
 'abel',
 'aberdeen',
 'abeyie',
 'abf',
 'abidjan',
 'abkhazia',
 'able2extract',
 'abn',
 'about',
 'abqaiq',
 'abraham',
 'abramoff',
 'abramovich',
 'abs',
 'abtc',
 'abu',
 'ac',
 'academy',
 'acasiete',
 'acb',
 'acbs',
 'accc',
 'accept',
 'access',
 'account',
 'accumbens',
 'ace',
 'aceh',
 'acer',
 'acetaminophen',
 'achilefu',
 'achilles',
 'acid',
 'acid_amin',
 'acid_béo',
 'acid_folic',
 'acl',
 'acm',
 'acoo',
 'acpe',
 'acrobat',
 'acronis',
 'acropolis',
 'acrylic',
 'act',
 'action',
 'active',
 'activex',
 'acuff',
 'acyclovir',
 'ad',
 'adam',
 'adams',
 'adan',
 'adani',
 'adapter',
 'adb',
 'add',
 'address',
 'addvote',
 'adebayor',
 'adel',
 

In [10]:
# ngram level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(2, 3))
tfidf_vect_ngram.fit(X_data)
X_data_tfidf_ngram =  tfidf_vect_ngram.transform(X_data)
# assume that we don't have test set before
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)

In [11]:
tfidf_vect_ngram.get_feature_names()

['abdul aziz',
 'abu ghraib',
 'abu musab',
 'abu musab al',
 'ac milan',
 'ac milan as',
 'ac milan inter',
 'ac milan juventus',
 'ac milan thắng',
 'ac milan thủ_môn',
 'ac milan đấu',
 'acb bình',
 'acb bình dương',
 'acb hà_nội',
 'acb slna',
 'acb slna bình',
 'acid uric',
 'acyclovir mg',
 'adrian mutu',
 'adriano inter',
 'adriano inter milan',
 'afc champions',
 'afc champions league',
 'agribank cup',
 'agu casmir',
 'ahmed korei',
 'ai_cập cổ',
 'ai_cập cổ_đại',
 'air france',
 'ajax amsterdam',
 'ajax tel',
 'ajax tel aviv',
 'al aqsa',
 'al ittihad',
 'al jazeera',
 'al jazeera phát',
 'al qaeda',
 'al qeada',
 'al salaam',
 'al sunna',
 'al zarqawi',
 'alam shah',
 'alan curbishley',
 'alan greenspan',
 'alan shearer',
 'alan shearer newcastle',
 'alan smith',
 'albert einstein',
 'albert luque',
 'alberto gilardino',
 'alberto gilardino parma',
 'album album',
 'album ca_khúc',
 'album ca_sĩ',
 'album hát',
 'album nhạc',
 'album phát_hành',
 'album ra_mắt',
 'album vol'

In [15]:
# ngram-char level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect_ngram_char = TfidfVectorizer(analyzer='char', max_features=30000, ngram_range=(2, 3))
tfidf_vect_ngram_char.fit(X_data)
X_data_tfidf_ngram_char =  tfidf_vect_ngram_char.transform(X_data)
# assume that we don't have test set before
X_test_tfidf_ngram_char =  tfidf_vect_ngram_char.transform(X_test)

Transform by SVD to decrease number of dimensions

In [12]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(X_data_tfidf)

: 

: 

In [None]:
X_data_tfidf_svd = svd.transform(X_data_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)

ngram Level

In [None]:
svd_ngram = TruncatedSVD(n_components=300, random_state=42)
svd_ngram.fit(X_data_tfidf_ngram)

In [None]:
X_data_tfidf_ngram_svd = svd_ngram.transform(X_data_tfidf_ngram)
X_test_tfidf_ngram_svd = svd_ngram.transform(X_test_tfidf_ngram)

ngram Char Level

In [None]:
svd_ngram_char = TruncatedSVD(n_components=300, random_state=42)
svd_ngram_char.fit(X_data_tfidf_ngram_char)

NameError: name 'TruncatedSVD' is not defined

In [None]:
X_data_tfidf_ngram_char_svd = svd_ngram_char.transform(X_data_tfidf_ngram_char)
X_test_tfidf_ngram_char_svd = svd_ngram_char.transform(X_test_tfidf_ngram_char)


<h3>Word Embeddings</h3>

We will convert each word in document to a embedding vector. We will use pretrained model for Vietnamese. The model can be downloaded from https://github.com/Kyubyong/wordvectors

Assume that, one document have
word, each word is represented by 300 dimensional vector, then the document vector be 2-dimensional matrix with size . From that, we can use DNN, RNN, CNN model for this type of data.

In [None]:
from gensim.models import KeyedVectors 
dir_path = os.path.dirname(os.path.realpath(os.getcwd()))
word2vec_model_path = os.path.join(dir_path, "Data/vi/vi.vec")

w2v = KeyedVectors.load_word2vec_format(word2vec_model_path)
vocab = w2v.wv.vocab
wv = w2v.wv

In [None]:
def get_word2vec_data(X):
    word2vec_data = []
    for x in X:
        sentence = []
        for word in x.split(" "):
            if word in vocab:
#                 print(word)
                sentence.append(wv[word])

        word2vec_data.append(sentence)
#         break
    return word2vec_data

X_data_w2v = get_word2vec_data(X_data)
X_test_w2v = get_word2vec_data(X_test)

<h3>Convert y to categorical</h3>

In [None]:
encoder = preprocessing.LabelEncoder()
y_data_n = encoder.fit_transform(y_data)
y_test_n = encoder.fit_transform(y_test)

In [None]:
encoder.classes_

<h2>Model</h2>
We will implement these models:

1. Naive Bayes Classifier
2. Linear Classifier
3. Support Vector Machine
4. Bagging Models
5. Boosting Models
6. Shallow Neural Networks
7. Deep Neural Networks
- Convolutional Neural Network (CNN)
- Long Short Term Modelr (LSTM)
- Gated Recurrent Unit (GRU)
- Bidirectional RNN
- Recurrent Convolutional Neural Network (RCNN)
- Other Variants of Deep Neural Networks
8. Doc2Vec model


In [None]:
from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.1, random_state=42)
    
    if is_neuralnet:
        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512)
        
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(X_train, y_train)
    
        train_predictions = classifier.predict(X_train)
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        
    print("Validation accuracy: ", metrics.accuracy_score(val_predictions, y_val))
    print("Test accuracy: ", metrics.accuracy_score(test_predictions, y_test))


<h2>Naive Bayes</h2>

In [None]:
train_model(naive_bayes.MultinomialNB(), X_data_tfidf, y_data, X_test_tfidf, y_test, is_neuralnet=False)

In [None]:
train_model(naive_bayes.MultinomialNB(), X_data_tfidf_ngram_svd, y_data, X_test_tfidf_ngram_svd, y_test, is_neuralnet=False)

In [None]:
train_model(naive_bayes.MultinomialNB(), X_data_tfidf_ngram_char_svd, y_data, X_test_tfidf_ngram_char_svd, y_test, is_neuralnet=False)

<h3>Other type Naive Bayes</h3>

In [None]:
train_model(naive_bayes.BernoulliNB(), X_data_tfidf, y_data, X_test_tfidf, y_test, is_neuralnet=False)

In [None]:
train_model(naive_bayes.BernoulliNB(), X_data_tfidf_svd, y_data, X_test_tfidf_svd, y_test, is_neuralnet=False)

<h3>Linear Classifier</h3>

In [None]:
train_model(linear_model.LogisticRegression(), X_data_tfidf, y_data, X_test_tfidf, y_test, is_neuralnet=False)

In [None]:
train_model(linear_model.LogisticRegression(), X_data_tfidf_svd, y_data, X_test_tfidf_svd, y_test, is_neuralnet=False)

<h3>SVM Model</h3>

In [None]:
train_model(svm.SVC(), X_data_tfidf_svd, y_data, X_test_tfidf_svd, y_test, is_neuralnet=False)

<h3>Bagging Model</h3>

In [None]:
train_model(ensemble.RandomForestClassifier(), X_data_tfidf_svd, y_data, X_test_tfidf_svd, y_test, is_neuralnet=False)

<h3>
Boosting Model</h3>

In [None]:
train_model(xgboost.XGBClassifier(), X_data_tfidf_svd, y_data, X_test_tfidf_svd, y_test, is_neuralnet=False)


<h3>
Deep Neural Network</h3>

In [None]:
from keras.layers import *
def create_dnn_model():
    input_layer = Input(shape=(300,))
    layer = Dense(1024, activation='relu')(input_layer)
    layer = Dense(1024, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    output_layer = Dense(10, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier


In [None]:

classifier = create_dnn_model()
train_model(classifier=classifier, X_data=X_data_tfidf_svd, y_data=y_data_n, X_test=X_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True)

<h3>Recurrent Neural Network</h3>
<h4>LSTM</h4>


In [None]:
def create_lstm_model():
    input_layer = Input(shape=(300,))
    
    layer = Reshape((10, 30))(input_layer)
    layer = LSTM(128, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)
    
    output_layer = Dense(10, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier


In [None]:
classifier = create_lstm_model()
train_model(classifier=classifier, X_data=X_data_tfidf_svd, y_data=y_data_n, X_test=X_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True)

<h4>
Bidirectional RNN</h4>

In [None]:
def create_brnn_model():
    input_layer = Input(shape=(300,))
    
    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu'))(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)
    
    output_layer = Dense(10, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier


In [None]:
classifier = create_brnn_model()
train_model(classifier=classifier, X_data=X_data_tfidf_svd, y_data=y_data_n, X_test=X_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=20)