In [1]:
from pyvi import ViTokenizer, ViPosTagger # thư viện NLP tiếng Việt
from tqdm import tqdm
import numpy as np
import gensim # thư viện NLP
from pathlib import Path

In [2]:
import os 
dir_path = os.path.dirname(os.path.realpath(os.getcwd()))
dir_path = os.path.join(dir_path, 'Data')

def get_data(folder_path):
    X = []
    y = []
    dirs = os.listdir(folder_path)
    for path in tqdm(dirs):
        file_paths = os.listdir(os.path.join(folder_path, path))
        for file_path in tqdm(file_paths):
            with open(os.path.join(folder_path, path, file_path), 'r', encoding="utf-8") as f:
                lines = f.readlines()
                lines = ' '.join(lines)
                lines = gensim.utils.simple_preprocess(lines)

                lines = ' '.join(lines)
                lines = ViTokenizer.tokenize(lines)
                
                X.append(lines)
                y.append(path)

    return X, y

train_path = os.path.join(dir_path, 'D:\Private\DATA\DATA_LAW\TRAIN_FULL')
X_data, y_data = get_data(train_path)


100%|██████████| 256/256 [00:02<00:00, 88.30it/s]
100%|██████████| 180/180 [00:03<00:00, 46.01it/s]
100%|██████████| 67/67 [00:01<00:00, 49.99it/s]
100%|██████████| 238/238 [00:03<00:00, 69.63it/s]
100%|██████████| 545/545 [00:05<00:00, 93.39it/s]
100%|██████████| 474/474 [00:07<00:00, 63.10it/s]
100%|██████████| 154/154 [00:02<00:00, 66.20it/s]
100%|██████████| 350/350 [00:04<00:00, 80.04it/s]
100%|██████████| 8/8 [00:31<00:00,  3.96s/it]


In [3]:
test_path = os.path.join(dir_path, 'D:\Private\DATA\DATA_LAW\TEST_FULL')
X_test, y_test = get_data(test_path)

100%|██████████| 110/110 [00:01<00:00, 88.92it/s]
100%|██████████| 78/78 [00:01<00:00, 46.79it/s]
100%|██████████| 29/29 [00:00<00:00, 44.99it/s]
100%|██████████| 103/103 [00:01<00:00, 71.83it/s]
100%|██████████| 234/234 [00:02<00:00, 83.61it/s]
100%|██████████| 204/204 [00:03<00:00, 56.63it/s]
100%|██████████| 67/67 [00:01<00:00, 59.63it/s]
100%|██████████| 151/151 [00:01<00:00, 88.89it/s]
100%|██████████| 8/8 [00:14<00:00,  1.78s/it]


In [4]:
def save_to_file(data,file_path):
    with open(file_path,'w',encoding='utf-8') as f:
        for line in data:
            f.write(line + "\n")

save_to_file(X_data,'D://Private//DATA//saved_data//x_train_data.txt')
save_to_file(y_data, 'D://Private//DATA//saved_data//y_train_data.txt')


save_to_file(X_test,'D://Private//DATA//saved_data//x_test_data.txt')
save_to_file(y_test, 'D://Private//DATA//saved_data//y_test_data.txt')

In [5]:
import pickle

pickle.dump(X_data, open('D:/Private/DATA/saved_data/X_data.pkl', 'wb'))
pickle.dump(y_data, open('D:/Private/DATA/saved_data/y_data.pkl', 'wb'))

pickle.dump(X_test, open('D:/Private/DATA/saved_data/X_test.pkl', 'wb'))
pickle.dump(y_test, open('D:/Private/DATA/saved_data/y_test.pkl', 'wb'))

Feature Engineering

In [6]:
import pickle

X_data = pickle.load(open('D:/Private/DATA/saved_data/X_data.pkl', 'rb'))
y_data = pickle.load(open('D:/Private/DATA/saved_data/y_data.pkl', 'rb'))

X_test = pickle.load(open('D:/Private/DATA/saved_data/X_test.pkl', 'rb'))
y_test = pickle.load(open('D:/Private/DATA/saved_data/y_test.pkl', 'rb'))


WAY 1: Count Vectors as features

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_data)

# transform the training and validation data using count vectorizer object
X_data_count = count_vect.transform(X_data)
X_test_count = count_vect.transform(X_test)


WAY 2: 
Tf-Idf Vectors as Features

Word level

In [8]:
# word level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data) # learn vocabulary and idf from training set
X_data_tfidf =  tfidf_vect.transform(X_data)
# assume that we don't have test set before
X_test_tfidf =  tfidf_vect.transform(X_test)


In [29]:
X = list(X_test_tfidf)
X_array = np.array(X)
print(X_array.shape)

(976,)


Ngram_level

In [9]:
# ngram level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(2, 3))
tfidf_vect_ngram.fit(X_data)
X_data_tfidf_ngram =  tfidf_vect_ngram.transform(X_data)
# assume that we don't have test set before
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)


 SVD (singular value decomposition)

Word Level

In [10]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(X_data_tfidf)

X_data_tfidf_svd = svd.transform(X_data_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)


In [26]:
X = list(X_test_tfidf_svd)
X_array = np.array(X)
print(X_array.shape)

(976, 300)


N-gram Level

In [21]:
svd_ngram = TruncatedSVD(n_components=300, random_state=42)
svd_ngram.fit(X_data_tfidf_ngram)

X_data_tfidf_ngram_svd = svd_ngram.transform(X_data_tfidf_ngram)
X_test_tfidf_ngram_svd = svd_ngram.transform(X_test_tfidf_ngram)


In [24]:
X = list(X_test_tfidf_ngram_svd)
X_array = np.array(X)
print(X_array.shape)

(976, 300)


Label Encoder

In [12]:
from sklearn import preprocessing

In [13]:
encoder = preprocessing.LabelEncoder()
y_data_n = encoder.fit_transform(y_data)
y_test_n = encoder.fit_transform(y_test)

encoder.classes_ 

array(['dan_su', 'dat_dai', 'dau_thau', 'hang_hai', 'hinh_su', 'kinh_te',
       'lao_dong', 'to_tung_hinh_su'], dtype='<U15')

TRAIN MODEL

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from sklearn import metrics



In [15]:
def train_model(classifier, X_data, y_data, X_test, y_test, is_neuralnet=False, n_epochs=1000,verbose=10):       
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.1, random_state=42)
    
    if is_neuralnet:
        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs,verbose=verbose, batch_size=512)
        
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(X_train, y_train)
    
        train_predictions = classifier.predict(X_train)
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)

        
    print("Validation accuracy: ", metrics.accuracy_score(val_predictions, y_val))
    print("Test accuracy: ", metrics.accuracy_score(test_predictions, y_test))


NAIVE_BAYES

In [36]:
train_model(naive_bayes.MultinomialNB(), X_data_tfidf_ngram_svd, y_data, X_test_tfidf_ngram_svd, y_test, is_neuralnet=False)




ValueError: Negative values in data passed to MultinomialNB (input X)

LOGISTIC REGRESSION

In [16]:
print("LogisticRegression: ")

X_data, y_data, X_test, y_test = X_data_tfidf, y_data, X_test_tfidf, y_test
classifier = LogisticRegression()
model = train_model(classifier,X_data, y_data, X_test, y_test)


#Prediction
name_file = "bbbb.txt"
input_file = 'D:\\Private\\DATA\\Model\\' + name_file
with open(input_file,'r',encoding='utf-8') as f:
    text = f.read()
text = ViTokenizer.tokenize(text)
#input la doc []
text_to_vect = tfidf_vect.transform([text])
label = classifier.predict(text_to_vect)
print(f'name_file:{name_file}, with label:{label}')



LogisticRegression: 


Validation accuracy:  0.9118942731277533
Test accuracy:  0.8790983606557377
name_file:bbbb.txt, with label:['dau_thau']


NEURAL NETWORK

In [35]:

import tensorflow as tf
from tensorflow.keras.layers import Input, Reshape, LSTM, Dense,Bidirectional,GRU
from tensorflow.keras import models, optimizers


LSTM

In [18]:

def create_lstm_model():
    input_layer = Input(shape=(300,))
    
    layer = Reshape((10, 30))(input_layer)
    
    layer = LSTM(128, return_sequences=False)(layer)
    
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)
    
    output_layer = Dense(10, activation='softmax')(layer)
    
    classifier = models.Model(inputs=input_layer, outputs=output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier


In [32]:
print("LSTM: ")

X_data, y_data, X_test, y_test = X_data_tfidf_ngram_svd, y_data_n, X_test_tfidf_ngram_svd, y_test_n
classifier = create_lstm_model()
model = train_model(classifier,X_data, y_data, X_test, y_test,is_neuralnet=True)


LSTM: 
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72

In [None]:
#Prediction
name_file = "bbbb.txt"
input_file = 'D:\\Private\\DATA\\Model\\' + name_file
with open(input_file,'r',encoding='utf-8') as f:
    text = f.read()
text = ViTokenizer.tokenize(text)
text_to_vect = tfidf_vect.transform([text]).toarray()
text_to_svd_vect = svd.transform(text_to_vect)

label = classifier.predict(text_to_svd_vect)

class_names = encoder.classes_

predicted_class_index = np.argmax(label)
predicted_label = class_names[predicted_class_index]

print(f'Predicted label: {predicted_label}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted label: dau_thau


BRNN MODEL

In [None]:
def create_brnn_model():
    input_layer = Input(shape=(300,))
    
    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu'))(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)
    
    output_layer = Dense(10, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier


In [None]:
print("BRNN: ")

X_data, y_data, X_test, y_test = X_data_tfidf_svd, y_data_n, X_test_tfidf_svd, y_test_n
classifier = create_brnn_model()
model = train_model(classifier,X_data, y_data, X_test, y_test,is_neuralnet=True)



BRNN: 


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
#Prediction
name_file = "aaa.txt"
input_file = 'D:\\Private\\DATA\\Model\\' + name_file
with open(input_file,'r',encoding='utf-8') as f:
    text = f.read()
    
text = ViTokenizer.tokenize(text)
text_to_vect = tfidf_vect.transform([text]).toarray()
text_to_svd_vect = svd.transform(text_to_vect)

label = classifier.predict(text_to_svd_vect)

class_names = encoder.classes_

predicted_class_index = np.argmax(label)
predicted_label = class_names[predicted_class_index]

print(f'Predicted label: {predicted_label}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted label: kinh_te
