In [1]:
import os
import re

In [2]:
train_path = r"C:\Users\Nhat Tan\Downloads\Train_Full\Train_Full"
test_path = r"C:\Users\Nhat Tan\Downloads\Test_Full\Test_Full"

In [3]:
def read_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = [line.strip().lower() for line in file]
        return stopwords

In [4]:
stopwords_path = r"C:\Users\Nhat Tan\Downloads\vietnamese-stopwords.txt"
stopwords_list = read_stopwords(stopwords_path)

In [5]:
from underthesea import word_tokenize

def load_data(directory):
    data = []
    labels = [] 
    topics = os.listdir(directory)
    for topic in topics:
        topic_path = os.path.join(directory, topic)
        if os.path.isdir(topic_path): 
            for filename in os.listdir(topic_path):
                if filename.endswith(".txt"):
                    filepath = os.path.join(topic_path, filename)
                    with open(filepath, 'r', encoding='utf-16') as file:
                        text = file.read()
                        text = text.lower()
                        text = re.sub(r'[^\w\s]', '', text)

                        tokens = word_tokenize(text, format="text")
                        tokens = [word for word in tokens.split() if word not in stopwords_list]

                        text = " ".join(tokens)
                        data.append(text)
                        labels.append(topic)
    return data, labels

In [6]:
train_data, train_labels = load_data(train_path)

In [7]:
train_data[:2]

['thành_lập dự_án policy phòng_chống hivaids vn nlđ quỹ hỗ_trợ khẩn_cấp aids hoa_kỳ thành_lập dự_án policy vn cam_kết hỗ_trợ chính_phủ nhân_dân vn đối_phó hivaidsdự_án nhiệm_vụ cải_thiện công_tác phòng_chống hivaids thông_qua lĩnh_vực xây_dựng chính_sách rà_soát văn_bản pháp_luật xây_dựng chiến_lược quảng_bá xây_dựng chương_trình đào_tạo phòng_chống hivaids kế_hoạch bố_trí nguồn_lực huấn_luyện nghiên_cứu phương_tiện truyền_thông đại_chúng tổ_chức hoạt_động kỳ_thị phân_biệt đối_xử đối_với hivaids ttxvn dự_án policy đặc_biệt quan_tâm công_tác truyền_thông phòng_chống hivaids coi biện_pháp tích_cực hữu_hiệu phòng_chống hiệu_quả hivaids thời_gian dự_án policy tiếp_tục tổ_chức hoạt_động nâng nhận_thức trách_nhiệm công_tác chỉ_đạo phòng_chống hivaids',
 '16000 vịnh nha_trang trực_ban bộ_đội biên_phòng cảng du_lịch cầu đá vĩnh_nguyên lễ vừa_qua 16000 tham_quan vịnh nha_trang 734 du_khách nước_ngoài đông 304 vịnh nha_trang đón 7019 du_khách địa_chỉ vịnh nha_trang thu_hút số_lượng du_khách tham

In [8]:
from gensim.models import Word2Vec

sentences = [text.split() for text in train_data]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")

In [9]:
print(word2vec_model)

Word2Vec<vocab=251719, vector_size=100, alpha=0.025>


In [10]:
vector = word2vec_model.wv['vi_phạm']  # get numpy vector of a word
sims = word2vec_model.wv.most_similar('vi_phạm', topn=10)
sims

[('xử_phạt', 0.8153647184371948),
 ('chấp_hành', 0.7613055109977722),
 ('chế_tài', 0.7612482309341431),
 ('quy_định', 0.7295352816581726),
 ('tái_phạm', 0.7239127159118652),
 ('hành_chính', 0.7046078443527222),
 ('nghiêm', 0.7013803720474243),
 ('nghiêm_cấm', 0.6990323066711426),
 ('nội_quy', 0.6943936944007874),
 ('quy_chế', 0.6791990995407104)]

In [11]:
test_data, test_labels = load_data(test_path)

In [15]:
print(test_data[:2])

['mạo_hiểm rừng đa_mi hành_quân khám_phá thác sương_mù ẩn cánh rừng đa_mi hàm_thuận bắc bình_thuận bạt_ngàn hồ thủy điện đa_mi đẹp nàng công_chúa thác sương_mù hùng_vĩ ngất_uốn_lượn rồng_bạc khổng_lồ xuyên rừng thác sương_mù trời chiều chúng_tôi hồ thủy điện đa_mi rộng 625 ha hòn đảo đẹp tranh thủy_mạc dần hiện ánh hoàng_hôn huyền_ảo lưu_trú đêm đầu_tiên chúng_tôi khách_sạn ngàn sân_bay dã_chiến nhà_máy thủy_điện tất_cả dịch_vụ lo thủy_điện hạ_trại điện đêm bốn bề tối đen mực lửa bùng bất_ngờ hoài nhưỡng bí_thư chi_đoàn xã đa_mi thanh_niên xã 20 km đường đèo thăm đầu_tiên đoàn thành_phố đêm bọn rủ đi liền lời_ca_tiếng hát vang tiếng đàn guitar chập_chùng đêm rừng ấm lẫn chủ tầng 1 thác sương_mù trời tờ_mờ tiếng chim hót véo_von gọi thức_giấc đi tắm hồ chuẩn_bị sức_khỏe hành_trình xuyên rừng chạy khởi_động đôi chân hành_lý gọn_nhẹ thông_báo chuẩn_bị khám_phá thác sương_mù hành_quân xuyên rừng thác 5 km đường_mòn thử sức đôi chân ngó vực sâu hun_hút toán hướng_dẫn băng đường lối chân thá

In [16]:
def text_to_vector(text, model):
    # Tokenize the text (ensure that your text is tokenized already)
    tokens = text.split()
    
    # Initialize a list to store the word vectors
    word_vectors = []
    
    for word in tokens:
        if word in model.wv:  # Check if word exists in the Word2Vec vocabulary
            word_vectors.append(model.wv[word])
    
    # If there are valid word vectors, return their average, otherwise return zeros
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [18]:
import numpy as np
vector_size = 100

# Vectorize train and test data
X_train = np.array([text_to_vector(text, word2vec_model) for text in train_data])
X_test = np.array([text_to_vector(text, word2vec_model) for text in test_data])

In [19]:
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [21]:
print(X_train[:2])
print(y_train[:2])

[[ 1.2631034   0.42681104 -0.05936543  0.6502154   1.5135636   0.02773987
   1.6333667   0.8016361   0.6094308   0.52670956 -0.31807047 -0.05158868
   0.07444967 -0.14723514  0.25994346 -1.089825    0.33868372 -1.2496867
   0.10198549 -0.56799126  0.28142288  0.10288019 -0.14726385  0.12671436
  -0.4844395  -0.602833   -0.58769244 -1.863195   -0.5812434   0.36238593
   0.60639846 -0.38305143 -0.48639616 -0.3122501   0.20691638  0.5860157
  -1.5436785   0.2650565   0.19334495 -0.42768988 -0.07176334  0.25877407
  -0.10819094 -0.8552596   1.4797612  -0.66410786 -0.854403   -0.45330542
  -0.49956486  0.13480034 -0.25636047  0.35741636  1.23071     0.7874002
   0.50709975 -0.59995055 -1.1149788  -0.02450802 -1.5195967  -0.48704642
   0.80567396  0.32381162  0.5210639  -1.5269195  -1.4085809  -0.48561797
   0.72158647 -0.3038811  -0.5273573  -0.70733947  0.08684669 -0.02610252
  -0.03391518 -0.21233332 -0.15124255 -0.37510225 -0.12549481  0.00438308
  -0.29981154  0.5537305   0.5940088   0.

In [35]:
X_train_combined = np.concatenate((X_train, X_val), axis=0)
y_train_combined = np.concatenate((y_train, y_val), axis=0)

print(X_train_combined.shape)
print(y_train_combined.shape)

(33759, 100)
(33759,)


In [36]:
from sklearn.metrics import accuracy_score, classification_report
def Report(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [37]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test)

In [38]:
print(X_train[:2])
print(y_train[:2])

[[ 0.20713012  0.6892172  -0.04107564 -0.22420135  0.6910475  -0.4502266
   0.20160027  0.5894731   0.2698518   0.02230131 -0.30387297 -0.46725643
   0.21648869  0.18691823  0.5731317  -0.08401296  0.5783362  -0.23982118
   0.34852862 -0.76917875 -0.45552522  0.3750897  -0.22753935  0.6453554
  -0.34341934  0.1725434  -0.34902897 -0.18481028 -0.10661989  0.132446
   0.63435525 -0.25165302  0.6192655  -0.31550348 -0.26817256  0.4385681
  -0.32351488 -0.28222266 -0.17526044 -0.09072046 -0.10484883  0.46490052
   0.37714607  0.40674     0.44938275 -0.06794545 -0.17081742  0.5382455
  -0.46342427  0.42512164  0.6207291   0.29231444  0.05373027 -0.02141318
   0.39828336 -0.20859756 -0.24312052  0.31296563 -0.1998789  -0.00367341
   0.04063943  0.5368873   0.09381363  0.41265804 -0.8136403   0.647963
  -0.16768578 -0.769532   -0.07618523  0.2487806  -0.647232   -0.07670046
   0.12940708 -0.98855734 -0.7496544   0.4267364   0.07831406  0.5990688
  -0.6073192   0.43093142  0.00465997  0.112291

In [39]:
from sklearn.naive_bayes import GaussianNB

NB_model = GaussianNB()
NB_model.fit(X_train_scaled, y_train_combined)

In [45]:
y_pred = NB_model.predict(X_test_scaled)
Report(y_test, y_pred)

Accuracy: 0.7911579616064162
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.72      0.72      7567
           1       0.44      0.64      0.52      2036
           2       0.59      0.63      0.61      2096
           3       0.74      0.84      0.79      5276
           4       0.79      0.86      0.82      3788
           5       0.92      0.77      0.84      5417
           6       0.85      0.81      0.83      6716
           7       0.98      0.84      0.90      6667
           8       0.80      0.85      0.82      6250
           9       0.85      0.79      0.82      4560

    accuracy                           0.79     50373
   macro avg       0.77      0.77      0.77     50373
weighted avg       0.81      0.79      0.80     50373



In [41]:
import tensorflow as tf
print(tf.__version__)

2.6.0


In [42]:
from keras.layers import SimpleRNN, Dense, Embedding
from keras.models import Sequential

In [43]:
device = tf.config.list_physical_devices('GPU')[0]  
tf.config.set_visible_devices(device, 'GPU')
device

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

In [44]:
word2vec_model.wv.vector_size

100

In [76]:
print(X_train_scaled.shape[0],X_train_scaled.shape[1])
print(y_train_combined.shape)

33759 100
(33759,)


In [77]:
RNN_model = Sequential([
    SimpleRNN(512, return_sequences=False, input_shape=(1,100)),
    Dense(512, activation='relu'),
    # Dense Layer cho phân loại nhiều lớp
    Dense(len(set(y_train_combined)), activation='softmax')  
])

RNN_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [78]:
RNN_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_8 (SimpleRNN)     (None, 512)               313856    
_________________________________________________________________
dense_14 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_15 (Dense)             (None, 10)                5130      
Total params: 581,642
Trainable params: 581,642
Non-trainable params: 0
_________________________________________________________________


In [80]:
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))

In [81]:
history = RNN_model.fit(X_train_reshaped, y_train_combined, epochs=5, batch_size=32, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [94]:
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, 100))

In [95]:
print(X_test_reshaped.shape)

(50373, 1, 100)


In [96]:
y_pred = RNN_model.predict(X_test_reshaped)
y_pred = np.argmax(y_pred, axis=1)
Report(y_test, y_pred)

Accuracy: 0.8728684017231453
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83      7567
           1       0.74      0.45      0.56      2036
           2       0.70      0.78      0.74      2096
           3       0.87      0.86      0.87      5276
           4       0.80      0.93      0.86      3788
           5       0.93      0.87      0.90      5417
           6       0.90      0.90      0.90      6716
           7       0.93      0.98      0.95      6667
           8       0.91      0.90      0.91      6250
           9       0.93      0.88      0.90      4560

    accuracy                           0.87     50373
   macro avg       0.85      0.84      0.84     50373
weighted avg       0.87      0.87      0.87     50373



In [115]:
from keras.layers import LSTM, Dropout

In [116]:
LSTM_model = Sequential([
    LSTM(256, input_shape=(1,100), activation='relu', return_sequences=True), 
    Dropout(0.2),
    LSTM(128, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(set(y_train_combined)), activation='softmax')  
])

LSTM_model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(LSTM_model.summary())

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 1, 256)            365568    
_________________________________________________________________
dropout_12 (Dropout)         (None, 1, 256)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_14 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 10)              

In [117]:
from tensorflow.keras.utils import to_categorical

y_train_categorized = to_categorical(y_train_combined, num_classes=10)

In [118]:
history = LSTM_model.fit(X_train_reshaped, y_train_categorized, epochs=5, batch_size=32, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [119]:
y_pred = LSTM_model.predict(X_test_reshaped)
y_pred = np.argmax(y_pred, axis=1)
Report(y_test, y_pred)

Accuracy: 0.8871220693625553
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84      7567
           1       0.73      0.58      0.65      2036
           2       0.72      0.79      0.75      2096
           3       0.92      0.81      0.87      5276
           4       0.84      0.94      0.89      3788
           5       0.93      0.90      0.91      5417
           6       0.93      0.91      0.92      6716
           7       0.97      0.97      0.97      6667
           8       0.90      0.92      0.91      6250
           9       0.89      0.93      0.91      4560

    accuracy                           0.89     50373
   macro avg       0.87      0.86      0.86     50373
weighted avg       0.89      0.89      0.89     50373



In [127]:
from keras.layers import GRU
from tensorflow.keras.optimizers import SGD

In [133]:
GRU_model = Sequential([
    GRU(units=256, return_sequences=True, input_shape=(1,100), activation='tanh'),
    Dropout(0.2),
    GRU(units=128, activation='tanh'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(set(y_train_combined)), activation='softmax')  
])
GRU_model.compile(optimizer=SGD(lr=0.01, decay=1e-7, momentum=0.9, nesterov=False),loss='mean_squared_error')
print(GRU_model.summary())

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_14 (GRU)                 (None, 1, 256)            274944    
_________________________________________________________________
dropout_36 (Dropout)         (None, 1, 256)            0         
_________________________________________________________________
gru_15 (GRU)                 (None, 128)               148224    
_________________________________________________________________
dropout_37 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_38 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_41 (Dense)             (None, 10)              

In [134]:
GRU_model.fit(X_train_reshaped, y_train_categorized, epochs=5, batch_size=32, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fc5c7ed400>

In [135]:
y_pred = GRU_model.predict(X_test_reshaped)
y_pred = np.argmax(y_pred, axis=1)
Report(y_test, y_pred)

Accuracy: 0.817560994977468
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.81      0.77      7567
           1       0.49      0.66      0.56      2036
           2       0.82      0.35      0.49      2096
           3       0.83      0.76      0.79      5276
           4       0.81      0.89      0.85      3788
           5       0.86      0.87      0.86      5417
           6       0.87      0.86      0.86      6716
           7       0.94      0.93      0.93      6667
           8       0.84      0.82      0.83      6250
           9       0.82      0.84      0.83      4560

    accuracy                           0.82     50373
   macro avg       0.80      0.78      0.78     50373
weighted avg       0.82      0.82      0.82     50373

