# Download dataset

In [1]:
!git clone https://github.com/ThanhChinhBK/vietnews.git

Cloning into 'VNTC'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39 (from 1)[K
Receiving objects: 100% (39/39), 160.90 MiB | 39.17 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Updating files: 100% (15/15), done.
Filtering content: 100% (2/2), 168.95 MiB | 69.42 MiB/s, done.


In [2]:
!apt install unrar > /dev/null





In [3]:
# Unpack the training archive into data/train/. "x" extracts with the archived
# directory structure and "-inul" silences unrar's messages.
!unrar x -inul /kaggle/working/VNTC/Data/27Topics/Ver1.1/Train.rar /kaggle/working/data/train/

In [4]:
# Unpack the test archive into data/test/. "x" extracts with the archived
# directory structure and "-inul" silences unrar's messages.
!unrar x -inul /kaggle/working/VNTC/Data/27Topics/Ver1.1/Test.rar /kaggle/working/data/test/

In [5]:
!ls /kaggle/working/data/

test  train


In [6]:
!ls "/kaggle/working/data/train/new train/"

'Am nhac'	    'Duong vao WTO'	  'Lam dep'
'Am thuc'	    'Gia dinh'		  'Loi song'
'Bat dong san'	    'Giai tri tin hoc'	  'Mua sam'
'Bong da'	    'Giao duc'		  'My thuat'
'Chung khoan'	    'Gioi tinh'		  'San khau dien anh'
'Cum ga'	    'Hackers va Virus'	  'San pham tin hoc moi'
'Cuoc song do day'  'Hinh su'		   Tennis
'Du hoc'	    'Khong gian song'	  'The gioi tre'
'Du lich'	    'Kinh doanh quoc te'  'Thoi trang'


In [None]:
!ls "/kaggle/working/data/test/new test/"

In [None]:
!ls "/kaggle/working/data/train/new train/Am nhac" | head -3

# Import

In [None]:
!pip install -q underthesea

In [None]:
import os
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from underthesea import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import AutoModel, AutoTokenizer
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load and explore data

In [None]:
# Root folders of the extracted corpus; each one holds one sub-folder per category.
TRAIN_DIR = '/kaggle/working/data/train/new train/'
TEST_DIR = '/kaggle/working/data/test/new test/'

In [None]:
def _read_corpus(root_dir):
    """Read every document stored under ``root_dir``.

    ``root_dir`` is expected to contain one sub-directory per category, each
    holding UTF-16 encoded text files.

    Returns:
        (texts, labels): two parallel lists, where ``labels[i]`` is the name
        of the category directory that ``texts[i]`` was read from.
    """
    texts, labels = [], []
    # os.listdir() returns entries in arbitrary order; sorting both levels
    # keeps the sample order (and everything derived from it) reproducible.
    for cat in sorted(os.listdir(root_dir)):
        cat_path = os.path.join(root_dir, cat)
        if not os.path.isdir(cat_path):
            # Ignore stray files sitting next to the category folders.
            continue
        for text_file in sorted(os.listdir(cat_path)):
            # The context manager closes each handle right after reading, so
            # open file descriptors do not pile up across the corpus.
            with open(os.path.join(cat_path, text_file), 'r', encoding='utf-16') as f:
                texts.append(f.read())
            labels.append(cat)
    return texts, labels


X_train, y_train = _read_corpus(TRAIN_DIR)
X_test, y_test = _read_corpus(TEST_DIR)

In [None]:
X_train[0]

In [None]:
y_train[0]

In [None]:
print(len(X_train))
print(len(X_test))

In [None]:
# Count the training documents per label and draw one horizontal bar per label.
unique, counts = np.unique(y_train, return_counts=True)
fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(unique, counts)

# Drop the frame around the plot.
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)

# No tick marks on either axis, only padded tick labels.
for axis, pad in ((ax.xaxis, 5), (ax.yaxis, 10)):
    axis.set_ticks_position('none')
    axis.set_tick_params(pad=pad)

ax.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

# Flip the y axis so the first label is drawn at the top.
ax.invert_yaxis()

# Write each bar's count just past its right end.
for bar in ax.patches:
    width = bar.get_width()
    plt.text(width + 0.2, bar.get_y() + 0.5,
             str(round(width, 2)),
             fontsize=10, fontweight='bold',
             color='grey')

ax.set_title('Label distribution')
plt.show()

In [None]:
print(len(unique))

# Preprocessing data

In [None]:
# Patterns are compiled once at definition time and reused for every document.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# re.escape() keeps characters such as "\", "]" and "-" literal inside the
# character class; the raw f-string keeps "\d" a regex token instead of an
# invalid Python string escape.
_PUNCT_DIGIT_RE = re.compile(rf'[{re.escape(string.punctuation)}—℅\d]')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize(text):
    """Return a cleaned, lower-cased version of ``text``.

    Steps, in order:
      1. HTML tags are replaced by a space.
      2. ASCII punctuation, the characters "—" and "℅", and digits are
         replaced by a space.
      3. The text is lower-cased.
      4. Every run of whitespace (including newlines) collapses to a single
         space and leading/trailing whitespace is stripped.

    Removed characters are replaced by a space rather than deleted so that
    words separated only by punctuation or a tag (e.g. "a,b") are not glued
    into one token.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The normalized ``str`` (empty if nothing but removable characters
        was present).
    """
    text = _HTML_TAG_RE.sub(' ', text)
    text = _PUNCT_DIGIT_RE.sub(' ', text)
    text = text.lower()
    # \s+ also matches newlines, so no separate end-of-line pass is needed.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
X_train = [normalize(text) for text in X_train]
X_test = [normalize(text) for text in X_test]

In [None]:
X_train[0]

In [None]:
# tfidf = TfidfVectorizer(tokenizer=word_tokenize)
# X_train = tfidf.fit_transform(X_train)
# X_test = tfidf.transform(X_test)
# tfidf_features = tfidf.get_feature_names_out()
# print(tfidf_features)

In [None]:
X_train = [word_tokenize(doc, format='text') for doc in X_train]
X_test = [word_tokenize(doc, format='text') for doc in X_test]

In [None]:
X_train[0]

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only the model is moved to the device: the tokenizer is not a torch module
# and has no .to() method.
phobert = AutoModel.from_pretrained("vinai/phobert-base-v2").to(device)
phobert.eval()  # inference only: make sure dropout is disabled
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

def phobert_embedding(documents):
    """Embed each document as the mean of PhoBERT's last hidden states.

    Args:
        documents: iterable of strings (word-segmented text is what the
            notebook passes in).

    Returns:
        A list with one entry per document; each entry is a list of floats
        obtained by averaging ``last_hidden_state`` over the token axis.
        Documents are truncated to 256 tokens before encoding.
    """
    embeddings = []
    for doc in documents:
        input_ids = tokenizer.encode(doc, return_tensors='pt', padding=True, truncation=True, max_length=256)
        # The ids must live on the same device as the model weights.
        input_ids = input_ids.to(device)
        with torch.no_grad():
            features = phobert(input_ids)
        # (1, seq, hidden) -> mean over tokens -> (hidden,) -> plain Python list
        pooled = features.last_hidden_state.mean(dim=1).squeeze(0)
        embeddings.append(pooled.cpu().tolist())
    return embeddings

In [None]:
X_train_embeddings = np.array(phobert_embedding(X_train))
X_test_embeddings = np.array(phobert_embedding(X_test))

In [None]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
# One-hot encode the integer labels. The width is taken from the fitted
# encoder instead of a hard-coded 27, so it always matches the label set that
# was actually found in the training data.
num_classes = len(label_encoder.classes_)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

In [None]:
# Hold out 10% of the training embeddings for validation, stratified on the labels.
X_train_embeddings, X_valid_embeddings, y_train, y_val = train_test_split(
    X_train_embeddings,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Take the dimensions from the data instead of hard-coding 768 / 27.
input_size = X_train_embeddings.shape[1]  # width of one document embedding
num_classes = y_train.shape[1]            # width of the one-hot labels

# Small feed-forward classifier on top of the fixed document embeddings.
model = models.Sequential([
    # layers.Input(shape=...) replaces InputLayer(input_shape=...), whose
    # input_shape argument is deprecated in current Keras releases.
    layers.Input(shape=(input_size,)),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train the classifier, evaluating on the held-out validation split after each epoch.
history = model.fit(
    X_train_embeddings,
    y_train,
    validation_data=(X_valid_embeddings, y_val),
    epochs=30,
    batch_size=32,
)

In [None]:
# Persist the trained classifier to the Kaggle working directory.
# NOTE(review): the .h5 extension selects the legacy HDF5 format; recent Keras
# releases recommend the native .keras format. Confirm what downstream
# loading code expects before changing it.
model.save('/kaggle/working/text_classify.h5')

In [None]:
def plot_history(history, metric):
    """Plot the training and validation curves of ``metric`` over the epochs.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict
            must contain ``metric`` and ``'val_' + metric``.
        metric: name of the logged quantity, e.g. ``'accuracy'`` or ``'loss'``.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history[f'val_{metric}'])
    ax.set_title(f'model {metric}')
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    # The second curve comes from validation_data passed to fit(), not from
    # the test set, so it is labelled accordingly.
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()


plot_history(history, 'accuracy')
plot_history(history, 'loss')

In [None]:
# Predict class probabilities for the test embeddings and keep the arg-max class.
predictions = model.predict(X_test_embeddings)
y_pred = np.argmax(predictions, axis=1)
# y_test is one-hot the first time this cell runs. The guard keeps a rerun
# from calling argmax(axis=1) on the already-decoded 1-D labels, which fails.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=1)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

In [None]:
# Take the class names from the fitted encoder, which defines the integer ids,
# instead of a variable left over from the plotting cell. Passing `labels`
# explicitly keeps names and ids aligned even if a class is absent from the
# test set.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=list(label_encoder.classes_),
))