In [None]:
import gc
import os
import re
import string
from tqdm import tqdm
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.optimizers import Adam
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
DATA_PATH = '/kaggle/input/10000-vietnamese-books/output'
MAX_DOCS = 6000  # upper bound on the number of files read into memory

# data[i] holds the raw text of a file, name[i] its filename without extension.
data = []
name = []
# os.listdir() returns entries in arbitrary order, so sort before slicing to
# make the selected subset reproducible across runs and machines.
for filename in sorted(os.listdir(DATA_PATH))[:MAX_DOCS]:
    # splitext() strips only the final extension, so names that themselves
    # contain dots are kept intact.
    name.append(os.path.splitext(filename)[0])
    filepath = os.path.join(DATA_PATH, filename)
    # Explicit UTF-8 avoids depending on the platform's default encoding;
    # the context manager closes the handle even if read() raises.
    with open(filepath, 'r', encoding='utf-8') as f:
        data.append(f.read())

In [None]:
# Mean number of whitespace-separated tokens per document, printed as a
# quick sanity check on the loaded corpus; the temporary is dropped afterwards.
avg = sum(len(doc.split()) for doc in data) / len(data)
print(f'Average words per doc: {avg}')
del avg

In [None]:
# Display the first stored filename (extension already stripped) to inspect its format.
name[0]

In [None]:
# A filename that splits into exactly two parts on ' - ' has both parts
# removed from the corresponding document text; other filenames are skipped.
for idx, book_name in enumerate(name):
    parts = book_name.split(' - ')
    if len(parts) != 2:
        continue
    first_part, second_part = parts
    data[idx] = data[idx].replace(first_part, '').replace(second_part, '')
# The filenames are not needed past this point.
del name

In [None]:
# ASCII punctuation plus the curly double quotes; read by normalize() when
# building its character-removal pattern.
punctuation = string.punctuation + '“”'
punctuation

In [None]:
def normalize(text, punct=None):
    """Clean one raw document into lowercase words separated by single spaces.

    Steps, in order:
      1. Truncate at the first occurrence of "Mục lục" (table of contents).
      2. Remove HTML-like tags.
      3. Remove "Nguồn: <url>" source lines and any remaining http(s) links.
      4. Lowercase.
      5. Replace punctuation, digits, newlines and tabs with spaces.
      6. Collapse runs of whitespace and strip the ends.

    Args:
        text: Raw document text.
        punct: Characters to treat as punctuation. Defaults to the
            notebook-level ``punctuation`` string.

    Returns:
        The normalized text.
    """
    if punct is None:
        punct = punctuation

    # If "Mục lục" is found, cut everything from that position onward.
    toc_start = text.find("Mục lục")
    if toc_start != -1:
        text = text[:toc_start]

    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)

    # Remove source attributions and bare links. `https?` (rather than
    # `http?`) is required so that https URLs are matched together with
    # their "Nguồn:" prefix.
    text = re.sub(r'Nguồn:\s*https?://\S+', '', text)
    text = re.sub(r'http\S*', '', text)

    text = text.lower()

    # re.escape() keeps characters such as `\`, `]`, `^` and `-` literal
    # inside the character class; the raw f-string keeps `\d` a regex escape.
    text = re.sub(rf'[{re.escape(punct)}₫—℅\d\n\t]', ' ', text)

    # Collapse whitespace.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Run normalize() over every document; `data` is rebound to the cleaned texts.
data = list(map(normalize, data))

In [None]:
# Show the first 500 characters of the first normalized document.
data[0][:500]

In [None]:
# Build the word index from the normalized corpus; 'OOV' is passed as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(data)

In [None]:
# The Keras Tokenizer assigns word indices starting at 1 (0 is reserved), so
# the largest index equals len(word_index). Layers sized by vocab_size
# (Embedding input_dim, softmax width) need max index + 1 entries, otherwise
# the highest-indexed word is out of range.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# Encode every document with the fitted tokenizer. From here on `data` holds
# the tokenizer's sequence output instead of strings.
data = tokenizer.texts_to_sequences(data)

In [None]:
# Force a garbage-collection pass now that `data` has been rebound and the
# string versions of the documents are no longer referenced by it.
gc.collect()

In [None]:
# Slide a fixed-length window over every encoded document: each window of
# `sequence_length` tokens is an input and the token right after it is the target.
sequence_length = 50
step = 25
X, y = [], []
for encoded_text in tqdm(data):
    # Start positions stop early enough that the target index stays in range.
    last_start = len(encoded_text) - sequence_length
    for start in range(0, last_start, step):
        stop = start + sequence_length
        X.append(encoded_text[start:stop])
        y.append(encoded_text[stop])

In [None]:
# Number of (window, next-token) training pairs produced.
len(X)

In [None]:
# X = np.array(X)
# y = to_categorical(y, num_classes=vocab_size)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the (window, target) pairs for validation; the fixed
# random_state makes the split repeatable.
# NOTE(review): windows are built with step=25 and sequence_length=50, so
# neighbouring windows overlap. A random split can place overlapping windows
# on both sides, which may make validation metrics optimistic — confirm
# whether a per-document split is wanted.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from keras.utils import Sequence

class DataGenerator(Sequence):
    """Serve (X, y) to Keras one batch at a time as NumPy arrays.

    Samples stay in their original container and only the requested slice is
    converted to an array, so the full dataset is never materialized as one
    array.

    Args:
        X: Indexable collection of input samples (supports len() and slicing).
        y: Indexable collection of targets, same length as ``X``.
        batch_size: Number of samples per batch; must be positive.
        **kwargs: Forwarded to the ``Sequence`` base constructor (for example
            ``workers`` on Keras versions that accept it).

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` and ``y``
            differ in length.
    """

    def __init__(self, X, y, batch_size, **kwargs):
        # Keras 3 expects subclasses to initialize the base class; on older
        # versions this is a harmless no-op when no kwargs are given.
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        if len(X) != len(y):
            raise ValueError(f'X and y must have the same length, got {len(X)} and {len(y)}')
        self.X = X
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as a ``(inputs, targets)`` array pair."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        return np.array(self.X[start:stop]), np.array(self.y[start:stop])

In [None]:
# Wrap the train and validation splits in batch generators that share one batch size.
batch_size = 256
train_generator = DataGenerator(X_train, y_train, batch_size=batch_size)
val_generator = DataGenerator(X_val, y_val, batch_size=batch_size)

In [None]:
# Next-token model: embedding -> two stacked LSTMs with dropout -> softmax
# over the vocabulary. Targets are integer ids, hence the sparse loss.
model = Sequential([
    # input_length must match the window length fed to the model
    # (`sequence_length`), not an unrelated constant.
    Embedding(input_dim=vocab_size, output_dim=100, input_length=sequence_length),
    LSTM(256, return_sequences=True),  # full sequence is passed to the next LSTM
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

In [None]:
# Both callbacks watch val_loss. The learning-rate patience is deliberately
# shorter than the early-stopping patience: with equal values training would
# normally stop in the same epoch the rate is first lowered, so the reduced
# rate would never be used.
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.1,
                              patience=3,
                              min_lr=1e-6,
                              verbose=1)
# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=10,
                               restore_best_weights=True,
                               verbose=1)
callbacks = [reduce_lr, early_stopping]

In [None]:
# Train for at most 100 epochs, evaluating on the validation generator each
# epoch; the callbacks list is passed so they can act during training.
# `history` keeps the per-epoch metrics.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=100,
                    callbacks=callbacks)

In [None]:
import pickle

# Persist the trained network, then pickle the fitted tokenizer and the
# encoded corpus next to it.
model.save('./text_generation_model.h5')

for out_path, obj in (('./tokenizer.pkl', tokenizer),
                      ('./sequences_digit.pkl', data)):
    with open(out_path, 'wb') as f:
        pickle.dump(obj, f)

In [None]:
# pyplot was never imported in this notebook; without this line the cell
# fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt


def plot_history_metric(history, metric):
    """Plot one training metric and its validation counterpart per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain ``metric`` and ``'val_' + metric``.
        metric: Metric name, e.g. ``'accuracy'`` or ``'loss'``.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    # The second curve comes from the validation data, so label it as such.
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=f'model {metric}', ylabel=metric, xlabel='epoch')
    ax.legend(loc='upper left')
    plt.show()


for metric in ('accuracy', 'loss'):
    plot_history_metric(history, metric)