In [None]:
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, LSTM, Embedding, Dropout, Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [None]:
# List the GPUs TensorFlow can see and display the details of the first one.
gpu_devices = tf.config.list_physical_devices('GPU')
# Guard the lookup: on a CPU-only machine the list is empty and indexing [0]
# would raise IndexError, aborting a "Restart & Run All".
tf.config.experimental.get_device_details(gpu_devices[0]) if gpu_devices else 'No GPU visible to TensorFlow'

In [None]:
# Shell out to the `nvidia-smi` command-line tool and show whatever it prints.
!nvidia-smi

In [None]:
# Run configuration: everything a reader might want to tune lives here.
RANDOM_STATE = 7        # seed for reproducible randomness
SEQ_LEN = 256           # characters per input window
VAL_SIZE = 0.05         # fraction of samples held out for validation
EPOCHS = 2              # training epochs
BATCH_SIZE = 40         # samples per batch
LR = 1e-3               # optimizer learning rate
SHUFFLE_BUFFER = 1_000  # tf.data shuffle buffer size
EMBEDDING_DIM = 32      # width of the character embedding

# Materialise and sort the matches instead of keeping the lazy rglob generator:
#  * a generator is exhausted after one pass, so re-running the loading cell
#    would silently produce no data;
#  * rglob order depends on the filesystem, so sorting keeps the sample order
#    (and therefore the seeded split) stable across machines.
books = sorted(Path("../Data/Text/Sherlock_Holmes/").rglob("*.txt"))

In [None]:
# Build (input window, next character) pairs from every book:
# X[k] is SEQ_LEN consecutive characters, y[k] is the character that follows them.
X = []
y = []

for book in books:
    book_data = book.read_text(encoding = 'utf-8')

    # Slide a one-character-stride window over the text. A book with at most
    # SEQ_LEN characters gives an empty range and contributes no samples.
    for i in range(len(book_data) - SEQ_LEN):
        X.append(book_data[i : i + SEQ_LEN])
        y.append(book_data[i + SEQ_LEN])

# Fail with a clear message instead of the obscure "low >= high" error the
# preview sampling below would raise on an empty dataset.
if not X:
    raise ValueError(
        f"No training windows were built: found no usable '*.txt' text of at least {SEQ_LEN + 1} characters."
    )

# Preview three random samples; the generator is seeded so the preview is
# identical on every run.
preview_rng = np.random.default_rng(RANDOM_STATE)
for i in preview_rng.integers(0, len(X), 3):
    print(f'Input: {X[i]!r}')
    # !r keeps whitespace targets (space, newline) visible in the output.
    print(f'Output: {y[i]!r}\n')

In [None]:
# Number of input windows and number of targets (displayed as a tuple).
tuple(len(samples) for samples in (X, y))

In [None]:
# Hold out a VAL_SIZE fraction of the samples using a seeded split.
# NOTE(review): neighbouring windows overlap heavily (stride of one character),
# so a random split likely places near-duplicate windows on both sides and
# makes the held-out score optimistic — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = VAL_SIZE,
    random_state = RANDOM_STATE,
)

# Sizes of the four resulting collections.
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

In [None]:
# Drop the full sample lists and the last book's text; only the split copies
# are needed from here on.
del X, y, book_data

In [None]:
%%time

vectorizer = TextVectorization(standardize = None, split = "character", name = 'TextVectorizer')
vectorizer.adapt(X_train)
char_count = vectorizer.vocabulary_size()
char_count

In [None]:
%%time

y_train = vectorizer(y_train).numpy().flatten()
y_test = vectorizer(y_test).numpy().flatten()
y_train.shape, y_test.shape

In [None]:
# Training pipeline: shuffle -> batch -> prefetch.
# The shuffle is seeded with RANDOM_STATE (already used for the split) so the
# batch order is the same on every run instead of changing from run to run.
train_ds = (
    Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(SHUFFLE_BUFFER, seed = RANDOM_STATE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
train_ds, len(train_ds)

In [None]:
# Validation pipeline: batch -> prefetch, deliberately WITHOUT shuffling.
# A shuffled validation set hands out different batches on every pass, so any
# evaluation that reads only part of it (a fixed number of steps, or `.take(n)`)
# would score a different subset each time and the numbers would not be comparable.
val_ds = (
    Dataset.from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
val_ds, len(val_ds)

In [None]:
def get_lstm_model(char_count: int, embedding_dim: int = 32):
    """Build and compile the next-character LSTM classifier.

    The model takes raw strings, converts them to character ids with the
    notebook-level `vectorizer`, embeds the ids, runs three stacked LSTM layers
    followed by a small dense head, and ends in a softmax over `char_count`
    classes. It is compiled with Adam (learning rate taken from the
    notebook-level `LR`) and sparse categorical cross-entropy.

    Args:
        char_count: Number of distinct token ids; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of each character embedding vector.

    Returns:
        The compiled `tf.keras.Model`.
    """
    input_layer = tf.keras.Input(shape = (1,), dtype = tf.string, name = 'Input')
    vectorizer_layer = vectorizer(input_layer)

    # The inputs and the labels are encoded by the same vectorizer and the output
    # layer has `char_count` classes, so every id is < char_count. A table of
    # exactly `char_count` rows is enough; the previous `+ 1` only added an
    # unused row.
    embedding_layer = Embedding(char_count, embedding_dim, name = 'EmbeddingLayer')(vectorizer_layer)

    # NOTE(review): per the Keras LSTM documentation a non-zero recurrent_dropout
    # makes the layer ineligible for the fused cuDNN kernel, so on GPU these
    # layers presumably fall back to the slower generic implementation — confirm
    # the extra regularisation is worth the cost.
    lstm_1 = LSTM(256, return_sequences = True, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_1')(embedding_layer)
    lstm_2 = LSTM(256, return_sequences = True, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_2')(lstm_1)
    # The last LSTM returns only its final state, collapsing the sequence axis.
    lstm_3 = LSTM(256, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_3')(lstm_2)

    # Dense head: each linear layer is followed by a separate LeakyReLU activation.
    dense_1 = Dense(256, name = 'Dense_1')(lstm_3)
    lr_1 = LeakyReLU(name = 'LR_1')(dense_1)
    dropout = Dropout(0.1, name = 'Dropout')(lr_1)
    dense_2 = Dense(128, name = 'Dense_2')(dropout)
    lr_2 = LeakyReLU(name = 'LR_2')(dense_2)
    output_layer = Dense(char_count, activation = 'softmax', name = "Output")(lr_2)

    model = tf.keras.Model(inputs  = input_layer, outputs = output_layer)
    # Sparse loss/metric: the labels are integer token ids, not one-hot vectors.
    model.compile(optimizer = Adam(LR), loss = 'sparse_categorical_crossentropy', metrics = ['sparse_categorical_accuracy'])
    return model

# Instantiate the network for the learned vocabulary and print its layer summary.
model = get_lstm_model(char_count = char_count, embedding_dim = EMBEDDING_DIM)
model.summary()

In [None]:
%%time

history = model.fit(train_ds, validation_data = val_ds, epochs = EPOCHS, steps_per_epoch = 100, validation_steps = 50)

In [None]:
# Score the trained model on 100 batches taken from the validation dataset.
eval_batches = val_ds.take(100)
model.evaluate(eval_batches)