In [1]:
import re
import json
import random
from datetime import datetime
from pathlib import Path
from collections import Counter

import numpy as np
from tqdm.notebook import trange
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, LSTM, Embedding, Dropout, Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [2]:
# List the GPUs TensorFlow can see and display the details of the first one.
# On a CPU-only machine the list is empty and `gpu_devices[0]` would raise an
# IndexError, so show a short message instead of crashing the notebook.
gpu_devices = tf.config.list_physical_devices('GPU')
(
    tf.config.experimental.get_device_details(gpu_devices[0])
    if gpu_devices
    else "No GPU detected - TensorFlow will run on the CPU"
)

2022-07-26 18:34:37.915500: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 18:34:37.924748: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 18:34:37.925227: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 18:34:37.928333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


{'compute_capability': (7, 5), 'device_name': 'NVIDIA GeForce GTX 1650'}

In [3]:
!nvidia-smi

Tue Jul 26 18:34:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 516.59       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| N/A   59C    P0    12W /  N/A |    186MiB /  4096MiB |     18%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_STATE = 7

# Seed every random number generator the notebook relies on (Python's `random`,
# NumPy and TensorFlow) so that sampling, shuffling and weight initialisation
# start from the same state on every "Restart & Run All".
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# --- Data --------------------------------------------------------------------
SEQ_LEN = 512            # characters per input window
VAL_SIZE = 0.05          # fraction of windows held out by train_test_split
SHUFFLE_BUFFER = 1_000   # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64

# --- Model / training --------------------------------------------------------
EPOCHS = 10
LR = 1e-3                # learning rate handed to Adam
EMBEDDING_DIM = 32
TRAIN_STEPS = 25         # steps_per_epoch for model.fit
VAL_STEPS = 10           # validation_steps for model.fit

# --- TensorBoard -------------------------------------------------------------
# One timestamped log directory per run.
TB_LOGS = Path("tb_logs/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
TB_LOGS.mkdir(exist_ok = True, parents = True)

In [5]:
# Build (input window, next character) pairs from every book in the corpus.
X = []
y = []

# `rglob` yields paths in filesystem order, which is not guaranteed to be the
# same across machines or runs. Sorting fixes the order of X / y so that the
# seeded train/validation split is reproducible.
books = sorted(Path("../Data/Text/Sherlock_Holmes/").rglob("*.txt"))

if not books:
    raise FileNotFoundError("No .txt files found under ../Data/Text/Sherlock_Holmes/")

for book in books:
    with book.open('r', encoding = 'utf-8') as book_file:
        book_data = book_file.read()

    # Collapse runs of spaces into a single space (newlines are kept).
    book_data = re.sub("[ ]+", " ", book_data)
    char_len = len(book_data)

    # Slide a SEQ_LEN-character window over the text one character at a time;
    # the character right after the window is the prediction target.
    for i in range(char_len - SEQ_LEN):
        X.append(book_data[i : i + SEQ_LEN])
        y.append(book_data[i + SEQ_LEN])

if not X:
    raise ValueError(f"No book is longer than SEQ_LEN={SEQ_LEN} characters; no samples were built")

# Show a few random samples as a sanity check.
for i in np.random.randint(0, len(X), 5):
    print(f'Input: {X[i]!r}')
    print(f'Output: {y[i]}\n')

Input: 'hat you have not slept for a night or two,” said\n Holmes, in his easy, genial way. “That tries a man’s nerves more\n than work, and more even than pleasure. May I ask how I can help\n you?”\n\n “I wanted your advice, sir. I don’t know what to do and my whole\n life seems to have gone to pieces.”\n\n “You wish to employ me as a consulting detective?”\n\n “Not that only. I want your opinion as a judicious man—as a man\n of the world. I want to know what I ought to do next. I hope to\n God you’ll be able to tell me.”\n\n H'
Output: e

Input: 'notebook, broke his pencil, had to borrow one\n from our host and finally borrowed a knife to sharpen his own.\n The same curious accident happened to him in the rooms of the\n Indian—a silent, little, hook-nosed fellow, who eyed us askance,\n and was obviously glad when Holmes’s architectural studies had\n come to an end. I could not see that in either case Holmes had\n come upon the clue for which he was searching. Only at the third\n did 

In [6]:
# Inputs and targets must have the same number of entries.
tuple(len(samples) for samples in (X, y))

(1762106, 1762106)

In [7]:
# Hold out VAL_SIZE of the windows for validation; the split is seeded with
# RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = RANDOM_STATE,
    test_size = VAL_SIZE,
)

tuple(map(len, (X_train, y_train, X_test, y_test)))

(1674000, 1674000, 88106, 88106)

In [8]:
# Count how often each character occurs as a training target.
char_freq_dict = Counter()
char_freq_dict.update(y_train)

char_freq_dict.most_common(20)

[(' ', 295464),
 ('e', 155851),
 ('t', 112814),
 ('a', 101599),
 ('o', 98853),
 ('n', 83839),
 ('h', 80451),
 ('i', 77714),
 ('s', 77639),
 ('r', 72665),
 ('d', 53827),
 ('l', 48896),
 ('u', 38653),
 ('\n', 35880),
 ('m', 32644),
 ('c', 31278),
 ('w', 30752),
 ('f', 26027),
 ('y', 25655),
 ('g', 22287)]

In [9]:
# The full sample lists and the last raw book text are no longer needed once
# the split exists; drop the names so the memory can be reclaimed.
del X, y, book_data

In [10]:
%%time

# Character-level vectorizer: no text standardisation, one token per character.
vocab_json = Path("vocab.json")
vectorizer = TextVectorization(name = 'TextVectorizer', split = "character", standardize = None)

if not vocab_json.exists():
    # No cached vocabulary yet: learn it from the training windows and cache it.
    vectorizer.adapt(X_train)
    # The first two entries are skipped before caching - presumably the
    # padding and out-of-vocabulary tokens the layer reserves (confirm).
    cached_chars = vectorizer.get_vocabulary()[2:]
    vocab_json.write_text(json.dumps({"vocab": cached_chars}))
else:
    # Reuse the cached vocabulary instead of re-adapting.
    cached_chars = json.loads(vocab_json.read_text())["vocab"]
    vectorizer.set_vocabulary(cached_chars)

# From here on `vocab` is the layer's full vocabulary.
vocab = vectorizer.get_vocabulary()
char_count = len(vocab)
char_count

2022-07-26 18:34:43.044102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-26 18:34:43.045408: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 18:34:43.045978: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 18:34:43.046523: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built witho

CPU times: user 774 ms, sys: 201 ms, total: 974 ms
Wall time: 1.35 s


107

In [11]:
# Inverse-frequency class weights keyed by token id (position in `vocab`).
# Tokens that never occur as a training target are treated as if seen once,
# and every weight is capped at 1000.
total_freq = sum(char_freq_dict.values())

class_weight_dict = {
    token_id: min(
        round(total_freq / (char_freq_dict.get(token, 1) * 100), 3),
        1_000.0,
    )
    for token_id, token in enumerate(vocab)
}

class_weight_dict

{0: 1000.0,
 1: 1000.0,
 2: 0.057,
 3: 0.107,
 4: 0.148,
 5: 0.165,
 6: 0.169,
 7: 0.2,
 8: 0.208,
 9: 0.215,
 10: 0.216,
 11: 0.23,
 12: 0.311,
 13: 0.342,
 14: 0.433,
 15: 0.467,
 16: 0.513,
 17: 0.535,
 18: 0.544,
 19: 0.643,
 20: 0.653,
 21: 0.751,
 22: 0.764,
 23: 0.838,
 24: 0.905,
 25: 0.972,
 26: 1.32,
 27: 1.586,
 28: 1.613,
 29: 2.313,
 30: 2.744,
 31: 4.168,
 32: 4.579,
 33: 5.426,
 34: 6.68,
 35: 7.578,
 36: 7.547,
 37: 7.578,
 38: 7.754,
 39: 8.071,
 40: 8.792,
 41: 11.593,
 42: 12.73,
 43: 14.827,
 44: 14.92,
 45: 15.0,
 46: 15.572,
 47: 15.837,
 48: 16.607,
 49: 16.707,
 50: 16.558,
 51: 18.078,
 52: 19.175,
 53: 21.083,
 54: 21.684,
 55: 23.09,
 56: 23.218,
 57: 26.614,
 58: 36.471,
 59: 40.24,
 60: 47.557,
 61: 49.97,
 62: 56.746,
 63: 71.234,
 64: 82.463,
 65: 91.978,
 66: 107.308,
 67: 104.625,
 68: 153.578,
 69: 209.25,
 70: 214.615,
 71: 223.2,
 72: 229.315,
 73: 220.263,
 74: 229.315,
 75: 249.851,
 76: 298.929,
 77: 293.684,
 78: 304.364,
 79: 310.0,
 80: 321.923

In [12]:
%%time

# Turn the target characters into integer token ids. The vectorizer returns
# one id per single-character target, so the result is flattened to 1-D.
y_train = tf.reshape(vectorizer(y_train), [-1]).numpy()
y_test = tf.reshape(vectorizer(y_test), [-1]).numpy()

y_train.shape, y_test.shape

CPU times: user 6.71 s, sys: 93.8 ms, total: 6.8 s
Wall time: 6.77 s


((1674000,), (88106,))

In [13]:
# Training pipeline: slice -> shuffle -> batch -> prefetch.
train_ds = Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(SHUFFLE_BUFFER)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(AUTOTUNE)

train_ds, len(train_ds)

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 26157)

In [14]:
# Validation pipeline, built the same way as the training one.
val_ds = Dataset.from_tensor_slices((X_test, y_test))
val_ds = val_ds.shuffle(SHUFFLE_BUFFER)
val_ds = val_ds.batch(BATCH_SIZE)
val_ds = val_ds.prefetch(AUTOTUNE)

val_ds, len(val_ds)

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 1377)

In [15]:
def get_lstm_model(
    char_count: int,
    embedding_dim: int = 32,
    units: int = 128,
    dropout: float = 0.05,
    recurrent_dropout: float = 0.05,
    dense_dropout: float = 0.1,
    learning_rate = None,
):
    """Build and compile the character-level next-character prediction model.

    Architecture: raw string input -> notebook-level `vectorizer` ->
    embedding -> 3 stacked LSTMs -> 3 Dense + LeakyReLU blocks (dropout after
    the first two) -> softmax over `char_count` classes.

    Args:
        char_count: Number of output classes (size of the softmax layer).
        embedding_dim: Size of each character embedding vector.
        units: Width of every LSTM and hidden Dense layer.
        dropout: Input dropout rate of each LSTM layer.
        recurrent_dropout: Recurrent-state dropout rate of each LSTM layer.
            Per the Keras LSTM documentation, any value other than 0 makes
            the layer fall back from the fused cuDNN kernel to the generic
            implementation, which is much slower on GPU. Pass 0.0 to allow
            the cuDNN path.
        dense_dropout: Dropout rate applied after the first two Dense blocks.
        learning_rate: Adam learning rate; defaults to the notebook-level
            `LR` constant when left as None.

    Returns:
        A compiled `tf.keras.Model` that takes a batch of strings (shape
        `(batch, 1)`) and returns per-character probabilities.
    """
    if learning_rate is None:
        learning_rate = LR

    input_layer = tf.keras.Input(shape = (1,), dtype = tf.string, name = 'Input')

    # The vectorizer is the shared notebook-level layer, so the model accepts
    # raw text directly.
    x = vectorizer(input_layer)
    # `char_count + 1` rows are kept from the original design so the parameter
    # count is unchanged.
    x = Embedding(char_count + 1, embedding_dim, name = 'EmbeddingLayer')(x)

    # Three stacked LSTMs; only the last one collapses the sequence to a
    # single vector.
    n_lstm = 3
    for idx in range(1, n_lstm + 1):
        x = LSTM(
            units,
            return_sequences = idx < n_lstm,
            dropout = dropout,
            recurrent_dropout = recurrent_dropout,
            name = f'LSTM_{idx}',
        )(x)

    # Three Dense + LeakyReLU blocks; dropout follows all but the last block.
    n_dense = 3
    for idx in range(1, n_dense + 1):
        x = Dense(units, name = f'Dense_{idx}')(x)
        x = LeakyReLU(name = f'LR_{idx}')(x)
        if idx < n_dense:
            x = Dropout(dense_dropout, name = f'Dropout_{idx}')(x)

    output_layer = Dense(char_count, activation = 'softmax', name = "Output")(x)

    model = tf.keras.Model(inputs  = input_layer, outputs = output_layer, name = 'Text_Generation_Model')
    model.compile(
        optimizer = Adam(learning_rate),
        loss = 'sparse_categorical_crossentropy',
        metrics = ['sparse_categorical_accuracy'],
    )
    return model

# Instantiate the network with the notebook's vocabulary size and embedding
# width, then print its layer summary.
model = get_lstm_model(char_count = char_count, embedding_dim = EMBEDDING_DIM)
model.summary()

Model: "Text_Generation_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1)]               0         
                                                                 
 TextVectorizer (TextVectori  (None, None)             0         
 zation)                                                         
                                                                 
 EmbeddingLayer (Embedding)  (None, None, 32)          3456      
                                                                 
 LSTM_1 (LSTM)               (None, None, 128)         82432     
                                                                 
 LSTM_2 (LSTM)               (None, None, 128)         131584    
                                                                 
 LSTM_3 (LSTM)               (None, 128)               131584    
                                             

In [16]:
%%time

# Log training curves to this run's TensorBoard directory.
tensorboard = TensorBoard(log_dir = str(TB_LOGS))

# Each epoch consumes TRAIN_STEPS training batches and VAL_STEPS validation
# batches; the loss is re-weighted per class with `class_weight_dict`.
fit_options = dict(
    validation_data = val_ds,
    epochs = EPOCHS,
    steps_per_epoch = TRAIN_STEPS,
    validation_steps = VAL_STEPS,
    class_weight = class_weight_dict,
    callbacks = [tensorboard],
)

history = model.fit(train_ds, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1h 45min 34s, sys: 1h 21min 27s, total: 3h 7min 2s
Wall time: 1h 21min 39s


In [17]:
# Evaluate on the first 100 validation batches; returns [loss, accuracy].
eval_batches = val_ds.take(100)
model.evaluate(eval_batches)



[4.396735191345215, 0.0026562500279396772]

In [18]:
%%time

# Greedy generation: repeatedly predict the next character for the current
# window, append it, and slide the window forward by one character.
sample_input = random.choice(X_train)
print(f"Input:\n{sample_input}")


@tf.function
def _next_char_probs(text_batch):
    """Return the model's next-character probabilities for a batch of strings.

    Calling `model.predict` inside a loop rebuilds its input pipeline on every
    call; a traced function is compiled once and reused for every step.
    """
    return model(text_batch, training = False)


# The first two vocabulary entries are assumed to be the vectorizer's reserved
# padding ('') and out-of-vocabulary ('[UNK]') tokens - confirm against
# `vocab[:2]`. Emitting either would shrink or grow the input window, so the
# argmax is restricted to real characters.
N_RESERVED = 2

pred_chars = []

for i in trange(100, desc = "Predicting chars", unit = " char"):
    # Shape (1, 1): one sample holding one string, matching the model's Input.
    probs = _next_char_probs(tf.constant([[sample_input]])).numpy()[0]
    pred_char_id = N_RESERVED + int(probs[N_RESERVED:].argmax())
    pred_char = vocab[pred_char_id]
    pred_chars.append(pred_char)
    # Drop the oldest character and append the prediction.
    sample_input = sample_input[1:] + pred_char

pred_output = ''.join(pred_chars)
print(f"Output:\n{pred_output}")

Input:
d it be?”

 “That is what Mr. Hilton Cubitt, of Riding Thorpe Manor, Norfolk,
 is very anxious to know. This little conundrum came by the first
 post, and he was to follow by the next train. There’s a ring at
 the bell, Watson. I should not be very much surprised if this
 were he.”

 A heavy step was heard upon the stairs, and an instant later
 there entered a tall, ruddy, clean-shaven gentleman, whose clear
 eyes and florid cheeks told of a life led far from the fogs of
 Baker Street. He seemed to bring a 


Predicting chars:   0%|          | 0/100 [00:00<?, ? char/s]

Output:
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CPU times: user 1min 18s, sys: 7.62 s, total: 1min 26s
Wall time: 2min 11s
