In [1]:
import re
import json
import random
from pathlib import Path
from collections import Counter

import numpy as np
from tqdm.notebook import trange
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, LSTM, Embedding, Dropout, Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [2]:
gpu_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.get_device_details(gpu_devices[0])

2022-07-26 17:05:53.719110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 17:05:53.727257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 17:05:53.727707: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 17:05:53.729131: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


{'compute_capability': (7, 5), 'device_name': 'NVIDIA GeForce GTX 1650'}

In [3]:
!nvidia-smi

Tue Jul 26 17:05:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 516.59       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| N/A   54C    P5     6W /  N/A |    162MiB /  4096MiB |     27%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
RANDOM_STATE = 7
SEQ_LEN = 512
VAL_SIZE = 0.05
EPOCHS = 2
BATCH_SIZE = 64
LR = 1e-3
SHUFFLE_BUFFER = 1_000
EMBEDDING_DIM = 32

In [5]:
X = []
y = []

books = Path("../Data/Text/Sherlock_Holmes/").rglob("*.txt")

for book in books:
    with book.open('r', encoding = 'utf-8') as book_file:
        book_data = book_file.read()
        book_data = re.sub("[ ]+", " ", book_data)
        char_len = len(book_data)

        for i in range(0, char_len - SEQ_LEN):
            X.append(book_data[i : i + SEQ_LEN])
            y.append(book_data[i + SEQ_LEN])

for i in np.random.randint(0, len(X), 5):
    print(f'Input: {X[i]!r}')
    print(f'Output: {y[i]}\n')

Input: 'ber right, you\n had not heard the name of Professor James Moriarty, who had one\n of the great brains of the century. Just give me down my index of\n biographies from the shelf.”\n\n He turned over the pages lazily, leaning back in his chair and\n blowing great clouds from his cigar.\n\n “My collection of M’s is a fine one,” said he. “Moriarty himself\n is enough to make any letter illustrious, and here is Morgan the\n poisoner, and Merridew of abominable memory, and Mathews, who\n knocked out my left canine in the w'
Output: a

Input: 'ommitted an\nindiscretion.”\n\n“I was mad—insane.”\n\n“You have compromised yourself seriously.”\n\n“I was only Crown Prince then. I was young. I am but thirty now.”\n\n“It must be recovered.”\n\n“We have tried and failed.”\n\n“Your Majesty must pay. It must be bought.”\n\n“She will not sell.”\n\n“Stolen, then.”\n\n“Five attempts have been made. Twice burglars in my pay ransacked her\nhouse. Once we diverted her luggage when she travelled. Twic

In [6]:
len(X), len(y)

(1762106, 1762106)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = VAL_SIZE, random_state = RANDOM_STATE)

len(X_train), len(y_train), len(X_test), len(y_test)

(1674000, 1674000, 88106, 88106)

In [8]:
char_freq_dict = Counter(y_train)
char_freq_dict.most_common(20)

[(' ', 295464),
 ('e', 155851),
 ('t', 112814),
 ('a', 101599),
 ('o', 98853),
 ('n', 83839),
 ('h', 80451),
 ('i', 77714),
 ('s', 77639),
 ('r', 72665),
 ('d', 53827),
 ('l', 48896),
 ('u', 38653),
 ('\n', 35880),
 ('m', 32644),
 ('c', 31278),
 ('w', 30752),
 ('f', 26027),
 ('y', 25655),
 ('g', 22287)]

In [9]:
del(X)
del(y)
del(book_data)

In [10]:
%%time

vectorizer = TextVectorization(standardize = None, split = "character", name = 'TextVectorizer')
vocab_json = Path("vocab.json")

if vocab_json.exists():
    with vocab_json.open("r") as vocab_file:
        vocab = json.load(vocab_file)["vocab"]
    
    vectorizer.set_vocabulary(vocab)
else:
    vectorizer.adapt(X_train)
    vocab = vectorizer.get_vocabulary()[2:]

    with vocab_json.open("w") as vocab_file:
        json.dump({"vocab": vocab}, vocab_file)

vocab = vectorizer.get_vocabulary()
char_count = len(vocab)
char_count

CPU times: user 726 ms, sys: 278 ms, total: 1 s
Wall time: 1.71 s


2022-07-26 17:05:59.231940: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-26 17:05:59.233694: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 17:05:59.234085: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-26 17:05:59.234471: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built witho

107

In [11]:
%%time

y_train = vectorizer(y_train).numpy().flatten()
y_test = vectorizer(y_test).numpy().flatten()
y_train.shape, y_test.shape

CPU times: user 6.56 s, sys: 48.5 ms, total: 6.61 s
Wall time: 6.6 s


((1674000,), (88106,))

In [12]:
train_ds = Dataset.from_tensor_slices((X_train, y_train)).shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).prefetch(AUTOTUNE)
train_ds, len(train_ds)

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 26157)

In [13]:
val_ds = Dataset.from_tensor_slices((X_test, y_test)).shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds, len(val_ds)

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 1377)

In [14]:
def get_lstm_model(char_count: int, embedding_dim: int = 32):
    input_layer = tf.keras.Input(shape = (1,), dtype = tf.string, name = 'Input')

    vectorizer_layer = vectorizer(input_layer)
    embedding_layer = Embedding(char_count + 1, embedding_dim, name = 'EmbeddingLayer')(vectorizer_layer)

    lstm_1 = LSTM(128, return_sequences = True, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_1')(embedding_layer)
    lstm_2 = LSTM(128, return_sequences = True, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_2')(lstm_1)
    lstm_3 = LSTM(128, dropout = 0.05, recurrent_dropout = 0.05, name = 'LSTM_3')(lstm_2)

    dense_1 = Dense(128, name = 'Dense_1')(lstm_3)
    lr_1 = LeakyReLU(name = 'LR_1')(dense_1)
    dropout_1 = Dropout(0.1, name = 'Dropout_1')(lr_1)

    dense_2 = Dense(128, name = 'Dense_2')(dropout_1)
    lr_2 = LeakyReLU(name = 'LR_2')(dense_2)
    dropout_2 = Dropout(0.1, name = 'Dropout_2')(lr_2)

    dense_3 = Dense(128, name = 'Dense_3')(dropout_2)
    lr_3 = LeakyReLU(name = 'LR_3')(dense_3)

    output_layer = Dense(char_count, activation = 'softmax', name = "Output")(lr_3)

    model = tf.keras.Model(inputs  = input_layer, outputs = output_layer)
    model.compile(optimizer = Adam(LR), loss = 'sparse_categorical_crossentropy', metrics = ['sparse_categorical_accuracy'])
    return model

model = get_lstm_model(char_count, EMBEDDING_DIM)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1)]               0         
                                                                 
 TextVectorizer (TextVectori  (None, None)             0         
 zation)                                                         
                                                                 
 EmbeddingLayer (Embedding)  (None, None, 32)          3456      
                                                                 
 LSTM_1 (LSTM)               (None, None, 128)         82432     
                                                                 
 LSTM_2 (LSTM)               (None, None, 128)         131584    
                                                                 
 LSTM_3 (LSTM)               (None, 128)               131584    
                                                             

In [15]:
%%time

history = model.fit(train_ds, validation_data = val_ds, epochs = EPOCHS, steps_per_epoch = 10, validation_steps = 5)

Epoch 1/2
Epoch 2/2
CPU times: user 9min 26s, sys: 6min 26s, total: 15min 53s
Wall time: 6min 57s


In [16]:
model.evaluate(val_ds.take(100))



[3.257868766784668, 0.17859375476837158]

In [17]:
%%time

sample_input = random.choice(X_train)
print(f"Input:\n{sample_input}")

pred_output = ''

for i in trange(50, desc = "Predicting chars", unit = " chars"):
    pred = model.predict([sample_input], verbose = False)
    pred_char_id = pred.argmax()
    pred_char = vocab[pred_char_id]
    pred_output += pred_char
    sample_input = sample_input[1:] + pred_char

print(f"Output:\n{pred_output}")

Input:
could only be a seaman who had been with him on
 the _Sea Unicorn_. So far as I could learn he had sailed in no
 other ship. I spent three days in wiring to Dundee, and at the
 end of that time I had ascertained the names of the crew of the
 _Sea Unicorn_ in 1883. When I found Patrick Cairns among the
 harpooners, my research was nearing its end. I argued that the
 man was probably in London, and that he would desire to leave the
 country for a time. I therefore spent some days in the East End,
 devised an 


Predicting chars:   0%|          | 0/50 [00:00<?, ? chars/s]

Output:
                                                  
CPU times: user 42.1 s, sys: 6.33 s, total: 48.4 s
Wall time: 1min 9s
