In [1]:
import re
import json
import random
from datetime import datetime
from pathlib import Path
from collections import Counter

import numpy as np
from tqdm.notebook import trange
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, LSTM, Embedding, Dropout, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [2]:
# List the GPUs visible to TensorFlow and show the details of the first one.
gpu_devices = tf.config.list_physical_devices('GPU')
# Guard against CPU-only machines: indexing an empty list would raise IndexError.
tf.config.experimental.get_device_details(gpu_devices[0]) if gpu_devices else "No GPU detected; TensorFlow will run on CPU"

2022-07-31 07:26:57.456693: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 07:26:57.534588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 07:26:57.535052: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 07:26:57.540748: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


{'compute_capability': (7, 5), 'device_name': 'NVIDIA GeForce GTX 1650'}

In [3]:
# Shell out to the NVIDIA CLI to show driver / CUDA versions and current GPU memory usage.
!nvidia-smi

Sun Jul 31 07:26:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 516.59       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P8     2W /  N/A |      0MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [19]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_STATE = 7

# Seed every RNG the notebook draws from (Python, NumPy, TensorFlow) so sample
# previews, weight initialisation and dataset shuffling are repeatable.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# --- Data --------------------------------------------------------------------
SEQ_LEN = 512                       # characters per input window
VAL_SIZE = 0.05                     # fraction of windows held out for validation

# --- Training ----------------------------------------------------------------
EPOCHS = 10
BATCH_SIZE = 128
LR = 5e-4                           # Adam learning rate
SHUFFLE_BUFFER = BATCH_SIZE * 20    # tf.data shuffle buffer, in samples
EMBEDDING_DIM = 32
DROPOUT_RATIO = 0.2
TRAIN_STEPS = 2500                  # batches per training epoch
VAL_STEPS = 250                     # batches per validation pass
EARLY_STOP_PATIENCE = 3

# --- Output locations --------------------------------------------------------
# Each run logs to its own timestamped TensorBoard directory.
TB_LOGS = Path("tb_logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
TB_LOGS.mkdir(exist_ok = True, parents = True)

MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok = True, parents = True)

In [5]:
X = []
y = []

# Sort the paths: rglob yields files in filesystem order, which would otherwise make
# the order of X / y (and therefore the seeded train/validation split) machine-dependent.
books = sorted(Path("../Data/Text/Sherlock_Holmes/").rglob("*.txt"))

if not books:
    raise FileNotFoundError("No .txt files found under ../Data/Text/Sherlock_Holmes/")

for book in books:
    book_data = book.read_text(encoding = 'utf-8')
    # Collapse runs of spaces into a single space.
    book_data = re.sub("[ ]+", " ", book_data)
    char_len = len(book_data)

    # Slide a SEQ_LEN-character window over the text; the target is the character
    # immediately following each window.
    for i in range(0, char_len - SEQ_LEN):
        X.append(book_data[i : i + SEQ_LEN])
        y.append(book_data[i + SEQ_LEN])

if not X:
    raise ValueError(f"No training windows produced: every book is shorter than SEQ_LEN + 1 = {SEQ_LEN + 1} characters")

# Preview a few random (window, next character) pairs.
for i in np.random.randint(0, len(X), 5):
    print(f'Input: {X[i]!r}')
    print(f'Output: {y[i]}\n')

Input: ' it was\n for Lady Hilda Trelawney Hope that Sherlock Holmes inquired. We\n were shown into the morning-room.\n\n “Mr. Holmes!” said the lady, and her face was pink with her\n indignation. “This is surely most unfair and ungenerous upon your\n part. I desired, as I have explained, to keep my visit to you a\n secret, lest my husband should think that I was intruding into\n his affairs. And yet you compromise me by coming here and so\n showing that there are business relations between us.”\n\n “Unfortunately, madam, I h'
Output: a

Input: 'ghing; “but since, as you said\njust now, there has been no crime committed, and no harm done save the\nloss of a goose, all this seems to be rather a waste of energy.”\n\nSherlock Holmes had opened his mouth to reply, when the door flew open,\nand Peterson, the commissionaire, rushed into the apartment with\nflushed cheeks and the face of a man who is dazed with astonishment.\n\n“The goose, Mr. Holmes! The goose, sir!” he gasped.\n\n“Eh? What 

In [6]:
# Sanity check: there should be exactly one target character per input window.
n_inputs, n_targets = len(X), len(y)
n_inputs, n_targets

(1762106, 1762106)

In [7]:
# Hold out a VAL_SIZE fraction of the windows for validation, seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = VAL_SIZE,
    random_state = RANDOM_STATE,
)

tuple(len(part) for part in (X_train, y_train, X_test, y_test))

(1674000, 1674000, 88106, 88106)

In [8]:
# Count how often each character occurs as a training target.
char_freq_dict = Counter()
char_freq_dict.update(y_train)
char_freq_dict.most_common(20)

[(' ', 295464),
 ('e', 155851),
 ('t', 112814),
 ('a', 101599),
 ('o', 98853),
 ('n', 83839),
 ('h', 80451),
 ('i', 77714),
 ('s', 77639),
 ('r', 72665),
 ('d', 53827),
 ('l', 48896),
 ('u', 38653),
 ('\n', 35880),
 ('m', 32644),
 ('c', 31278),
 ('w', 30752),
 ('f', 26027),
 ('y', 25655),
 ('g', 22287)]

In [9]:
# The full window lists and the last raw book text are no longer needed once the
# split exists; drop the references so their memory can be reclaimed.
del X, y, book_data

In [10]:
%%time

# Character-level tokenizer: no standardization, every character is its own token.
vectorizer = TextVectorization(standardize = None, split = "character", name = 'TextVectorizer')
vocab_json = Path("vocab.json")

if not vocab_json.exists():
    # First run: learn the vocabulary from the training windows and cache it.
    # The first two entries of get_vocabulary() are skipped when caching
    # (presumably the reserved padding / OOV tokens, which set_vocabulary adds back itself).
    vectorizer.adapt(X_train)
    learned_vocab = vectorizer.get_vocabulary()[2:]
    vocab_json.write_text(json.dumps({"vocab": learned_vocab}))
else:
    # Later runs: reuse the cached vocabulary instead of re-adapting.
    cached_vocab = json.loads(vocab_json.read_text())["vocab"]
    vectorizer.set_vocabulary(cached_vocab)

# Full vocabulary as the layer reports it; list position == token id.
vocab = vectorizer.get_vocabulary()
char_count = len(vocab)
char_count

CPU times: user 793 ms, sys: 565 ms, total: 1.36 s
Wall time: 2.22 s


2022-07-31 07:27:04.843743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-31 07:27:04.846745: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 07:27:04.847271: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 07:27:04.847727: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built witho

107

In [11]:
# Inverse-square-root frequency weighting: the rarer a target character, the larger its weight.
# Vocabulary entries that never occur as a training target fall back to a count of 1.
total_freq = sum(char_freq_dict.values())

class_weight_dict = {
    token_id: round(np.sqrt(total_freq / (char_freq_dict.get(token, 1) * char_count)), 2)
    for token_id, token in enumerate(vocab)
}

class_weight_dict

{0: 125.08,
 1: 125.08,
 2: 0.23,
 3: 0.32,
 4: 0.37,
 5: 0.39,
 6: 0.4,
 7: 0.43,
 8: 0.44,
 9: 0.45,
 10: 0.45,
 11: 0.46,
 12: 0.54,
 13: 0.57,
 14: 0.64,
 15: 0.66,
 16: 0.69,
 17: 0.71,
 18: 0.71,
 19: 0.78,
 20: 0.78,
 21: 0.84,
 22: 0.85,
 23: 0.89,
 24: 0.92,
 25: 0.95,
 26: 1.11,
 27: 1.22,
 28: 1.23,
 29: 1.47,
 30: 1.6,
 31: 1.97,
 32: 2.07,
 33: 2.25,
 34: 2.5,
 35: 2.66,
 36: 2.66,
 37: 2.66,
 38: 2.69,
 39: 2.75,
 40: 2.87,
 41: 3.29,
 42: 3.45,
 43: 3.72,
 44: 3.73,
 45: 3.74,
 46: 3.81,
 47: 3.85,
 48: 3.94,
 49: 3.95,
 50: 3.93,
 51: 4.11,
 52: 4.23,
 53: 4.44,
 54: 4.5,
 55: 4.65,
 56: 4.66,
 57: 4.99,
 58: 5.84,
 59: 6.13,
 60: 6.67,
 61: 6.83,
 62: 7.28,
 63: 8.16,
 64: 8.78,
 65: 9.27,
 66: 10.01,
 67: 9.89,
 68: 11.98,
 69: 13.98,
 70: 14.16,
 71: 14.44,
 72: 14.35,
 73: 14.64,
 74: 14.64,
 75: 15.28,
 76: 16.71,
 77: 16.57,
 78: 16.87,
 79: 17.02,
 80: 17.35,
 81: 20.03,
 82: 25.02,
 83: 25.53,
 84: 26.67,
 85: 27.97,
 86: 39.55,
 87: 44.22,
 88: 44.22,
 89: 51.0

In [12]:
%%time

# Encode the target characters as integer token ids and flatten them to 1-D arrays.
# NOTE: this rebinds y_train / y_test, so the cell must run exactly once per kernel session.
y_train, y_test = (
    vectorizer(targets).numpy().flatten()
    for targets in (y_train, y_test)
)
y_train.shape, y_test.shape

CPU times: user 6.54 s, sys: 192 ms, total: 6.74 s
Wall time: 6.76 s


((1674000,), (88106,))

In [13]:
# Training pipeline: shuffle within a bounded buffer, repeat indefinitely, then batch and prefetch.
train_ds = (
    Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(SHUFFLE_BUFFER)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
train_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [14]:
# Validation pipeline, built with the same stages as the training one:
# shuffle -> repeat -> batch -> prefetch.
val_ds = Dataset.from_tensor_slices((X_test, y_test))
val_ds = val_ds.shuffle(SHUFFLE_BUFFER).repeat()
val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [15]:
def get_lstm_model(char_count: int, embedding_dim: int = 32):
    """Build and compile the next-character prediction model.

    The network takes a raw string, tokenizes it with the module-level
    ``vectorizer``, embeds the tokens, runs them through two stacked LSTMs and a
    small dense head, and ends in a softmax over ``char_count`` classes.

    Besides its arguments, the function reads the module-level ``vectorizer``,
    ``DROPOUT_RATIO`` and ``LR``.

    Args:
        char_count: Number of output classes; the embedding table is created
            with ``char_count + 1`` rows.
        embedding_dim: Width of the character embedding vectors.

    Returns:
        A ``tf.keras.Model`` compiled with Adam, sparse categorical
        cross-entropy loss and sparse categorical accuracy.
    """
    inputs = tf.keras.Input(shape = (1,), dtype = tf.string, name = 'Input')

    # Raw string -> token ids -> embeddings.
    x = vectorizer(inputs)
    x = Embedding(char_count + 1, embedding_dim, name = 'EmbeddingLayer')(x)

    # Recurrent encoder: LSTM_1 returns the full sequence, LSTM_2 only its final output.
    x = LSTM(512, return_sequences = True, dropout = DROPOUT_RATIO, name = 'LSTM_1')(x)
    x = LSTM(256, dropout = DROPOUT_RATIO, name = 'LSTM_2')(x)
    x = BatchNormalization(name = 'BN_1')(x)

    # Dense head, block 1.
    x = Dense(256, name = 'Dense_1')(x)
    x = LeakyReLU(name = 'LR_1')(x)
    x = Dropout(DROPOUT_RATIO, name = 'Dropout_1')(x)

    # Dense head, block 2.
    x = Dense(128, name = 'Dense_2')(x)
    x = LeakyReLU(name = 'LR_2')(x)
    x = Dropout(DROPOUT_RATIO, name = 'Dropout_2')(x)
    x = BatchNormalization(name = 'BN_2')(x)

    # Dense head, block 3.
    x = Dense(128, name = 'Dense_3')(x)
    x = LeakyReLU(name = 'LR_3')(x)

    # Probability distribution over the next token.
    outputs = Dense(char_count, activation = 'softmax', name = "Output")(x)

    model = tf.keras.Model(inputs = inputs, outputs = outputs, name = 'Text_Generation_Model')
    model.compile(
        optimizer = Adam(LR),
        loss = 'sparse_categorical_crossentropy',
        metrics = ['sparse_categorical_accuracy'],
    )
    return model

# Instantiate the network for the current vocabulary size and print its layer summary.
model = get_lstm_model(char_count = char_count, embedding_dim = EMBEDDING_DIM)
model.summary()

Model: "Text_Generation_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1)]               0         
                                                                 
 TextVectorizer (TextVectori  (None, None)             0         
 zation)                                                         
                                                                 
 EmbeddingLayer (Embedding)  (None, None, 32)          3456      
                                                                 
 LSTM_1 (LSTM)               (None, None, 512)         1116160   
                                                                 
 LSTM_2 (LSTM)               (None, 256)               787456    
                                                                 
 BN_1 (BatchNormalization)   (None, 256)               1024      
                                             

In [16]:
%%time

# Callbacks: early stopping (restoring the best weights seen) and TensorBoard logging.
earlystop = EarlyStopping(restore_best_weights = True, patience = EARLY_STOP_PATIENCE)
tensorboard = TensorBoard(log_dir = str(TB_LOGS))

# Both datasets repeat indefinitely, so the step counts define the length of an "epoch".
fit_options = dict(
    validation_data = val_ds,
    epochs = EPOCHS,
    steps_per_epoch = TRAIN_STEPS,
    validation_steps = VAL_STEPS,
    class_weight = class_weight_dict,
    callbacks = [earlystop, tensorboard],
)

history = model.fit(train_ds, **fit_options)

Epoch 1/10


2022-07-31 07:27:39.130986: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2h 27min 43s, sys: 1h 8min 54s, total: 3h 36min 38s
Wall time: 3h 38min 42s


In [17]:
# val_ds repeats indefinitely, so cap the evaluation at a fixed number of batches.
eval_batches = val_ds.take(500)
model.evaluate(eval_batches)



[1.9762729406356812, 0.4148281216621399]

In [18]:
%%time

# Seed the generator with a random training window, then greedily extend it one
# character at a time, sliding the window forward after every prediction.
sample_input = random.choice(X_train)
print(f"Input:\n{sample_input}")

GEN_CHARS = 100     # number of characters to generate
RESERVED_IDS = 2    # leading vocabulary entries skipped when the vocab was cached (padding '' and '[UNK]')

window = sample_input   # keep the printed seed intact; only the working copy slides
generated = []

for _ in trange(GEN_CHARS, desc = "Predicting chars", unit = " char"):
    # Call the model directly: for a single sample this avoids the per-call
    # overhead of model.predict. Shape (1, 1) matches the model's string Input.
    probs = model(tf.constant([[window]]), training = False).numpy()[0]

    # Pick the most likely *real* character. Emitting a reserved token would
    # append '' or the five-character '[UNK]' and change the window length.
    pred_char_id = RESERVED_IDS + int(probs[RESERVED_IDS:].argmax())
    pred_char = vocab[pred_char_id]

    generated.append(pred_char)
    window = window[1:] + pred_char

pred_output = ''.join(generated)
print(f"Output:\n{pred_output}")

Input:
. There must have been several in it, and they must have been men
of resource and determination. Their papers they mean to have, be the
holder of them who it may. In this way you see K. K. K. ceases to be
the initials of an individual and becomes the badge of a society.”

“But of what society?”

“Have you never—” said Sherlock Holmes, bending forward and sinking his
voice—“have you never heard of the Ku Klux Klan?”

“I never have.”

Holmes turned over the leaves of the book upon his knee. “Here it is,”
said


Predicting chars:   0%|          | 0/100 [00:00<?, ? char/s]

Output:
 Holmes; “Which I was may the prack, which I was sucked the project Gutenbed.”

“And the Carmer.”

“
CPU times: user 29.4 s, sys: 2.3 s, total: 31.7 s
Wall time: 30.4 s


In [20]:
# Persist the trained model (including the embedded vectorizer layer) under MODELS_DIR.
model.save(filepath = MODELS_DIR)



INFO:tensorflow:Assets written to: models/assets


INFO:tensorflow:Assets written to: models/assets
