In [1]:
import json
from pathlib import Path
from collections import Counter

import numpy as np
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, LSTM, Embedding, Dropout, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.train import Checkpoint, CheckpointManager

from model_utility import get_train_val_data, generate_text

In [2]:
# Only let TensorFlow's Python logger emit records at ERROR level or above.
tf.get_logger().setLevel('ERROR')

gpu_devices = tf.config.list_physical_devices('GPU')

# Guard the lookup: on a machine without a visible GPU the list is empty and
# indexing it with [0] would raise IndexError and abort "Run All".
(
    tf.config.experimental.get_device_details(gpu_devices[0])
    if gpu_devices
    else "No GPU detected; TensorFlow will run on the CPU"
)

2022-07-31 16:20:52.512874: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 16:20:52.545029: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 16:20:52.545364: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 16:20:52.546230: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


{'compute_capability': (7, 5), 'device_name': 'NVIDIA GeForce GTX 1650'}

In [3]:
# Shell escape: print the driver / CUDA versions and current GPU memory usage.
!nvidia-smi

Sun Jul 31 16:20:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 516.59       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   51C    P0    12W /  N/A |      0MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# --- Reproducibility -------------------------------------------------------
RANDOM_STATE = 7
# Seed Python's `random`, NumPy and TensorFlow in one call so that sample
# selection, dataset shuffling and weight initialisation repeat across
# "Restart Kernel -> Run All" executions.
tf.keras.utils.set_random_seed(RANDOM_STATE)

# --- Data ------------------------------------------------------------------
SEQ_LEN = 512                       # passed as `seq_len` to get_train_val_data
VAL_SIZE = 0.05                     # passed as `val_size` to get_train_val_data
BATCH_SIZE = 128
SHUFFLE_BUFFER = BATCH_SIZE * 20    # tf.data shuffle buffer, in samples

# --- Model / optimisation --------------------------------------------------
EMBEDDING_DIM = 32
DROPOUT_RATIO = 0.2
LR = 1e-3                           # Adam learning rate

# --- Training loop ---------------------------------------------------------
EPOCHS = 10                         # epochs to run on top of the restored epoch
TRAIN_STEPS = 4000                  # steps_per_epoch (training dataset repeats)
EARLY_STOP_PATIENCE = 5

# --- Text generation -------------------------------------------------------
CHARS_TO_PREDICT = 256

# --- Artefact locations, one sub-directory per model version ---------------
MODEL_IDENTIFIER = "V1"

TB_LOGS = Path("tb_logs/" + MODEL_IDENTIFIER)
MODELS_DIR = Path("models/" + MODEL_IDENTIFIER)
CKPT_DIR = Path("ckpt/" + MODEL_IDENTIFIER)

for _artifact_dir in (TB_LOGS, MODELS_DIR, CKPT_DIR):
    _artifact_dir.mkdir(exist_ok = True, parents = True)

In [5]:
# Arguments for the project helper that builds (input text, next character)
# pairs and splits them into training and validation parts.
split_kwargs = dict(
    book_dir = Path("../Data/Text/Sherlock_Holmes/"),
    file_pat = "*.txt",
    seq_len = SEQ_LEN,
    val_size = VAL_SIZE,
    random_state = RANDOM_STATE,
)
X_train, X_val, y_train, y_val = get_train_val_data(**split_kwargs)

# Inputs and targets of each split must have matching lengths.
tuple(len(part) for part in (X_train, y_train, X_val, y_val))

(1647675, 1647675, 86720, 86720)

In [6]:
# Eyeball five randomly chosen (input window, next character) training pairs.
print("Sample Training data\n")

train_sample_ids = np.random.randint(len(X_train), size = 5)
for sample_id in train_sample_ids:
    window, next_char = X_train[sample_id], y_train[sample_id]
    print(f"Input: {window!r}")
    print(f"Output: {next_char!r}\n")

Sample Training data

Input: 't to my illustrious client. “‘You doubt its value?’ he asked. “‘Not at all. I only doubt—’ “‘The propriety of my leaving it. You may set your mind at rest about that. I should not dream of doing so were it not absolutely certain that I should be able in four days to reclaim it. It is a pure matter of form. Is the security sufficient?’ “‘Ample.’ “‘You understand, Mr. Holder, that I am giving you a strong proof of the confidence which I have in you, founded upon all that I have heard of you. I rely upon you n'
Output: 'o'

Input: 'uble-edged weapon now. The chances are that she would be as averse to its being seen by Mr. Godfrey Norton, as our client is to its coming to the eyes of his princess. Now the question is, Where are we to find the photograph?” “Where, indeed?” “It is most unlikely that she carries it about with her. It is cabinet size. Too large for easy concealment about a woman’s dress. She knows that the King is capable of having her waylaid and

In [7]:
# Eyeball five randomly chosen (input window, next character) validation pairs.
print("Sample Validation data\n")

val_sample_ids = np.random.randint(len(X_val), size = 5)
for sample_id in val_sample_ids:
    window, next_char = X_val[sample_id], y_val[sample_id]
    print(f"Input: {window!r}")
    print(f"Output: {next_char!r}\n")

Sample Validation data

Input: 'pology to that noble lad, your son, who has carried himself in this matter as I should be proud to see my own son do, should I ever chance to have one.” “Then it was not Arthur who took them?” “I told you yesterday, and I repeat to-day, that it was not.” “You are sure of it! Then let us hurry to him at once to let him know that the truth is known.” “He knows it already. When I had cleared it all up I had an interview with him, and finding that he would not tell me the story, I told it to him, on which he ha'
Output: 'd'

Input: 'as wonderfully like a tiger himself. “I wonder that my very simple stratagem could deceive so old a _shikari_,” said Holmes. “It must be very familiar to you. Have you not tethered a young kid under a tree, lain above it with your rifle, and waited for the bait to bring up your tiger? This empty house is my tree, and you are my tiger. You have possibly had other guns in reserve in case there should be several tigers, or in the un

In [8]:
# Tally how often each character occurs as a training target; these counts
# feed the class-weight computation later in the notebook.
char_freq_dict = Counter()
char_freq_dict.update(y_train)
char_freq_dict.most_common(n = 20)

[(' ', 304968),
 ('e', 156016),
 ('t', 112813),
 ('a', 101601),
 ('o', 98866),
 ('n', 83866),
 ('h', 80282),
 ('i', 77695),
 ('s', 77640),
 ('r', 72685),
 ('d', 53723),
 ('l', 48901),
 ('u', 38652),
 ('m', 32673),
 ('c', 31331),
 ('w', 30771),
 ('f', 26089),
 ('y', 25704),
 ('g', 22231),
 (',', 21892)]

In [9]:
%%time

# Character-level vectorizer: no text standardisation, one token per character.
vectorizer = TextVectorization(standardize = None, split = "character", name = 'TextVectorizer')
vocab_json = Path("vocab.json")

if not vocab_json.exists():
    # First run: learn the vocabulary from the training text and cache it.
    vectorizer.adapt(X_train)
    # Skip the first two entries ('' and '[UNK]'); the layer owns those itself.
    vocab = vectorizer.get_vocabulary()[2:]
    vocab_json.write_text(json.dumps({"vocab": vocab}))
else:
    # Later runs: reuse the cached vocabulary instead of re-adapting.
    vocab = json.loads(vocab_json.read_text())["vocab"]
    vectorizer.set_vocabulary(vocab)

# From here on `vocab` is the layer's full vocabulary, special entries included.
vocab = vectorizer.get_vocabulary()
char_count = len(vocab)
char_count

2022-07-31 16:20:57.820852: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-31 16:20:57.823018: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 16:20:57.823584: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-31 16:20:57.824009: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built witho

CPU times: user 863 ms, sys: 280 ms, total: 1.14 s
Wall time: 2.03 s


106

In [10]:
# Per-class loss weights: sqrt(total / (freq * n_classes)), rounded to 2 dp.
# Vocabulary entries never seen as a target fall back to a frequency of 1.
total_freq = sum(char_freq_dict.values())
class_weight_dict = {}

print(f'{"Class ID":12}{"Char":10}{"Freq":10}{"Class weight"}')
for class_id, char in enumerate(vocab):
    char_freq = char_freq_dict.get(char, 1)
    weight = round(np.sqrt(total_freq / (char_freq * char_count)), 2)
    class_weight_dict[class_id] = weight
    print(f"{class_id:^12}{char!r:^10}{char_freq:^10}{weight:^12}")

Class ID    Char      Freq      Class weight
     0          ''        1        124.68   
     1       '[UNK]'      1        124.68   
     2         ' '      304968      0.23    
     3         'e'      156016      0.32    
     4         't'      112813      0.37    
     5         'a'      101601      0.39    
     6         'o'      98866       0.4     
     7         'n'      83866       0.43    
     8         'h'      80282       0.44    
     9         'i'      77695       0.45    
     10        's'      77640       0.45    
     11        'r'      72685       0.46    
     12        'd'      53723       0.54    
     13        'l'      48901       0.56    
     14        'u'      38652       0.63    
     15        'm'      32673       0.69    
     16        'c'      31331       0.7     
     17        'w'      30771       0.71    
     18        'f'      26089       0.77    
     19        'y'      25704       0.78    
     20        'g'      22231       0.84    
     21   

In [11]:
# Replace the single-character targets with their integer class ids (1-D arrays).
# NOTE: this overwrites y_train / y_val, so the character-frequency cell must
# have been run before this one and this cell should not be run twice.
y_train, y_val = (
    vectorizer(char_targets).numpy().flatten()
    for char_targets in (y_train, y_val)
)
y_train.shape, y_val.shape

((1647675,), (86720,))

In [12]:
# Training pipeline: shuffle -> repeat forever -> batch -> prefetch.
# Because of repeat(), the epoch length is set by steps_per_epoch in fit().
train_ds = (
    Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(SHUFFLE_BUFFER)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
train_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [13]:
# Validation pipeline: batch -> prefetch, a single finite pass.
# No shuffle here: loss and accuracy are averaged over the whole set, so sample
# order does not matter, and a fixed order keeps every evaluate() call
# comparable while avoiding the shuffle-buffer fill.
val_ds = (
    Dataset.from_tensor_slices((X_val, y_val))
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
val_ds, val_ds.cardinality()

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 <tf.Tensor: shape=(), dtype=int64, numpy=678>)

In [14]:
def get_lstm_model(char_count: int, embedding_dim: int = 32):
    """Build and compile the next-character prediction model.

    The graph takes raw strings, converts them to character ids with the
    notebook-level `vectorizer`, embeds them, runs two stacked LSTMs and a
    small dense head, and ends in a softmax over `char_count` classes.

    Args:
        char_count: Number of output classes; also used (plus one) as the
            embedding table size.
        embedding_dim: Width of the character embedding vectors.

    Returns:
        A compiled `tf.keras.Model` (Adam with learning rate `LR`, sparse
        categorical cross-entropy loss, sparse categorical accuracy metric).

    Note:
        Reads the module-level `vectorizer`, `DROPOUT_RATIO` and `LR`.
    """
    input_layer = tf.keras.Input(shape = (1,), dtype = tf.string, name = 'Input')

    # Raw text -> character ids -> dense embeddings.
    # NOTE(review): the caller passes len(vectorizer.get_vocabulary()), which
    # already counts the '' and '[UNK]' entries, so the "+ 1" looks like one
    # spare embedding row. Kept as-is so existing checkpoints stay loadable.
    x = vectorizer(input_layer)
    x = Embedding(char_count + 1, embedding_dim, name = 'EmbeddingLayer')(x)

    # Recurrent encoder: only the second LSTM collapses the sequence dimension.
    x = LSTM(512, return_sequences = True, dropout = DROPOUT_RATIO, name = 'LSTM_1')(x)
    x = LSTM(256, dropout = DROPOUT_RATIO, name = 'LSTM_2')(x)
    x = BatchNormalization(name = 'BN_1')(x)

    # Dense head, block 1.
    x = Dense(256, name = 'Dense_1')(x)
    x = LeakyReLU(name = 'LR_1')(x)
    x = Dropout(DROPOUT_RATIO, name = 'Dropout_1')(x)

    # Dense head, block 2.
    x = Dense(128, name = 'Dense_2')(x)
    x = LeakyReLU(name = 'LR_2')(x)
    x = Dropout(DROPOUT_RATIO, name = 'Dropout_2')(x)
    x = BatchNormalization(name = 'BN_2')(x)

    # Dense head, block 3.
    x = Dense(128, name = 'Dense_3')(x)
    x = LeakyReLU(name = 'LR_3')(x)

    # One probability per character class.
    output_layer = Dense(char_count, activation = 'softmax', name = "Output")(x)

    model = tf.keras.Model(inputs = input_layer, outputs = output_layer, name = 'Text_Generation_Model')
    model.compile(
        optimizer = Adam(LR),
        loss = 'sparse_categorical_crossentropy',
        metrics = ['sparse_categorical_accuracy'],
    )
    return model

# Instantiate the network for the current vocabulary and show its layer table.
model = get_lstm_model(char_count = char_count, embedding_dim = EMBEDDING_DIM)
model.summary()

Model: "Text_Generation_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1)]               0         
                                                                 
 TextVectorizer (TextVectori  (None, None)             0         
 zation)                                                         
                                                                 
 EmbeddingLayer (Embedding)  (None, None, 32)          3424      
                                                                 
 LSTM_1 (LSTM)               (None, None, 512)         1116160   
                                                                 
 LSTM_2 (LSTM)               (None, 256)               787456    
                                                                 
 BN_1 (BatchNormalization)   (None, 256)               1024      
                                             

In [15]:
# Epoch to resume from; stays 0 unless a previous checkpoint is found.
EPOCH_START = 0

# The checkpoint tracks the model together with a counter of completed epochs.
checkpoint = Checkpoint(step = tf.Variable(EPOCH_START), model = model)
ckpt_manager = CheckpointManager(checkpoint, CKPT_DIR, max_to_keep = 3)

# The "checkpoint" index file only exists once something has been saved here.
ckpt_index_file = CKPT_DIR / "checkpoint"
if ckpt_index_file.exists():
    checkpoint.restore(ckpt_manager.latest_checkpoint)
    EPOCH_START = checkpoint.step.numpy()

print(f"Starting training from Epoch {EPOCH_START}")

Starting training from Epoch 0


In [16]:
# Baseline before training: returns [loss, sparse_categorical_accuracy].
model.evaluate(x = val_ds)

2022-07-31 16:21:26.048809: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100




[4.663280010223389, 0.008371771313250065]

In [17]:
%%time

# Pick one random training window as the prompt; it is reused after training
# so the two generations can be compared.
sample_idx = np.random.randint(len(X_train))
sample_input = X_train[sample_idx]
print(f"Input:\n{sample_input}")

untrained_text = generate_text(model, vocab, sample_input, CHARS_TO_PREDICT)
print(f"Output:\n{untrained_text}")

Input:
ile I sat opposite to him, and we listened in silence to the strange story which our visitor detailed to us. “You must know,” said he, “that I am an orphan and a bachelor, residing alone in lodgings in London. By profession I am a hydraulic engineer, and I have had considerable experience of my work during the seven years that I was apprenticed to Venner & Matheson, the well-known firm, of Greenwich. Two years ago, having served my time, and having also come into a fair sum of money through my poor father’s


Predicting chars:   0%|          | 0/256 [00:00<?, ? char/s]

Output:
ïïï:bbbbbbbbbbbbbbb111111ïïïïjbbbbbbbbbbbbbbbbbb111111ïïïïïbbbbbbbbbbbbbbbbbb111111ïïïïjbbbbbbbbbbbbbbbbbb111111ïïïïïbbbbbbbbbbbbbbbbbb111111ïïïïjbbbbbbbbbbbbbbbbbb111111ïïïïïbbbbbbbbbbbbbbbbbb111111ïïïïjbbbbbbbbbbbbbbbbbb111111ïïïïïbbbbbbbbbbbbbbbbbb11111
CPU times: user 28.8 s, sys: 6.44 s, total: 35.2 s
Wall time: 31.4 s


In [18]:
%%time

# Stop once the monitored metric (Keras default) has not improved for
# EARLY_STOP_PATIENCE epochs; TensorBoard logs go to this model's log folder.
earlystop = EarlyStopping(patience = EARLY_STOP_PATIENCE, restore_best_weights = True)
tensorboard = TensorBoard(log_dir = str(TB_LOGS))

# Epoch numbering continues from the restored checkpoint, so the upper bound
# is "epochs already done + epochs to run now".
fit_kwargs = dict(
    validation_data = val_ds,
    initial_epoch = EPOCH_START,
    epochs = EPOCH_START + EPOCHS,
    steps_per_epoch = TRAIN_STEPS,
    class_weight = class_weight_dict,
    callbacks = [earlystop, tensorboard],
)
history = model.fit(train_ds, **fit_kwargs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 4h 41s, sys: 1h 53min 39s, total: 5h 54min 20s
Wall time: 5h 58min 30s


In [19]:
# Same validation pass after training: returns [loss, sparse_categorical_accuracy].
model.evaluate(x = val_ds)



[1.6098037958145142, 0.5175046324729919]

In [20]:
%%time

# Re-run generation on the prompt chosen before training, now with the trained model.
print(f"Input:\n{sample_input}")

trained_text = generate_text(model, vocab, sample_input, CHARS_TO_PREDICT)
print(f"Output:\n{trained_text}")

Input:
ile I sat opposite to him, and we listened in silence to the strange story which our visitor detailed to us. “You must know,” said he, “that I am an orphan and a bachelor, residing alone in lodgings in London. By profession I am a hydraulic engineer, and I have had considerable experience of my work during the seven years that I was apprenticed to Venner & Matheson, the well-known firm, of Greenwich. Two years ago, having served my time, and having also come into a fair sum of money through my poor father’s


Predicting chars:   0%|          | 0/256 [00:00<?, ? char/s]

Output:
 brown which I have been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been been b
CPU times: user 32 s, sys: 5.6 s, total: 37.6 s
Wall time: 33.8 s


In [21]:
# Store the absolute number of epochs completed so far. Setting the total with
# assign() (rather than assign_add) keeps the counter correct if this cell is
# re-run: it always ends at "start epoch + epochs actually run".
checkpoint.step.assign(int(EPOCH_START) + len(history.epoch))
ckpt_manager.save()

'ckpt/V1/ckpt-1'

In [22]:
# Export the full trained model into this model version's directory.
model.save(filepath = MODELS_DIR)

