## Creating An AI-Based JFK Speech Writer: Part 2

In [1]:
import numpy as np
import tensorflow as tf 
tf.compat.v1.logging.set_verbosity('ERROR')
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
from google.oauth2 import service_account
from google.cloud import storage
from google.cloud.exceptions import Conflict

credentials = service_account.Credentials.from_service_account_file('credentials.json')

client = storage.Client(project=credentials.project_id,
                        credentials=credentials)

In [3]:
bucket = client.get_bucket("harmon-kennedy")

In [4]:
blob = bucket.blob("all_jfk_speeches.txt")

In [5]:
text = blob.download_as_text()

In [6]:
print(f'Length of text: {len(text)} characters')

Length of text: 7734579 characters


In [7]:
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

67 unique characters


In [8]:
text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']

In [9]:
print(f'Length of text: {len(text_in_words)} words')

Length of text: 1338872 words


In [10]:
print(f"{len(set(text_in_words))} unique words")

42240 unique words


In [11]:
import string
 
# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [12]:
clean_words = np.array(clean_doc(text))

In [13]:
clean_text = " ".join(clean_words)

In [14]:
clean_text[:300]

'of particular importance to south dakota are the farm policies of the republican party the party of benson nixon and mundt the party which offers our young people no incentive to return to the farm which offers the farmer only the prospect of lower and lower income and which offers the nation the vi'

In [15]:
print(f"{len(clean_words)} number of clean words")

1322685 number of clean words


In [16]:
print(f"{len(set(clean_words))} unique clean words")

22681 unique clean words


In [17]:
len(clean_text)

7533442

In [18]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size = 10000
seq_length = 20

vectorize_layer = TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=vocab_size,
    output_mode="int",
    pad_to_max_tokens=True,
    output_sequence_length=seq_length,
)

Metal device set to: Apple M1


2023-04-05 16:00:15.343286: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-05 16:00:15.343452: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [19]:
vectorize_layer.adapt([text])

2023-04-05 16:00:16.178013: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-05 16:00:16.215535: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [20]:
voc = vectorize_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [21]:
word_index['of']

3

In [22]:
word_index['particular']

758

In [23]:
words_seq = [clean_words[i:i + seq_length] for i in range(0, len(clean_words) - seq_length-1)]
next_word = [clean_words[i + seq_length] for i in range(0, len(clean_words) - seq_length-1)]

In [24]:
len(words_seq)

1322664

In [25]:
next_cat = np.array([word_index.get(word, 1) for word in next_word])

In [26]:
next_cat.shape

(1322664,)

In [27]:
X = list()
y = list()

for i in range(len(next_word)):
    if next_cat[i]  != 1:
        X.append(" ".join(words_seq[i]))
        y.append(next_cat[i])

In [28]:
y = tf.keras.utils.to_categorical(y)

In [29]:
X = np.array(X)

In [30]:
y.shape

(1299332, 10000)

In [31]:
X.shape

(1299332,)

In [32]:
X[0]

'of particular importance to south dakota are the farm policies of the republican party the party of benson nixon and'

In [33]:
vectorize_layer.call(X[0])

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([   3,  758,  692,    5,  430, 2268,   16,    2,  156,  280,    3,
          2,  152,   68,    2,   68,    3,  756,  193,    4])>

In [34]:
next_word[:2]

['mundt', 'the']

In [35]:
word_index['the']

2

In [36]:
word_index['nation']

92

In [37]:
word_index['[UNK]']

1

In [38]:
y[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [39]:
np.argmax(y[1])

2

In [40]:
y.shape

(1299332, 10000)

In [41]:
X.shape

(1299332,)

In [42]:
y.shape[1]

10000

In [43]:
y.shape

(1299332, 10000)

https://stackoverflow.com/questions/44273249/in-keras-what-exactly-am-i-configuring-when-i-create-a-stateful-lstm-layer-wi

In [44]:
embedding_dim = 128

model = tf.keras.models.Sequential([
                tf.keras.Input(shape=(1,), 
                               dtype=tf.string, 
                               name='text'),
                vectorize_layer,
                tf.keras.layers.Embedding(vocab_size, embedding_dim),
                tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
                tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

In [45]:
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 20)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 20, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              74496     
 l)                                                              
                                                                 
 dense (Dense)               (None, 10000)             1290000   
                                                                 
Total params: 2,644,496
Trainable params: 2,644,496
Non-trainable params: 0
_________________________________________________________________


In [46]:
from sklearn.utils import shuffle

In [49]:
X_train, y_train = X[:100000], y[:100000]

In [50]:
X_train, y_train = shuffle(X_train, y_train)

In [51]:
X_train[0]

'this growing deterioration of our natural resources despite the pressures of population growth we have in the past several years'

In [52]:
model.fit(X_train, y_train, epochs =20, batch_size=128, validation_split=0.2)

Epoch 1/20


2023-04-05 16:01:06.642846: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:06.854292: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:06.863950: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:07.038112: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:07.051988: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-04-05 16:01:24.646652: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:24.715426: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:01:24.722140: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2d79b94c0>

In [59]:
test = str(X[100020])

In [60]:
y_pred = model.predict([test])

2023-04-05 16:08:04.225998: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:08:04.290177: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-05 16:08:04.297113: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [61]:
reverse_word_map = dict(map(reversed, word_index.items()))

In [62]:
reverse_word_map[np.argmax(y_pred[0])]

'going'

In [98]:
# reverse_word_map

In [66]:
def next_tokens(input_str, n):
    print ("Seed -",  input_str, sep = '\n\n')
    final_string = ''
    for i in range(n):
        prediction = model.predict([input_str], verbose=0)
        final_string = final_string + reverse_word_map[np.argmax(prediction[0])] + ' ' 
        input_str = input_str + ' ' + reverse_word_map[np.argmax(prediction[0])]
        input_str = ' '.join(input_str.split(' ')[1:])
    return final_string

In [67]:
next_tokens(test, 5)

Seed -

indifference where franklin roosevelt opened new horizons this administration sets ceilings where roosevelt urged a spirit of selfsacrifice we are


'going to rebuild the disabled '

In [70]:
next_tokens("ask not what your country can do for you ask", 3)

Seed -

ask not what your country can do for you ask


'the and which '