# Text Generation with a Character-Level LSTM

In [2]:
import tensorflow as tf

In [4]:
# Display the TensorFlow version in use so the run can be reproduced later.
tf.__version__

'2.6.0'

In [5]:
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint

In [8]:
# Read the whole corpus into memory. The context manager closes the file even
# if read() raises, and the explicit encoding keeps decoding independent of
# the platform's default locale.
with open('./Ancient_Modern_Physics.txt', 'r', encoding='utf-8') as file:
    text = file.read()

## Cleaning the text (lowercasing)

In [9]:
# Lowercase the corpus so upper- and lowercase letters share one symbol.
# Despite its name, `tokens` is a single string; later cells iterate over it
# character by character.
tokens = text.lower()
# Preview the first 500 characters of the normalized text.
print(tokens[:500])

ancient and modern physics

by thomas e. willson



contents

preface
i.     physical basis of metaphysics
ii.    the two kinds of perception
iii.   matter and ether
iv.    what a teacher should teach
v.     the four manifested planes
vi.    one place on earth
vii.   the four globes
viii.  the battle ground
ix.    the dual man
x.     the septenary world
xi.    stumbling blocks in eastern physics




preface


the editor of the theosophical forum in april, 1901, noted the
death of mr. thomas e. w


In [10]:
# Corpus length in characters and the number of distinct characters in it.
n_chars, unique_vocab = len(tokens), len(set(tokens))
print('Total Tokens: %d' % n_chars)
print('Unique Tokens: %d' % unique_vocab)

Total Tokens: 126361
Unique Tokens: 51


In [11]:
# Inspect the distinct characters that occur in the lowercased corpus.
set(tokens)

{'\n',
 ' ',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '[',
 ']',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [12]:
# Sets are unordered, so sort the distinct characters to get a deterministic
# vocabulary order; a character's position in this list becomes its integer id.
characters = sorted(set(tokens))
n_vocab = len(characters)
n_vocab

51

In [13]:
# Two-way lookup tables between characters and their integer ids.
# `characters` holds unique entries, so inverting the first map is lossless.
int_to_char = dict(enumerate(characters))
char_to_int = {char: idx for idx, char in int_to_char.items()}

## Creating datasets:

In [14]:
# Build (input window, next character) training pairs with a stride of one.
seq_length = 100  # number of context characters per training example

# Encode the corpus once up front instead of re-encoding each character for
# every window that contains it.
encoded = [char_to_int[char] for char in tokens]

# X[i] holds the ids of tokens[i : i + seq_length]; y[i] is the id of the
# character immediately after that window. Each slice is an independent list.
X = [encoded[i:i + seq_length] for i in range(n_chars - seq_length)]
y = encoded[seq_length:n_chars]

In [15]:
# First training window: the integer ids of the first seq_length characters.
print(X[0])

[25, 38, 27, 33, 29, 38, 44, 1, 25, 38, 28, 1, 37, 39, 28, 29, 42, 38, 1, 40, 32, 49, 43, 33, 27, 43, 0, 0, 26, 49, 1, 44, 32, 39, 37, 25, 43, 1, 29, 8, 1, 47, 33, 36, 36, 43, 39, 38, 0, 0, 0, 0, 27, 39, 38, 44, 29, 38, 44, 43, 0, 0, 40, 42, 29, 30, 25, 27, 29, 0, 33, 8, 1, 1, 1, 1, 1, 40, 32, 49, 43, 33, 27, 25, 36, 1, 26, 25, 43, 33, 43, 1, 39, 30, 1, 37, 29, 44, 25, 40]


In [16]:
# Target for the first window: the id of the character that follows it.
print(y[0])

32


In [17]:
# Arrange the inputs as (samples, time steps, features), the layout the LSTM
# layers are configured for; each time step carries a single feature.
X_new = np.reshape(X, (len(X), seq_length, 1))
# Ids lie in 0..n_vocab-1, so dividing by n_vocab scales them into [0, 1).
X_new = X_new / float(n_vocab)

# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the output width always matches int_to_char; without it the width is
# inferred from max(y) + 1 and shrinks if the highest id never occurs as a
# target.
y_new = to_categorical(y, num_classes=n_vocab)

In [18]:
# Sanity-check the array shapes before building the model.
print("X_new shape:", X_new.shape)
print("y_new shape:", y_new.shape)

X_new shape: (126261, 100, 1)
y_new shape: (126261, 51)


In [19]:
# One-hot row for the first target: a single 1 at the target character's id.
y_new[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

## Creating the model:

In [20]:
# Three stacked 700-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per one-hot target column.
model = Sequential([
    LSTM(700, input_shape=(X_new.shape[1], X_new.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(700, return_sequences=True),
    Dropout(0.2),
    # No return_sequences here: only the last time step's output is passed on.
    LSTM(700),
    Dropout(0.2),
    Dense(y_new.shape[1], activation='softmax'),
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 700)          1965600   
_________________________________________________________________
dropout (Dropout)            (None, 100, 700)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 700)          3922800   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 700)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 700)               3922800   
_________________________________________________________________
dropout_2 (Dropout)          (None, 700)               0         
_________________________________________________________________
dense (Dense)                (None, 51)                35751     

In [22]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# the optimizer is Adam, selected by name.
model.compile(loss='categorical_crossentropy', optimizer='adam')

## Training the model:

In [23]:
%%time
# Train for 5 epochs with mini-batches of 64. This is the expensive step of
# the notebook; the recorded run below reports roughly 51 minutes of wall time.
model.fit(X_new, y_new, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 36min 35s, sys: 40.1 s, total: 37min 15s
Wall time: 51min 19s


<keras.callbacks.History at 0x7f142e09b450>

## Save the model:

In [24]:
# Persist the trained model; the .h5 suffix selects the single-file HDF5 format.
model.save('../data/text_generation/text_generation_model.h5')

## Load Model:

In [25]:
# Reload the saved model under a separate name (`model_`); the generation
# cells further down predict with this reloaded copy.
model_ = load_model('../data/text_generation/text_generation_model.h5')

## Testing a random sample:

In [26]:
# Pick a random training window as the generation seed.
# np.random.randint excludes its upper bound, so passing len(X) makes every
# window (including the last one) selectable and also works when len(X) == 1.
ini = np.random.randint(0, len(X))
# Copy the window so later edits to token_string cannot modify the training
# data stored in X.
token_string = list(X[ini])

In [27]:
# Decode the seed ids back to characters; generated characters are appended to
# this list later on.
complete_string = [int_to_char[idx] for idx in token_string]

# Show the seed text wrapped in quotes.
seed_text = ''.join(complete_string)
print ("\"", seed_text, "\"")

" he kinetic belt of the manasic globe
have reached the pranic.  only a few of those within the pranic "


In [None]:
# Generate 500 characters greedily: predict the next character from the
# current window, record it, then slide the window forward by one position.
# NOTE: re-running this cell continues from the current window and appends
# another 500 characters to complete_string.
for step in range(500):
    # Same preprocessing as the training inputs: (1, time steps, 1), scaled
    # by the vocabulary size.
    x = np.reshape(token_string, (1, len(token_string), 1))
    x = x / float(n_vocab)

    prediction = model_.predict(x, verbose=0)

    # Greedy decoding: take the most probable class; cast to a plain int so
    # the window keeps holding Python integers.
    id_pred = int(np.argmax(prediction))

    complete_string.append(int_to_char[id_pred])

    # Build a new window (drop the oldest id, add the prediction) rather than
    # appending in place, so a list shared with X is never mutated.
    token_string = list(token_string[1:]) + [id_pred]

## Showing the generated text:

In [None]:
# Join the seed and generated characters in one pass instead of growing a
# string one character at a time.
# NOTE: this rebinds `text`, which held the raw corpus earlier in the notebook.
text = "".join(complete_string)
print(text)