In [1]:
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from random import randint

In [2]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed even if read() raises; the name `file` stays bound (to the
# now-closed handle) in case a later cell refers to it.
with open('../data/Ancient_Modern_Physics.txt', 'r') as file:
    text = file.read()
text[:40]

'ANCIENT AND MODERN PHYSICS\n\nby Thomas E.'

In [3]:
# Fold the corpus to lower case so upper- and lower-case forms of a letter
# map to the same symbol, then preview the first 40 characters.
tokens = str.lower(text)
tokens[0:40]

'ancient and modern physics\n\nby thomas e.'

In [4]:
# Corpus statistics: total number of characters and number of distinct ones.
n_chars = len(tokens)
unique_vocab = len(set(tokens))
print(
    f'Total Tokens: {n_chars}',
    f'Unique Tokens: {unique_vocab}',
    sep='\n',
)

Total Tokens: 126361
Unique Tokens: 51


In [5]:
# Distinct characters of the corpus in sorted order; sorted() already returns
# a list, so no intermediate list is needed.
characters = sorted(set(tokens))
n_vocab = len(characters)
n_vocab

51

In [6]:
# Two-way lookup tables between characters and integer ids; a character's id
# is its index in `characters`.
int_to_char = dict(enumerate(characters))
char_to_int = {char: idx for idx, char in int_to_char.items()}
char_to_int

{'\n': 0,
 ' ': 1,
 '"': 2,
 "'": 3,
 '(': 4,
 ')': 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 ':': 19,
 ';': 20,
 '?': 21,
 '[': 22,
 ']': 23,
 '`': 24,
 'a': 25,
 'b': 26,
 'c': 27,
 'd': 28,
 'e': 29,
 'f': 30,
 'g': 31,
 'h': 32,
 'i': 33,
 'j': 34,
 'k': 35,
 'l': 36,
 'm': 37,
 'n': 38,
 'o': 39,
 'p': 40,
 'q': 41,
 'r': 42,
 's': 43,
 't': 44,
 'u': 45,
 'v': 46,
 'w': 47,
 'x': 48,
 'y': 49,
 'z': 50}

In [7]:
# Build the supervised training set: every window of `seq_length` consecutive
# characters is one input, and the character right after the window is its target.
seq_length = 100

# Encode the corpus once. The previous loop re-encoded each window from
# scratch, looking every character up roughly `seq_length` times.
encoded = [char_to_int[char] for char in tokens]

# Windows advance one character at a time. X stays a list of integer lists and
# y a list of integers, with exactly the same contents as before.
X = [encoded[i:i + seq_length] for i in range(n_chars - seq_length)]
y = encoded[seq_length:n_chars]

In [8]:
# Sanity check: integer id of the first target, i.e. the character that
# immediately follows the first `seq_length`-character window.
y[0]

32

In [9]:
# Shape the inputs as (samples, time steps, features), the layout the first
# LSTM layer is configured with, then scale the integer ids into [0, 1) by
# dividing by the vocabulary size.
X_new = np.reshape(X, (len(X), seq_length, 1))
X_new = X_new / float(n_vocab)

# One-hot encode the targets. num_classes is pinned to the vocabulary size:
# left to its default, to_categorical infers the width from max(y) + 1, which
# would come out too narrow if the highest-id character never occurs as a target.
y_new = to_categorical(y, num_classes=n_vocab)

In [10]:
# Report the dimensions of the model inputs and the one-hot targets.
for label, arr in (("X_new shape:", X_new), ("y_new shape:", y_new)):
    print(label, arr.shape)

X_new shape: (126261, 100, 1)
y_new shape: (126261, 51)


In [11]:
# Hyperparameters shared by the model-definition and training cells.
# Width (number of units) used for each LSTM layer.
lstm_units = 700
# Rate given to each Dropout layer.
drop_percent = 0.3
# Number of epochs passed to model.fit.
n_epochs = 10

In [12]:
# Three stacked LSTM layers, each followed by dropout, feeding a softmax layer
# with one output per column of y_new. The first two LSTMs return the full
# sequence so the next LSTM receives one vector per time step; the last one
# returns only its final output.
window_shape = (X_new.shape[1], X_new.shape[2])  # (time steps, features)
model = Sequential([
    LSTM(lstm_units, input_shape=window_shape, return_sequences=True),
    Dropout(drop_percent),
    LSTM(lstm_units, return_sequences=True),
    Dropout(drop_percent),
    LSTM(lstm_units),
    Dropout(drop_percent),
    Dense(y_new.shape[1], activation='softmax'),
])

In [13]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 700)          1965600   
_________________________________________________________________
dropout (Dropout)            (None, 100, 700)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 700)          3922800   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 700)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 700)               3922800   
_________________________________________________________________
dropout_2 (Dropout)          (None, 700)               0         
_________________________________________________________________
dense (Dense)                (None, 51)                35751     

In [14]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; the optimizer is selected by name ('adam').
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [15]:
%%time
# Train for n_epochs epochs in mini-batches of 32; %%time reports how long the cell took.
# NOTE(review): the recorded run was interrupted by hand during epoch 1 with an
# ETA of over 7 hours, so no trained weights came out of this cell as saved.
model.fit(X_new, y_new, batch_size=32, epochs=n_epochs)

Epoch 1/10
  55/3946 [..............................] - ETA: 7:19:28 - loss: 3.1220

KeyboardInterrupt: 