In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint

In [3]:
# Load the raw corpus. The context manager guarantees the handle is closed even
# if read() raises, which the manual open()/close() pair did not.
# NOTE(review): the platform default encoding is still used, as before — confirm
# the file's encoding and pass encoding=... explicitly if it is known.
with open('Ancient_Modern_Physics.txt', 'r') as file:
    text = file.read()
# Preview the first 1000 characters.
text[:1000]

'ANCIENT AND MODERN PHYSICS\n\nby Thomas E. Willson\n\n\n\nContents\n\nPreface\nI.     Physical Basis of Metaphysics\nII.    The Two Kinds of Perception\nIII.   Matter and Ether\nIV.    What a Teacher Should Teach\nV.     The Four Manifested Planes\nVI.    One Place on Earth\nVII.   The Four Globes\nVIII.  The Battle Ground\nIX.    The Dual Man\nX.     The Septenary World\nXI.    Stumbling blocks in Eastern Physics\n\n\n\n\nPREFACE\n\n\nThe Editor of the Theosophical Forum in April, 1901, noted the\ndeath of Mr. Thomas E. Willson in the previous month in an\narticle which we reproduce for the reason that we believe many\nreaders who have been following the chapters of "Ancient and\nModern Physics" during the last year will like to know something\nof the author.  In these paragraphs is said all that need be said\nof one of our most devoted and understanding Theosophists.\n\nIn March, 1901, The Theosophical Forum lost one of its most\nwilling and unfailing contributors.  Mr. T.E. Willson

# Cleaning Text

In [4]:
# Lower-case the corpus so upper/lower variants of a letter share one symbol.
tokens = text.lower()
print(tokens[:500])
# Corpus statistics: total character count and number of distinct characters.
n_chars = len(tokens)
unique_vocab = len(set(tokens))
print('Total Tokens: %d' % n_chars)
print('Unique Tokens: %d' % unique_vocab)
# Sorted vocabulary: a character's position in this list is its integer id.
characters = sorted(set(tokens))
n_vocab = len(characters)
print('N Vocab:', n_vocab)
# Lookup tables in both directions (id -> character, character -> id).
int_to_char = dict(enumerate(characters))
char_to_int = {char: idx for idx, char in enumerate(characters)}

ancient and modern physics

by thomas e. willson



contents

preface
i.     physical basis of metaphysics
ii.    the two kinds of perception
iii.   matter and ether
iv.    what a teacher should teach
v.     the four manifested planes
vi.    one place on earth
vii.   the four globes
viii.  the battle ground
ix.    the dual man
x.     the septenary world
xi.    stumbling blocks in eastern physics




preface


the editor of the theosophical forum in april, 1901, noted the
death of mr. thomas e. w
Total Tokens: 126361
Unique Tokens: 51
N Vocab: 51


# Creating Dataset

In [5]:
# Length (in characters) of each input window.
seq_length = 100
# Encode the whole corpus once; the original loop re-encoded every character
# for each of the (up to seq_length) windows that contain it.
encoded = [char_to_int[char] for char in tokens]
# X: every run of seq_length consecutive ids, sliding one character at a time.
X = [encoded[start:start + seq_length] for start in range(n_chars - seq_length)]
# y: the id of the character that immediately follows each window.
y = encoded[seq_length:n_chars]

In [6]:
# Total number of characters in the lower-cased corpus.
len(tokens)

126361

In [7]:
# Number of input windows and number of targets; the two should be equal.
len(X),len(y)

(126261, 126261)

In [8]:
# First input window, encoded as integer character ids.
print(X[0])

[25, 38, 27, 33, 29, 38, 44, 1, 25, 38, 28, 1, 37, 39, 28, 29, 42, 38, 1, 40, 32, 49, 43, 33, 27, 43, 0, 0, 26, 49, 1, 44, 32, 39, 37, 25, 43, 1, 29, 8, 1, 47, 33, 36, 36, 43, 39, 38, 0, 0, 0, 0, 27, 39, 38, 44, 29, 38, 44, 43, 0, 0, 40, 42, 29, 30, 25, 27, 29, 0, 33, 8, 1, 1, 1, 1, 1, 40, 32, 49, 43, 33, 27, 25, 36, 1, 26, 25, 43, 33, 43, 1, 39, 30, 1, 37, 29, 44, 25, 40]


In [9]:
# Integer id of the character that follows the first window.
print(y[0])

32


In [10]:
# Arrange the integer windows as (samples, time steps, features) with a single
# feature per time step.
X_new = np.reshape(X, (len(X), seq_length, 1))
# Ids run from 0 to n_vocab - 1, so dividing by n_vocab scales them into [0, 1).
X_new = X_new / float(n_vocab)
# One-hot encode the targets. num_classes is pinned to the vocabulary size:
# without it the width is inferred from the largest id present in y, which
# would be narrower than n_vocab whenever the highest-id character never
# occurs as a target.
y_new = to_categorical(y, num_classes=n_vocab)

In [11]:
# Sanity-check the tensor shapes before building the model.
print(f"X_new shape: {X_new.shape}")
print(f"y_new shape: {y_new.shape}")

X_new shape: (126261, 100, 1)
y_new shape: (126261, 51)


In [12]:
# One-hot target vector for the first window.
y_new[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

# Creating Model

In [13]:
# Three stacked 350-unit LSTM layers, each followed by 50% dropout. The first
# two return the full sequence so the next LSTM receives one vector per time
# step; the last returns only its final state, which feeds a softmax layer
# with one unit per one-hot target class.
model = Sequential([
    LSTM(350, input_shape=X_new.shape[1:], return_sequences=True),
    Dropout(0.5),
    LSTM(350, return_sequences=True),
    Dropout(0.5),
    LSTM(350),
    Dropout(0.5),
    Dense(y_new.shape[1], activation='softmax'),
])

In [14]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 350)          492800    
_________________________________________________________________
dropout (Dropout)            (None, 100, 350)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 350)          981400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 350)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 350)               981400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 350)               0         
_________________________________________________________________
dense (Dense)                (None, 51)                1

In [15]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; Adam is used as the optimizer. No extra metrics are tracked.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [16]:
# Train on the full dataset for 5 epochs in mini-batches of 64.
# No validation data is passed, so only the training loss is reported.
model.fit(X_new, y_new, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8870418710>

In [17]:
# model.save('../data/text_generation/text_generation_model.h5')
# model_ = load_model('../data/text_generation/text_generation_model.h5')

# Testing

In [18]:
# Pick a random training window as the generation seed. np.random.randint
# excludes `high`, so len(X) (rather than len(X) - 1) allows every window,
# including the last one, to be drawn.
ini = np.random.randint(0, len(X))
# Copy the window so that any later in-place edit of token_string cannot alter
# the row stored in X.
token_string = list(X[ini])

In [19]:
# Show the sampled start index together with its window of integer character ids.
ini,X[ini]

(2333,
 [29,
  1,
  44,
  32,
  29,
  1,
  42,
  39,
  39,
  30,
  43,
  1,
  39,
  30,
  1,
  44,
  32,
  29,
  1,
  44,
  25,
  36,
  36,
  29,
  43,
  44,
  1,
  32,
  39,
  45,
  43,
  29,
  43,
  1,
  33,
  38,
  1,
  44,
  32,
  29,
  0,
  28,
  33,
  43,
  44,
  42,
  33,
  27,
  44,
  8,
  1,
  1,
  44,
  32,
  29,
  42,
  29,
  1,
  32,
  29,
  1,
  43,
  25,
  44,
  1,
  25,
  44,
  1,
  32,
  33,
  43,
  1,
  28,
  29,
  43,
  35,
  6,
  1,
  31,
  29,
  38,
  29,
  42,
  25,
  36,
  36,
  49,
  1,
  33,
  38,
  1,
  32,
  33,
  43,
  1,
  43,
  32,
  33,
  42,
  44])

In [20]:
# Decode the seed window back into characters. This stays a list (not a str)
# so that further characters can be appended to it.
complete_string = [int_to_char[token_id] for token_id in token_string]
# Show the seed text wrapped in double quotes.
seed_text = ''.join(complete_string)
print('"', seed_text, '"')

" e the roofs of the tallest houses in the
district.  there he sat at his desk, generally in his shirt "


In [21]:
# Greedy autoregressive generation of 500 characters: predict the next
# character from the current window, record it, then slide the window forward
# by one position.
for i in range(500):
    # Same layout and scaling as the training inputs: (1, time steps, 1), ids / n_vocab.
    x = np.reshape(token_string, (1, len(token_string), 1))
    x = x / float(n_vocab)

    prediction = model.predict(x, verbose=0)

    # Greedy choice: the class with the highest predicted probability. Cast to
    # a plain int so the window remains a list of Python ints.
    id_pred = int(np.argmax(prediction))

    complete_string.append(int_to_char[id_pred])

    # Build a NEW window (drop the oldest id, add the prediction) instead of
    # appending in place: the incoming list may be shared with the dataset X,
    # and an in-place append would lengthen that training row.
    token_string = token_string[1:] + [id_pred]

In [22]:
# Show Text
# Show Text
# Join once instead of concatenating character by character, and use a
# dedicated name so the corpus held in `text` is not overwritten by the
# generated sample (which would break a re-run of the earlier cells).
generated_text = ''.join(complete_string)
print(generated_text)

e the roofs of the tallest houses in the
district.  there he sat at his desk, generally in his shirt                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    


Ou seja, o modelo não ficou muito bom...