<a href="https://colab.research.google.com/github/marcocivico/LanDyn/blob/master/LSTM_JaneAusten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io

In [3]:
text=[]
with open("jane_austen_novels_UTF.txt", "r", encoding="UTF-8") as file:
    text = file.read()

print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Corpus length: 4305272
Total chars: 79
Number of sequences: 1435078


In [3]:
# EDA: Generating WordCloud to visualize
# the text
wordcloud = WordCloud(max_font_size=50,
                      max_words=100,
                      background_color="black").generate(text)

NameError: ignored

In [None]:
# Plotting the WordCloud
plt.figure(figsize=(8, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig("WordCloud.png")
plt.show()

In [5]:
model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(chars))),
        layers.LSTM(128),
        layers.Dense(len(chars), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [6]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
epochs = 40
batch_size = 128

for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "y was convinced of his father's believin"
...Generated:  g to the rease to be all the forther at the some as she was a sister the come to see the complation to the look and any the was as so man the more to some have been a such a some all the compors as the comporion of the parting to the come to be allowed the streing to her for the thought to the some to be so dear and person and such a some to be a some to be allowing to be allowing to the reasing t

...Diversity: 0.5
...Generating with seed: "y was convinced of his father's believin"
...Generated:  g by in it all to all the one of his compertion and again to the realons would be amore of the consent to not. I conlended before the compors compless to farthing her consintance of the looks as sich was the composion of her to the exert and astence of his propess, and she the prepent was so not to have been satied and the forther it conded to this more the mort

  preds = np.log(preds) / temperature


...Generated:   day. Mus again he could even believed, and he prepunsed to be dines out of a evils but autton, and Edmund modaicestiche and with a smarronul light. Thismea!-yes. "Can dear of releasedome done every body's superior. Agend immediate had been in judgming that she did  softened." "Mill"y else. But from theirsphar. The crue"d intimate stupid bosh a.vight, happiness.exhorted, to Kitty than the matter f


Generating text after epoch: 26
...Diversity: 0.2
...Generating with seed: "iged to give up the point and submit. He"
...Generated:   had been to see him to be a continuance of the mother and disposed to his father and a most complete and seeing it with a complete thing, and she was a very subject of the present and a moment, and the strong way of the property of the side of the present and his sister and the subject, and so much as to the greatest strong to her in the same to the subject of her own part of the affectionate and

...Diversity: 0.5
...Generating with seed: "ige

In [None]:
start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.1, 0.3, 0.5, 0.8, 1.0, 1.2, 1.5, 2]:
    print("...Diversity:", diversity)

    generated = ""
    sentence = text[start_index : start_index + maxlen]
    print('...Generating with seed: "' + sentence + '"')

    for i in range(2000000):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        sentence = sentence[1:] + next_char
        generated += next_char

    with open(f'sample_lstm_char_t={diversity}.txt', 'w') as f:
      print(generated, file=f)

...Diversity: 0.1
...Generating with seed: "ers!" "They will not have much cause of "
