<a href="https://colab.research.google.com/github/pfelesova/dat.veda/blob/main/Shakespear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data_path = '/content/drive/MyDrive/sonety.txt'
with open(data_path, 'r') as f:
    lines = f.read().split('\n')

In [5]:
tokenizer = Tokenizer()
corpus = lines

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [6]:
input_sequence = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i]
    input_sequence.append(n_gram_sequence)

input_sequence[0:10]

[[34],
 [34, 413],
 [34, 413, 875],
 [34, 413, 875, 166],
 [34, 413, 875, 166, 213],
 [8],
 [8, 876],
 [8, 876, 134],
 [8, 876, 134, 348],
 [8, 876, 134, 348, 99]]

In [7]:
max_sequence_len = max([len(x) for x in input_sequence])
max_sequence_len

10

In [8]:
input_sequence = np.array(pad_sequences(input_sequence, maxlen = max_sequence_len, padding = 'pre'))
input_sequence[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  34],
       [  0,   0,   0,   0,   0,   0,   0,   0,  34, 413],
       [  0,   0,   0,   0,   0,   0,   0,  34, 413, 875],
       [  0,   0,   0,   0,   0,   0,  34, 413, 875, 166],
       [  0,   0,   0,   0,   0,  34, 413, 875, 166, 213],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0,   8, 876],
       [  0,   0,   0,   0,   0,   0,   0,   8, 876, 134],
       [  0,   0,   0,   0,   0,   0,   8, 876, 134, 348],
       [  0,   0,   0,   0,   0,   8, 876, 134, 348,  99]], dtype=int32)

In [9]:
xs = input_sequence[:,:-1]
labels = input_sequence[:,-1]

ys = tf.keras.utils.to_categorical(labels, num_classes = total_words)

In [None]:
model = Sequential()
model.add(Embedding(total_words, 150, input_length = max_sequence_len - 1))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(total_words, activation = 'softmax'))
adam = Adam(learning_rate = 0.7)
model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
history = model.fit(xs, ys, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
seed_text = "Sonet"
next_words = 20

In [None]:
for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen = max_sequence_len -1, padding = 'pre')
  # predicted = model.predict_classes(token_list, verbose = 0)

  predict_x = model.predict(token_list) 
  classes_x = np.argmax(predict_x, axis=1)

  output_word = ""
  for word, index in tokenizer.word_index.items():
    if index == classes_x:
      output_word = word
      break
  seed_text += " " + output_word

print(seed_text) 
