In [3]:
from PyPDF2 import PdfReader
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import pickle
import os


In [5]:
file_path = "./antigone.pdf"

In [None]:
reader = PdfReader(file_path)
start_page = 6
text = ""

for page in reader.pages[start_page:]:
    text += re.sub(r"[0-9\n()]", '', page.extract_text())



In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

In [None]:
input_sequences=[]
for line in text.split('.'):
  if line.strip() != '':
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [None]:
## Pad Sequences
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

115

In [None]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))

In [None]:
X,y = input_sequences[:,:-1],input_sequences[:,-1]

In [None]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))


In [None]:
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping=EarlyStopping(
    monitor="loss",
    patience=20,
    verbose=0,
    restore_best_weights=True,
)
model.fit(X, y,
          epochs=100,
          validation_split=0.2,
          verbose=1,
          batch_size=64,
          callbacks=[early_stopping])

## Save the model
model.save("model.keras")
## Save the tokenizer
with open('/tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

Epoch 1/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 205ms/step - accuracy: 0.0186 - loss: 7.0359 - val_accuracy: 0.0135 - val_loss: 6.6756
Epoch 2/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 188ms/step - accuracy: 0.0232 - loss: 6.2267 - val_accuracy: 0.0340 - val_loss: 6.7743
Epoch 3/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 190ms/step - accuracy: 0.0361 - loss: 6.1367 - val_accuracy: 0.0408 - val_loss: 6.8653
Epoch 4/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 172ms/step - accuracy: 0.0467 - loss: 5.9718 - val_accuracy: 0.0459 - val_loss: 6.8693
Epoch 5/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 174ms/step - accuracy: 0.0549 - loss: 5.8166 - val_accuracy: 0.0526 - val_loss: 6.8499
Epoch 6/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 190ms/step - accuracy: 0.0669 - loss: 5.6553 - val_accuracy: 0.0639 - val_loss: 6.8506
Epoc

In [8]:
model = load_model("model.keras")
with open('/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
max_sequence_len=115
seed_text = "Dans le palais de Thèbes, on pouvait entendre"
# predict word number
next_words = 15

for _ in range(next_words):
    #convert to token
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    #path sequences
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
