In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import layers
from keras.models import Model
from keras.models import Sequential
from keras.utils import np_utils
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from pathlib import Path


In [None]:
# Download NLTK's 'punkt' tokenizer package, which word_tokenize relies on later in the notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#Read File


In [None]:
import re

# Load the whole corpus and lower-case it so the character vocabulary is case-insensitive.
# The context manager closes the file handle deterministically, and the explicit
# UTF-8 encoding keeps the non-ASCII characters in the corpus decoding the same
# way on every platform instead of depending on the locale default.
with open("dataset.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read().lower()

#Preprocess Text


In [None]:
# Inspect the raw alphabet: every distinct character in the lower-cased corpus, in sorted order.
chars = sorted(set(file))
print(chars)

['\t', '\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '6', '7', '9', ':', ';', '?', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '}', 'ü', '•']


In [None]:
# Replace every character that is not a lowercase letter, a digit, or a space with a space.
non_alnum_pattern = re.compile(r'[^a-z0-9 ]')
clean_text = non_alnum_pattern.sub(' ', file)

In [None]:
# Split the cleaned corpus into tokens, then glue the tokens back together
# with exactly one space between consecutive tokens.
words = word_tokenize(clean_text)
text = ' '.join(words)

In [None]:
# Re-check the alphabet after cleaning: only the characters kept by the regex should remain.
chars = sorted(set(text))
print(chars)

[' ', '0', '1', '2', '3', '4', '6', '7', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


#Dictionary

In [None]:
# Sorted alphabet of the cleaned corpus; a character's position in this list is its integer code.
uniqueChar = sorted(set(text))

# Corpus length (in characters) and alphabet size.
text_len, char_len = len(text), len(uniqueChar)

# Lookup tables in both directions: character -> code and code -> character.
mapping = {char: code for code, char in enumerate(uniqueChar)}
reverse_mapping = dict(enumerate(uniqueChar))

In [None]:
# Report the corpus length and the alphabet size computed from the cleaned text.
print("Total number of characters:", text_len)
print("Number of unique characters:", char_len)

Total number of characters: 1044797
Number of unique characters: 35


In [None]:
# Number of consecutive characters in each input window.
seq_length = 30
# Containers for the encoded input windows (x) and the code of the character that follows each window (y).
x, y = [], []

In [None]:
# Encode the corpus once so each character is looked up in `mapping` a single
# time instead of once per window that contains it.
encoded_text = [mapping[char] for char in text]

# Rebuild x and y from scratch (rather than appending to lists created in an
# earlier cell) so that re-running this cell never duplicates the patterns.
# x[k] is the list of codes for text[k:k + seq_length]; y[k] is the code of the
# character that immediately follows that window.
window_count = max(text_len - seq_length, 0)
x = [encoded_text[start:start + seq_length] for start in range(window_count)]
y = encoded_text[seq_length:seq_length + window_count]

n_patterns = len(x)


In [None]:
# n_patterns is len(x): the number of (input window, next character) pairs extracted from the text.
print("Number of patterns in text = ", n_patterns)

Number of patterns in text =  1044767


In [None]:
# Shape the windows as (samples, time steps, features) with one feature per step,
# which is the layout the LSTM input layer below is configured for.
X = np.reshape(x, (n_patterns, seq_length, 1))
# Scale the integer codes into [0, 1) by dividing by the alphabet size.
X = X / float(char_len)
# One-hot encode the targets. num_classes is pinned to the alphabet size: left
# to its default, to_categorical infers the width from max(y) + 1, which would
# produce too few columns whenever the highest-coded character never occurs as
# a target.
Y = np_utils.to_categorical(y, num_classes=char_len)

In [None]:
# Sanity-check the shapes of the input tensor and the one-hot target matrix.
print(X.shape)
print(Y.shape)

(1044767, 30, 1)
(1044767, 35)


#Create Model


In [None]:
# Two stacked 512-unit LSTM layers, each followed by 20% dropout, then a tanh
# dense layer and a softmax over the target classes (one column of Y per class).
model = Sequential([
    # return_sequences=True hands the full sequence of outputs to the second LSTM.
    layers.LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(512),
    layers.Dropout(0.2),
    layers.Dense(256, activation='tanh'),
    layers.Dense(Y.shape[1], activation='softmax'),
])
model.summary()
model.compile(
    optimizer='Nadam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 512)           1052672   
                                                                 
 dropout (Dropout)           (None, 30, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 512)               2099200   
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 35)                8995      
                                                                 
Total params: 3,292,195
Trainable params: 3,292,195
Non-

In [None]:
# Train on the full pattern set; the returned History object feeds the plots below.
history = model.fit(
    X,
    Y,
    batch_size=64,
    epochs=5,
    shuffle=True,
)

Epoch 1/5
   22/16325 [..............................] - ETA: 5:31:56 - loss: 3.0663 - accuracy: 0.1669

KeyboardInterrupt: ignored

In [None]:
# Training loss per epoch, drawn through an explicit Axes object.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'])
loss_ax.set_title('Model loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train'], loc='upper right')
plt.show()

In [None]:
# Training accuracy per epoch, drawn through an explicit Axes object.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['accuracy'])
acc_ax.set_title('Model accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Train'], loc='lower right')
plt.show()

In [None]:
def Text_Generator(Ch_count, temperature=1.0):
    """Generate text one character at a time, starting from a random seed window.

    A window is picked at random from the global ``x``, decoded with
    ``reverse_mapping`` and printed. The global ``model`` is then queried
    repeatedly: each call yields a distribution over the next character, one
    character is chosen from it, and the window slides forward by one position.

    Args:
        Ch_count: Number of characters to generate after the seed text.
        temperature: Controls how the next character is chosen. With 1.0 the
            character is sampled from the model's distribution unchanged;
            values below 1.0 sharpen the distribution and values above 1.0
            flatten it. A value <= 0 disables sampling and always takes the
            most probable character.

    Returns:
        The seed text followed by the ``Ch_count`` generated characters.
    """
    # randint's upper bound is exclusive, so len(x) makes every window eligible.
    start = np.random.randint(0, len(x))
    # Work on a copy: the window is modified below and must not alter the
    # training pattern stored in `x`.
    seed = list(x[start])
    seq_in = "".join(reverse_mapping[value] for value in seed)
    print(f"...Seed Text: {seq_in}")

    generated = []
    for _ in range(Ch_count):
        # Same layout and scaling as the training input: (1, window, 1), codes / alphabet size.
        x_pred = np.reshape(seed, (1, len(seed), 1)) / float(char_len)
        prediction = np.asarray(model.predict(x_pred, verbose=0)[0]).astype('float64')

        if temperature > 0:
            # p ** (1 / T) equals exp(log(p) / T) but tolerates exact zeros;
            # dividing by the maximum first keeps the largest term at 1 so the
            # normalising sum cannot underflow to zero for small temperatures.
            scaled = np.power(prediction / np.max(prediction), 1.0 / temperature)
            probas = np.random.multinomial(1, scaled / np.sum(scaled), 1)
            # The single draw is one-hot, so argmax recovers the sampled index.
            index = int(np.argmax(probas))
        else:
            index = int(np.argmax(prediction))

        generated.append(reverse_mapping[index])
        # Slide the window: drop the oldest code, append the new one.
        seed = seed[1:] + [index]

    return seq_in + "".join(generated)

In [None]:
# Generate 200 characters after a randomly chosen seed window and print the full string.
print("...Generated: ", Text_Generator(200))
print()