URL:

https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [1]:
import numpy as np
np.random.seed(42)

In [2]:
from NLP_Help_CleanText import fun_percent_change, fun_get_data, fun_clean_stopwords_punct
from NLP_Help_CleanText import fun_clean_lemmatization, fun_clean_removing_unwantedWords
from NLP_Help_CleanText import fun_clean_removing_commonWords, fun_clean_removing_rareWords
# from NLP_Help_CleanText import fun_clean_text

In [3]:
from IPython import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Get data

In [4]:
# Get Data
original_text = fun_get_data('AliceInWonderLand.txt')
len(original_text)
type(original_text)

<< 1. Reading Data >>
Actual Length of Text :  163817


163817

str

In [5]:
original_text



# Data Preparation

### Make Data Clean

In [6]:
# Clean Data
# 1: Remove Stop Words and Puctuation
clean_text = fun_clean_stopwords_punct(original_text)
    
# 2: Lemmatization
clean_text_lem_nltk = fun_clean_lemmatization(clean_text, original_text)
   
# 3. Removing unwanted text
clean_text_char_rem = fun_clean_removing_unwantedWords(clean_text_lem_nltk, original_text)
    
# 4. Removing Common Words
clean_text_post_commonWord_removal = fun_clean_removing_commonWords(clean_text_char_rem, original_text)
    
# 5. Removing Rare Occuring Words
clean_text_post_rareWord_removal = fun_clean_removing_rareWords(clean_text_post_commonWord_removal, 
                                                                original_text)

************************************
<< 2. Applying StopWords and Punctuation removal >>
Length post removing stopwords and punctuation :  111242
Percent Change is:  32.09
None
************************************
<< 3. Applying Lemmatization >>
Length of text post applying Lemmatization :  106691
Percent Change is:  34.87
None
************************************
<< 4. Applying Removal of Unwanted Characters >>
Length of text post removal of Unwanted Characters :  106126
Percent Change is:  35.22
None
************************************
<< 5. Applying Removal of Common or Frequently Occuring Words >>
Length of text post removal of Common Words :  94784
Percent Change is:  42.14
None
************************************
<< 6. Applying Removal of Rare Occuring Words >>
Length of text post removal of Rare Orccuring Words :  94636
Percent Change is:  42.23
None


In [7]:
clean_text_post_rareWord_removal



# Converting Characters into Numbers

In [8]:
chars = sorted(set(clean_text_post_rareWord_removal))
char_to_int = dict((c,i) for i,c in enumerate(chars))
char_to_int

{' ': 0,
 '!': 1,
 '$': 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '-': 6,
 '.': 7,
 '/': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 ':': 19,
 ';': 20,
 '?': 21,
 '[': 22,
 ']': 23,
 '_': 24,
 'a': 25,
 'b': 26,
 'c': 27,
 'd': 28,
 'e': 29,
 'f': 30,
 'g': 31,
 'h': 32,
 'i': 33,
 'j': 34,
 'k': 35,
 'l': 36,
 'm': 37,
 'n': 38,
 'o': 39,
 'p': 40,
 'q': 41,
 'r': 42,
 's': 43,
 't': 44,
 'u': 45,
 'v': 46,
 'w': 47,
 'x': 48,
 'y': 49,
 'z': 50,
 '‘': 51,
 '’': 52,
 '“': 53,
 '”': 54}

## Reversing this Dictionary i.e. Int-To-Chars

In [9]:
int_to_char = dict((i,c) for i,c in enumerate(chars))
int_to_char

{0: ' ',
 1: '!',
 2: '$',
 3: '(',
 4: ')',
 5: ',',
 6: '-',
 7: '.',
 8: '/',
 9: '0',
 10: '1',
 11: '2',
 12: '3',
 13: '4',
 14: '5',
 15: '6',
 16: '7',
 17: '8',
 18: '9',
 19: ':',
 20: ';',
 21: '?',
 22: '[',
 23: ']',
 24: '_',
 25: 'a',
 26: 'b',
 27: 'c',
 28: 'd',
 29: 'e',
 30: 'f',
 31: 'g',
 32: 'h',
 33: 'i',
 34: 'j',
 35: 'k',
 36: 'l',
 37: 'm',
 38: 'n',
 39: 'o',
 40: 'p',
 41: 'q',
 42: 'r',
 43: 's',
 44: 't',
 45: 'u',
 46: 'v',
 47: 'w',
 48: 'x',
 49: 'y',
 50: 'z',
 51: '‘',
 52: '’',
 53: '“',
 54: '”'}

In [10]:
num_chars = len(clean_text_post_rareWord_removal)
vocab_size = len(char_to_int)
num_chars
vocab_size

94636

55

# Input and Output data

In [11]:
sequence_length = 100
input_data = []
output_data = []

In [12]:
for i in range(num_chars - sequence_length):
    seq_input = clean_text_post_rareWord_removal[i:i+sequence_length]
    seq_output = clean_text_post_rareWord_removal[i+sequence_length]
    input_data.append([char_to_int[char] for char in seq_input])
    output_data.append(char_to_int[seq_output])
len(input_data)
len(output_data)

94536

94536

In [13]:
input_data[0][-5:]
output_data[0]
input_data[1][-5:]
output_data[1]
clean_text_post_rareWord_removal[:105]

[29, 43, 44, 42, 33]

27

[43, 44, 42, 33, 27]

44

'gutenberg’s alice’s adventure wonderland, lewis carroll ebook use anyone anywhere cost almost restriction'

In [14]:
np.max(input_data)
np.min(input_data)
np.max(output_data)
np.min(output_data)

54

0

54

0

## Reshaping Input_Data into 3D for LSTM to be fed with. 

Shape is:
- Sample Size
- Sequence Length
- Featurer or representation for each word which is 1 in this case

In [26]:
X = np.reshape(input_data, newshape=(len(input_data), sequence_length, 1))
X.shape

(94536, 100, 1)

In [24]:
X.shape[0]
X.shape[1]
X.shape[2]

94536

100

IndexError: tuple index out of range

## Standardize Input Data

In [16]:
X = X/np.max(input_data)
np.max(X)
np.min(X)

1.0

0.0

## One-Hot-Encoding for Output Data

In [17]:
from tensorflow.python.keras.utils import to_categorical

In [18]:
Y = to_categorical(output_data, num_classes=vocab_size)
Y.shape

(94536, 55)

In [19]:
Y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.], dtype=float32)

## Model Building

In [20]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

Using TensorFlow backend.


In [27]:
model = Sequential()
model.add(LSTM(units=128, input_shape=(X.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=128))
model.add(Dropout(0.2))
model.add(Dense(units=vocab_size, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 128)          66560     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 55)                7095      
Total params: 205,239
Trainable params: 205,239
Non-trainable params: 0
_________________________________________________________________


# Using Model-Checkpoint Callbacks

In [28]:
from keras.callbacks import ModelCheckpoint

In [29]:
# define the checkpoint
_callbackPath = 'model_checkPoints/TextGeneration_Simple_{epoch:02d}_{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(_callbackPath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [30]:
model.fit(x=X, y=Y, batch_size=2048, epochs=10, callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: loss improved from inf to 3.26850, saving model to model_checkPoints/TextGeneration_Simple_01_3.2685.hdf5
Epoch 2/10

Epoch 00002: loss improved from 3.26850 to 3.00996, saving model to model_checkPoints/TextGeneration_Simple_02_3.0100.hdf5
Epoch 3/10

Epoch 00003: loss improved from 3.00996 to 2.90918, saving model to model_checkPoints/TextGeneration_Simple_03_2.9092.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.90918 to 2.85880, saving model to model_checkPoints/TextGeneration_Simple_04_2.8588.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.85880 to 2.82196, saving model to model_checkPoints/TextGeneration_Simple_05_2.8220.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.82196 to 2.79658, saving model to model_checkPoints/TextGeneration_Simple_06_2.7966.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.79658 to 2.77619, saving model to model_checkPoints/TextGeneration_Simple_07_2.7762.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.77619 to 2.75

<keras.callbacks.History at 0x1f9bdda4ac8>

# Using Random Seed to get predictions

In [31]:
random_start = np.random.randint(0, num_chars - sequence_length)
random_start

13364

In [39]:
pattern = input_data[random_start]
len(pattern)
pattern[:5]

100

[39, 47, 7, 0, 43]

In [40]:
''.join(int_to_char[num] for num in pattern)
clean_text_post_rareWord_removal[random_start : random_start + sequence_length+ 10]

'ow. silence round, please! “william conqueror, whose cause favour pope, soon submit english, want le'

'ow. silence round, please! “william conqueror, whose cause favour pope, soon submit english, want leaders, lat'

# Prediction

In [34]:
pattern = np.reshape(pattern, newshape=(1,len(pattern),1))
pattern.shape
pattern = pattern / np.max(pattern)

(1, 100, 1)

In [35]:
prediction = model.predict(pattern)
prediction

array([[0.02431745, 0.01075622, 0.00885848, 0.01107954, 0.03233648,
        0.01456698, 0.0213604 , 0.08976158, 0.01452306, 0.0317695 ,
        0.04562333, 0.01615496, 0.04655788, 0.02367437, 0.04178736,
        0.02094996, 0.02740921, 0.01741192, 0.03751935, 0.01418382,
        0.01347007, 0.00745818, 0.01004265, 0.01236459, 0.01375894,
        0.0164143 , 0.01047161, 0.01969627, 0.01545891, 0.02067186,
        0.01449442, 0.01430692, 0.00972329, 0.01803099, 0.01139734,
        0.00638221, 0.01468007, 0.01013737, 0.01207932, 0.01940475,
        0.00979602, 0.00858661, 0.0144163 , 0.00983398, 0.01424987,
        0.01275064, 0.01509953, 0.01054494, 0.01819365, 0.00820288,
        0.01142012, 0.00903444, 0.01864596, 0.01005407, 0.00812532]],
      dtype=float32)

In [36]:
prediction.shape
np.max(prediction)
np.argmax(prediction)

(1, 55)

0.08976158

7

In [37]:
int_to_char[np.argmax(prediction)]

'.'

In [41]:
# Checking for Range
char_index_pred = []
for i in range(100):
    pattern_modified = np.reshape(pattern, newshape=(1,len(pattern),1))
    pattern_modified = pattern_modified/np.max(pattern_modified)
    
    prediction = model.predict(pattern_modified)
    index_pred = np.argmax(prediction)
    char_index_pred.append(index_pred)
    
    char_pred = int_to_char[index_pred]
    print(char_pred, end='')
    
    pattern.append(index_pred)
    pattern = pattern[1:len(pattern)]

print('\n')
print(char_index_pred)

.............333333333333333333333333333333333333333333333333333333333333333333333333333333333333.3.

[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 7, 12, 7]
