# Text Generation with Python

**Create function to open Files

In [123]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

## Tokenization for Each words in Doc.

In [124]:
import nltk
from nltk.tokenize import word_tokenize 

In [125]:
raw = read_file('myText.txt')

In [126]:
doc = word_tokenize(raw)

**Remove punctuation and convert into lowercase.

In [127]:
def remove_punc(tokens):
    return [token.lower() for token in tokens if token not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [128]:
mytoken = remove_punc(doc)

In [129]:
len(mytoken)

3868

## Create Sequences of Tokens using NLTK

**The idea is we train 25 word and we predict the 26th word in the sentences, thus the dataset/sequences will be consist of list of 25 words (in list), for each sequences will shifted + 1 for length of the sentences. 

In [130]:
train_len = 25 + 1
text_sequences = []

for i in range(train_len,len(mytoken)):
    seq = mytoken[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

** Example of Dataset/Text Sequences

In [131]:
text_sequences[0]

['butuh',
 'beberapa',
 'saat',
 'bagi',
 'mata',
 'saya',
 'untuk',
 'menyesuaikan',
 'diri',
 'dengan',
 'perubahan',
 'cahaya',
 'bagian',
 'dalam',
 'gubuk',
 'tetua',
 'paruh',
 'tombak',
 'redup',
 'tidak',
 'ada',
 'penerangan',
 'kecuali',
 'kolom',
 'cahaya',
 'tipis']

In [132]:
text_sequences[1]

['beberapa',
 'saat',
 'bagi',
 'mata',
 'saya',
 'untuk',
 'menyesuaikan',
 'diri',
 'dengan',
 'perubahan',
 'cahaya',
 'bagian',
 'dalam',
 'gubuk',
 'tetua',
 'paruh',
 'tombak',
 'redup',
 'tidak',
 'ada',
 'penerangan',
 'kecuali',
 'kolom',
 'cahaya',
 'tipis',
 'yang']

** From sequence 1 to sequence 2 a word shifted, the word 'me' shifted from array number 1 to number 0 (in array order)

## Keras Tokenization

** For each word in vocabulary assign an Unique IDs.

In [133]:
from keras.preprocessing.text import Tokenizer

In [134]:
kToken = Tokenizer()

**
fit_on_texts 
Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency. So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value.

In [135]:
kToken.fit_on_texts(text_sequences)

**
texts_to_sequences Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.

In [136]:
sequences = kToken.texts_to_sequences(text_sequences)

In [137]:
sequences

[[288,
  31,
  19,
  204,
  74,
  4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289],
 [31,
  19,
  204,
  74,
  4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289,
  1],
 [19,
  204,
  74,
  4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289,
  1,
  205],
 [204,
  74,
  4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289,
  1,
  205,
  128],
 [74,
  4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289,
  1,
  205,
  128,
  129],
 [4,
  15,
  1169,
  158,
  7,
  489,
  203,
  73,
  22,
  41,
  156,
  17,
  26,
  1165,
  9,
  52,
  1164,
  1162,
  1161,
  203,
  289,
  1,

** For each word will be assigned an unique like IDs, check it based on method index_word

In [138]:
kToken.index_word

{1: 'yang',
 2: 'dan',
 3: 'di',
 4: 'saya',
 5: 'kami',
 6: '“',
 7: 'dengan',
 8: '”',
 9: 'tidak',
 10: 'itu',
 11: 'ke',
 12: 'dari',
 13: 'mereka',
 14: 'aku',
 15: 'untuk',
 16: 'caera',
 17: 'paruh',
 18: 'dia',
 19: 'saat',
 20: 'telur',
 21: 'bisa',
 22: 'dalam',
 23: 'ini',
 24: 'akan',
 25: 'anda',
 26: 'tombak',
 27: 'seperti',
 28: 'beak',
 29: 'lain',
 30: 'telah',
 31: 'beberapa',
 32: 'kita',
 33: 'melihat',
 34: 'hanya',
 35: 'regis',
 36: 'lebih',
 37: '‘',
 38: 'tapi',
 39: 'broke',
 40: 'jika',
 41: 'gubuk',
 42: 'tempat',
 43: 'tua',
 44: 'spear',
 45: 'klan',
 46: 'old',
 47: 'tetapi',
 48: 'bawah',
 49: 'hampir',
 50: 'kamu',
 51: 'sebelum',
 52: 'ada',
 53: 'pada',
 54: 'beruang',
 55: 'aether',
 56: 'bahwa',
 57: 'paruhnya',
 58: 'lalu',
 59: 'ketika',
 60: 'membawa',
 61: 'kecil',
 62: 'besar',
 63: 'kembali',
 64: 'adalah',
 65: 'duduk',
 66: 'tanpa',
 67: 'tahu',
 68: 'ingin',
 69: 'karena',
 70: 'meskipun',
 71: 'swiftsure',
 72: 'burung',
 73: 'bagian',
 7

** Example printing for each Unique IDs in sequence[0]

In [139]:
for i in sequences[0]:
    print(f"{kToken.index_word[i]} : {i}")

butuh : 288
beberapa : 31
saat : 19
bagi : 204
mata : 74
saya : 4
untuk : 15
menyesuaikan : 1169
diri : 158
dengan : 7
perubahan : 489
cahaya : 203
bagian : 73
dalam : 22
gubuk : 41
tetua : 156
paruh : 17
tombak : 26
redup : 1165
tidak : 9
ada : 52
penerangan : 1164
kecuali : 1162
kolom : 1161
cahaya : 203
tipis : 289


In [140]:
type(sequences)

list

In [141]:
vocabulary_size = len(kToken.word_counts)

In [142]:
vocabulary_size

1169

## Convert Seq to Numpy

In [143]:
import numpy as np

In [144]:
sequences = np.array(sequences)

In [145]:
sequences

array([[ 288,   31,   19, ..., 1161,  203,  289],
       [  31,   19,  204, ...,  203,  289,    1],
       [  19,  204,   74, ...,  289,    1,  205],
       ...,
       [1160,  354,  409, ...,    6,  490,  491],
       [ 354,  409,    3, ...,  490,  491,    8],
       [ 409,    3,   22, ...,  491,    8,  492]])

## Preprocessing 

** In NLP the X feature is the 24 words and the y / label is 25th word

In [146]:
from keras.utils import to_categorical

** The features

In [147]:
sequences[:,:-1]

array([[ 288,   31,   19, ..., 1162, 1161,  203],
       [  31,   19,  204, ..., 1161,  203,  289],
       [  19,  204,   74, ...,  203,  289,    1],
       ...,
       [1160,  354,  409, ...,  101,    6,  490],
       [ 354,  409,    3, ...,    6,  490,  491],
       [ 409,    3,   22, ...,  490,  491,    8]])

** Labels

In [148]:
sequences[:,-1]

array([289,   1, 205, ..., 491,   8, 492])

### Train Test Split

In [149]:
X = sequences[:,:-1]

In [150]:
y = sequences[:,-1]

In [151]:
y = to_categorical(y,num_classes=vocabulary_size+1)

In [152]:
seq_len = X.shape[1]

In [153]:
X.shape

(3842, 25)

## Build LSTM Model

In [154]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [155]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [156]:
model = create_model(vocabulary_size+1,seq_len)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 25)            29250     
_________________________________________________________________
lstm_5 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_6 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_6 (Dense)              (None, 1170)              59670     
Total params: 126,870
Trainable params: 126,870
Non-trainable params: 0
_________________________________________________________________


In [157]:
from pickle import dump, load

In [158]:
model.fit(X,y,batch_size=128,epochs=300,verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 

<keras.callbacks.callbacks.History at 0x1c78c6f5108>

In [159]:
model.save('models.h5')

In [160]:
dump(kToken,open('models.h5','wb'))

## Generate Text

In [161]:
from keras.preprocessing.sequence import pad_sequences

In [162]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

## Generate Text.

**Seed LSTM state.

In [247]:
seed_text = 'Butuh beberapa saat'

In [249]:
generate_text(model,kToken,seq_len,seed_text,num_gen_words=10)

'paruh katanya kami pasti menyimpannya berkeliling ke belakang kami terdiam'

**Since 300 epochs only reach 0.5 accuracy the generated sentences may not related with the context and tend to mix with scramble words.