<a href="https://colab.research.google.com/github/oyyarko/Text-Generation/blob/master/text_generation_bidirectional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Switch to the project folder; the relative data_dir / save_dir paths used
# below are resolved from here.
# NOTE(review): presumably requires Google Drive to be mounted at
# /content/drive first — confirm.
%cd /content/drive/My Drive/Deep Learning/Text Generation

/content/drive/My Drive/Deep Learning/Text Generation


# **0: Import libraries**

In [None]:
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Bidirectional
from keras.layers import LSTM, Input, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

import spacy
# Load the spaCy English pipeline; it is used below to tokenize the corpus.
nlp = spacy.load('en')
# Raise spaCy's maximum document length (in characters) because each input
# file is passed to nlp() as one single string.
nlp.max_length = 4192709
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

Using TensorFlow backend.


# **1: Define hyperparameters**

*   Load data



In [None]:
# Locations: raw text input folder, output folder, and the pickled vocabulary.
data_dir = 'text_generation_data'
save_dir = 'save'
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

# Base names (without the ".txt" extension) of the text files to read.
file_list = [str(book_id) for book_id in (10, 11, 12, 13, 14, 15, 17, 19, 20, 21)]

# Sliding-window settings: window length in tokens and stride between windows.
seq_length = 30
sequences_step = 1

# **2: Read Data**

*   spacy to tokenize
*   convert every token to lowercase
*   remove whitespace tokens such as newlines and non-breaking spaces (punctuation, numbers and stopwords are kept)



In [None]:
def create_wordlist(doc):
    """Return the lower-cased token texts of ``doc``, skipping whitespace tokens.

    Args:
        doc: Iterable of token-like objects exposing a ``text`` string
            attribute (here, a spaCy ``Doc``).

    Returns:
        list[str]: One lower-cased string per kept token, in document order.
    """
    # str.strip() removes every Unicode whitespace character, so any token
    # made only of whitespace (single or repeated newlines, plain, thin or
    # non-breaking spaces) is dropped, instead of only a fixed handful of
    # exact strings.
    return [word.text.lower() for word in doc if word.text.strip()]

In [None]:
from tqdm import tqdm

# Tokenize every input file and collect the tokens into one flat list.
wordlist = []

for file_name in tqdm(file_list):
    input_file = os.path.join(data_dir, file_name + ".txt")

    # Decode as UTF-8 explicitly so reading does not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = f.read()

    doc = nlp(data)
    wl = create_wordlist(doc)
    # extend() appends in place; `wordlist = wordlist + wl` copied the whole
    # accumulated list again for every file.
    wordlist.extend(wl)

100%|██████████| 10/10 [02:57<00:00, 17.76s/it]


In [None]:
# Preview only the first tokens; displaying the full list floods the notebook
# output with every token in the corpus.
wordlist[:100]

['project',
 'gutenberg',
 "'s",
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 ',',
 'by',
 'arthur',
 'conan',
 'doyle',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.',
 ' ',
 'you',
 'may',
 'copy',
 'it',
 ',',
 'give',
 'it',
 'away',
 'or',
 're',
 '-',
 'use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'project',
 'gutenberg',
 'license',
 'included',
 'with',
 'this',
 'ebook',
 'or',
 'online',
 'at',
 'www.gutenberg.net',
 '\n\n\n',
 'title',
 ':',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'author',
 ':',
 'arthur',
 'conan',
 'doyle',
 'release',
 'date',
 ':',
 'november',
 '29',
 ',',
 '2002',
 '[',
 'ebook',
 '#',
 '1661',
 ']',
 'last',
 'updated',
 ':',
 'may',
 '20',
 ',',
 '2019',
 'language',
 ':',
 'english',
 'character',
 'set',
 'encoding',
 ':',
 'utf-8',
 '*',
 '*',
 '*',
 'start',
 'of',
 'this',
 'projec

# **Create Dictionary**
*   Map each word to its index

In [None]:
# Count how often each token occurs.
word_counts = collections.Counter(wordlist)

# Index -> word: the distinct tokens in sorted order.
vocabulary_inv = sorted(word_counts)

# Word -> index (the inverse of vocabulary_inv).
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
# Distinct tokens ordered from most to least frequent.
words = [x[0] for x in word_counts.most_common()]

# Size of the vocabulary.
vocab_size = len(words)
print("vocab size: ", vocab_size)

# Save the vocabulary to a pickle file. The target folder is created first so
# a fresh run does not fail when it is missing.
vocab_dir = os.path.dirname(vocab_file)
if vocab_dir:
    os.makedirs(vocab_dir, exist_ok=True)

with open(vocab_file, "wb") as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

vocab size:  40823


# **Create Sequences**



In [None]:
# Slide a window of seq_length tokens over the corpus: every window becomes an
# input sequence, and the token right after it is the word to predict.
window_starts = range(0, len(wordlist) - seq_length, sequences_step)
sequences = [wordlist[start: start + seq_length] for start in window_starts]
next_words = [wordlist[start + seq_length] for start in window_starts]

print("nb sequences: ", len(sequences))

nb sequences:  1294250


In [None]:
# One-hot encode the inputs and targets.
#   X[i, t, k] is True when token t of sequence i is vocabulary word k.
#   y[i, k]    is True when the word following sequence i is vocabulary word k.
# The builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in
# 1.20 and removed in 1.24 (where it raises AttributeError).
# NOTE(review): X holds len(sequences) * seq_length * vocab_size one-byte
# cells. With the sizes printed earlier in this notebook (1,294,250 sequences,
# 30 steps, 40,823 words) that is roughly 1.6 TB, so this dense encoding is
# unlikely to fit in memory — consider integer word indices with an Embedding
# layer, or a batch generator.
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=bool)
y = np.zeros((len(sequences), vocab_size), dtype=bool)

for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = True
    y[i, vocab[next_words[i]]] = True

# **Build Model**

In [None]:
def bidirectional_lstm_model(seq_length, vocab_size):
    """Build and compile a bidirectional-LSTM next-word classifier.

    The network is Bidirectional(LSTM) -> Dropout(0.3) -> Dense(vocab_size)
    -> softmax, compiled with categorical cross-entropy and the Adam
    optimizer, and it reports categorical accuracy.

    The module-level names ``rnn_size`` and ``learning_rate`` are read when
    the function is called, so both must be defined before calling it.

    Args:
        seq_length: Number of time steps in each input sequence.
        vocab_size: Width of each input vector and of the softmax output.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.3))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    # Training callbacks are passed to model.fit() by the caller; the unused
    # local callbacks list that used to be built here has been removed.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

In [None]:
# Model / optimizer settings, read by bidirectional_lstm_model().
rnn_size = 256
learning_rate = 1e-3

# Training-loop settings, passed to model.fit().
batch_size = 32
num_epochs = 50

In [None]:
# Build the compiled network for the configured window length and vocabulary
# size, then print its layer-by-layer summary.
model = bidirectional_lstm_model(seq_length, vocab_size)
model.summary()

# **Train model**

In [None]:
from keras.callbacks import ModelCheckpoint
import tensorflow

In [None]:
# Checkpoint files are written into save_dir and named by epoch number and
# validation loss.
checkpoint_path = save_dir + "/" + 'my_model_gen_sentences_lstm.{epoch:02d}-{val_loss:.2f}.hdf5'

# Early stopping watches val_loss with a patience of 4; the checkpoint
# callback is configured with period=2.
early_stopping = EarlyStopping(monitor='val_loss', patience=4)
checkpointer = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    verbose=0,
    mode='auto',
    period=2,
)
callbacks = [early_stopping, checkpointer]

# Train with shuffling, using validation_split=0.1 to hold out validation data.
history = model.fit(
    X,
    y,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
    callbacks=callbacks,
)