In [10]:
import keras
import re
import numpy as np
from keras import preprocessing
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import SeparableConv1D, MaxPooling1D
from keras.layers import LSTM, GRU
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Flatten
from keras.callbacks import ModelCheckpoint


keras.__version__

'2.2.4'

In [11]:
# load ascii text and covert to lowercase
filename = "GreatGatsby.txt"
text = open(filename, 'r', encoding='utf-8').read().lower()
tokens = text.split(' ')
tokens = re.split(' |\n', text)


train_len = 10
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

print('Found %s words.' %len(tokens))
print('Found %s sequences.' %len(text_sequences))

Found 50294 words.
Found 50284 sequences.


In [12]:
text_sequences[5]

['younger',
 'and',
 'more',
 'vulnerable',
 'years',
 'my',
 'father',
 'gave',
 'me',
 'some']

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

data = pad_sequences(sequences, maxlen=train_len)

word_index = tokenizer.word_index
word_count = tokenizer.word_counts
nWords     = len(tokenizer.word_counts) + 1

n_sequences = np.empty([len(sequences),train_len], dtype='int32')
for i in range(len(sequences)):
    n_sequences[i] = sequences[i]

    
print('Found %s unique tokens.' % len(word_index))
print('Found %s unique words.' % len(word_count))


Found 9049 unique tokens.
Found 9049 unique words.


In [14]:
from keras.utils import to_categorical

train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]

train_targets = to_categorical(train_targets, num_classes=len(word_count)+1)
seq_len = train_inputs.shape[1]
train_inputs.shape

(50284, 9)

In [15]:
train_targets.shape

(50284, 9050)

In [6]:
glove_dir = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lab Exercises\Machine Learning Projects\glove.6B\glove.6B.300d.txt'

embeddings_index = {}
f = open(glove_dir, encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [7]:
# The Embedding layer takes at least two arguments:
# the number of possible tokens, the maximum word index,
# and the dimensionality of the embeddings, here 300.
embedding_dim = 300

embedding_matrix = np.zeros((nWords, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < nWords:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

model = Sequential()
model.add(Embedding(nWords, embedding_dim, input_length=seq_len))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(nWords,activation='softmax'))
opt_adam = optimizers.adam(lr=0.001)
model.compile(loss='categorical_crossentropy',optimizer=opt_adam,metrics=['accuracy'])
model.summary()
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [None]:
model = Sequential()
model.add(Embedding(nWords, embedding_dim, input_length=seq_len))
model.add(SeparableConv1D(32, 9, activation='relu'))
model.add(MaxPooling1D(1))
model.add(SeparableConv1D(32, 1, activation='relu'))
model.add(Bidirectional(GRU(32, dropout=0.1, recurrent_dropout=0.5, return_sequences=True)))
model.add(GRU(32, dropout=0.1, recurrent_dropout=0.5,return_sequences=True))
model.add(GRU(32, dropout=0.1, recurrent_dropout=0.5))
model.add(Dense(nWords,activation='softmax'))
opt_adam = optimizers.adam(lr=0.001)
model.compile(loss='categorical_crossentropy',optimizer=opt_adam,metrics=['accuracy'])
model.summary()
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

#model.add(Dropout(0.2))



In [9]:
from pickle import dump
from pickle import load
from keras.models import load_model
path = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5'
checkpoint = ModelCheckpoint(path, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(train_inputs,train_targets,batch_size=32,epochs=10,verbose=1,callbacks=[checkpoint])
model.save('word_pred_Model4.h5')
dump(tokenizer,open('tokenizer_Model4','wb')) 

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  'Discrepancy between trainable weights and collected trainable'




Epoch 1/10

Epoch 00001: loss improved from inf to 7.25602, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 2/10

Epoch 00002: loss improved from 7.25602 to 6.78342, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 3/10

Epoch 00003: loss improved from 6.78342 to 6.51140, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 4/10

Epoch 00004: loss improved from 6.51140 to 6.25164, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 5/10

Epoch 00005: loss improved from 6.25164 to 6.01565, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 6/10

Epoch 00006: loss improved from 6.01565 to 5.79666, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 7/10

Epoch 00007: loss improved from 5.79

def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
    
sampled = encoded_text?
preds = model.predict(sampled, verbose=0)[0]
next_index = sample(preds, temperature)
next_char = chars[next_index]

generated_text += next_char
generated_text = generated_text[1:]

In [10]:
def gen_sequence(model, tokenizer, seq_len, seed_text, n_words):
    output_text = []
    input_text = seed_text
    
    for i in range(n_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        encoded_text = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        prediction   = model.predict_classes(encoded_text, verbose=0)[0]
        
        pred_word    = tokenizer.index_word[prediction]
        input_text   += ' '+ pred_word
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [16]:
model = load_model('word_pred_Model4.h5')
tokenizer = load(open('tokenizer_Model4','rb'))
print('\n\n===>Enter --exit to exit from the program')
while True:
    seed_text  = input('Enter string: ')
    if seed_text.lower() == '--exit':
        break
    else:
        out = gen_sequence(model, tokenizer, seq_len=seq_len, seed_text=seed_text, n_words=3)
        print('Output: '+' '+out)

NameError: name 'load_model' is not defined

In [None]:
tokens[0:50]