In [1]:
import keras
import re
import nltk
import numpy as np
from keras import preprocessing
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import SeparableConv1D, MaxPooling1D
from keras.layers import LSTM, GRU
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Flatten
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint


keras.__version__

Using TensorFlow backend.


'2.2.4'

# Data Loading

In [2]:
trainFromTextFile = "train.FROM"
trainToTextFile   = "train.TO"
trainFromText     = open(trainFromTextFile, 'r', encoding='utf-8').read().lower()
trainToText       = open(trainToTextFile, 'r', encoding='utf-8').read().lower()
trainFromSentenceTokens = re.split('\n', trainFromText)
trainToSentenceTokens   = re.split('\n', trainToText)
trainFromWordTokens = re.split(' |\n', trainFromText)
trainToWordTokens   = re.split(' |\n', trainToText)
print(trainFromSentenceTokens[1])
print(trainToSentenceTokens[1])
print('Found %s sentences from TrainFrom Text' %len(trainFromSentenceTokens))
print('Found %s sentences from TrainTo Text' %len(trainToSentenceTokens))
print('Found %s words from TrainFrom Text' %len(trainFromWordTokens))
print('Found %s words from TrainTo Text' %len(trainToWordTokens))

is it me, or does 'frank moss' look like a child molester?
to me, it looks more like he 'takes a wide stance' on public restroom sex.
Found 29620 sentences from TrainFrom Text
Found 29620 sentences from TrainTo Text
Found 521666 words from TrainFrom Text
Found 479824 words from TrainTo Text


# Tokenization

In [3]:
train_len = 100    # We will cut reviews after 100 words
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizerData = Tokenizer(num_words=max_words)
tokenizerData.fit_on_texts(trainFromSentenceTokens)
sequences = tokenizerData.texts_to_sequences(trainFromSentenceTokens)

sequences = pad_sequences(sequences, maxlen=train_len)  #Pad so all the arrays are the same size

Dataword_index = tokenizerData.word_index
Dataword_count = tokenizerData.word_counts
nWordsData     = len(tokenizerData.word_counts) + 1


print('Found %s words.' %len(trainFromWordTokens))
print('Found %s sentences.' %len(trainFromSentenceTokens))
print('Found %s.' %len(sequences))
print('Found %s unique tokens.' % len(Dataword_index))
print('Found %s unique words.' % len(Dataword_count))

Found 521666 words.
Found 29620 sentences.
Found 29620.
Found 34823 unique tokens.
Found 34823 unique words.


In [4]:
tokenizerTarget = Tokenizer(num_words=max_words)
tokenizerTarget.fit_on_texts(trainToSentenceTokens)
sequencesTarget = tokenizerTarget.texts_to_sequences(trainToSentenceTokens)

sequencesTarget = pad_sequences(sequencesTarget, maxlen=train_len)  #Pad so all the arrays are the same size

Targetword_index = tokenizerTarget.word_index
Targetword_count = tokenizerTarget.word_counts
nWordsTarget     = len(tokenizerTarget.word_counts) + 1


print('Found %s words.' %len(trainToWordTokens))
print('Found %s sentences.' %len(trainToSentenceTokens))
print('Found %s.' %len(sequencesTarget))
print('Found %s unique tokens.' % len(Targetword_index))
print('Found %s unique words.' % len(Targetword_count))

Found 479824 words.
Found 29620 sentences.
Found 29620.
Found 34369 unique tokens.
Found 34369 unique words.


In [5]:
train_data = sequences
train_target = sequencesTarget

train_data.shape

(29620, 100)

In [6]:
train_target.shape

(29620, 100)

In [8]:
train_target[2]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   15,   22,   13, 1086,
       5207,    5,   65,  574, 5892, 1388, 5208,    6,  206,  615,    3,
        292,   13, 1086, 1439,    1,  574, 1439,  335,  206,  615,    3,
        292,  699, 6846,   92,   62,    1,  574,  307,   20,  270,   13,
        574, 5207,    5,  103,   46,   53, 2008,  402, 3125, 2496,    4,
       1880])

In [9]:
seq_len = train_data.shape[1]

In [10]:
seq_len

100

# Embedding Layer

In [11]:
glove_dir = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lab Exercises\Machine Learning Projects\glove.6B\glove.6B.300d.txt'

embeddings_index = {}
f = open(glove_dir, encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [12]:
# The Embedding layer takes at least two arguments:
# the number of possible tokens, the maximum word index,
# and the dimensionality of the embeddings, here 300.
embedding_dim = 300

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in Dataword_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

# Model Creation

In [13]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=seq_len))
model.add(SeparableConv1D(32, 9, activation='relu'))
model.add(MaxPooling1D(1))
model.add(SeparableConv1D(32, 1, activation='relu'))
model.add(Bidirectional(GRU(32, dropout=0.1, recurrent_dropout=0.5, return_sequences=True)))
model.add(GRU(32, dropout=0.1, recurrent_dropout=0.5,return_sequences=True))
model.add(GRU(32, dropout=0.1, recurrent_dropout=0.5))
model.add(Dense(seq_len, activation='softmax'))
opt_adam = optimizers.adam(lr=0.001)
model.compile(loss='categorical_crossentropy',optimizer=opt_adam,metrics=['accuracy'])
model.summary()
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
separable_conv1d_1 (Separabl (None, 92, 32)            12332     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 92, 32)            0         
_________________________________________________________________
separable_conv1d_2 (Separabl (None, 92, 32)            1088      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 92, 64)            12480     
_________________________________________________________________
gru_2 (GRU)                  (None, 92, 32)            9312      
____________________

In [14]:
from pickle import dump
from pickle import load
from keras.models import load_model
path = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model6.h5'
checkpoint = ModelCheckpoint(path, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(train_target,train_data,batch_size=32,epochs=10,callbacks=[checkpoint])
model.save('word_pred_Model6.h5')
dump(tokenizer,open('tokenizer_Model6','wb')) 

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  'Discrepancy between trainable weights and collected trainable'




Epoch 1/10

Epoch 00001: loss improved from inf to 46284.44334, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model6.h5
Epoch 2/10

Epoch 00002: loss improved from 46284.44334 to 45726.82571, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model6.h5
Epoch 3/10

Epoch 00003: loss did not improve from 45726.82571
Epoch 4/10

Epoch 00004: loss did not improve from 45726.82571
Epoch 5/10

Epoch 00005: loss improved from 45726.82571 to 45715.77208, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model6.h5
Epoch 6/10

KeyboardInterrupt: 

In [None]:
model.fit(train_target,train_data,batch_size=32,epochs=10,callbacks=[checkpoint])

In [None]:
train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]

train_targets = to_categorical(train_targets, num_classes=vocabulary_size+1)
seq_len = train_inputs.shape[1]
train_inputs.shape