In [23]:
import re
import string

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.callbacks as callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.utils.np_utils import to_categorical
from nltk import word_tokenize, sent_tokenize
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import plot_model
from tqdm import tqdm

In [24]:
# Location of the raw text corpus (a path relative to the working directory).
path_to_file = 'data/text.txt'

In [25]:
# Read the corpus as a list of lines with the newline characters stripped.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
with open(path_to_file, encoding='utf-8') as f:
    corpus = f.read().splitlines()
len(corpus)

2914

### Preprocessing 

In [26]:
# Body of a regex character class (preprocess wraps it in `[...]`): escaped
# ASCII punctuation plus `\d` for digits. Written as a raw string so that
# sequences such as `\(` and `\d` are not invalid string escapes.
# NOTE(review): the apostrophe (') is not part of this set, so it survives
# preprocessing -- confirm whether that is intended.
punctuation_string = r'!"\\#\$%\&\\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\d'

In [27]:
def preprocess(corpus):
    """Turn raw text lines into lower-cased, punctuation-free token lists.

    The lines are joined with single spaces, split into sentences with NLTK,
    lower-cased, stripped of ``<tag>``-style markers and of every character
    matched by the module-level ``punctuation_string`` class, and finally
    word-tokenized.

    Args:
        corpus: iterable of strings (lines of text).

    Returns:
        A list containing one list of token strings per detected sentence.
    """
    sentences = [sent.lower() for sent in sent_tokenize(' '.join(corpus))]
    # Raw string: `\w` inside a normal literal is an invalid escape sequence.
    sentences = [re.sub(r'<\w+>', '', sent) for sent in sentences]
    # regex_replace returns byte strings after .numpy(); decode them back to str.
    cleaned = tf.strings.regex_replace(sentences, f'[{punctuation_string}]', '').numpy()
    sentences = [sent.decode() for sent in cleaned]

    # Tokenize every cleaned sentence; tqdm reports progress.
    return [word_tokenize(sent) for sent in tqdm(sentences)]

In [32]:
# Only the first 300 lines of the corpus are preprocessed here.
tokens_list = preprocess(corpus[:300])


  0%|          | 0/8750 [00:00<?, ?it/s][A
  6%|▋         | 555/8750 [00:00<00:01, 5546.34it/s][A
 13%|█▎        | 1110/8750 [00:00<00:01, 5171.23it/s][A
 19%|█▉        | 1681/8750 [00:00<00:01, 5404.75it/s][A
 25%|██▌       | 2224/8750 [00:00<00:01, 5149.18it/s][A
 32%|███▏      | 2814/8750 [00:00<00:01, 5406.45it/s][A
 39%|███▉      | 3412/8750 [00:00<00:00, 5594.28it/s][A
 45%|████▌     | 3974/8750 [00:00<00:00, 5400.57it/s][A
 52%|█████▏    | 4517/8750 [00:00<00:00, 4806.09it/s][A
 58%|█████▊    | 5058/8750 [00:00<00:00, 4971.51it/s][A
 64%|██████▍   | 5613/8750 [00:01<00:00, 5135.03it/s][A
 71%|███████   | 6178/8750 [00:01<00:00, 5282.74it/s][A
 77%|███████▋  | 6728/8750 [00:01<00:00, 5344.93it/s][A
 83%|████████▎ | 7304/8750 [00:01<00:00, 5465.90it/s][A
 90%|█████████ | 7878/8750 [00:01<00:00, 5546.88it/s][A
100%|██████████| 8750/8750 [00:01<00:00, 5331.82it/s][A


In [33]:
# Number of tokenized sentences returned by preprocess.
len(tokens_list)

8750

In [34]:
# Inspect the tokens of the first sentence.
print(tokens_list[0])

['sol', 'yurick', 'the', 'writer', 'whose', 'novel', 'the', 'warriors', 'was', 'adapted', 'into', 'a', 'film', 'years', 'later', 'which', 'then', 'became', 'one', 'of', 'the', 'best', 'adapted', 'works', 'ever', 'in', 'video', 'gaming', 'died', 'this', 'weekend']


In [35]:
def build_vocabulary(tokens_list):
    """Build word<->id lookup tables from tokenized sentences.

    Id 0 is reserved for the '<pad>' token; every other word receives the
    next free id in order of first appearance.

    Args:
        tokens_list: iterable of token sequences (one per sentence).

    Returns:
        (stoi, itos): dict mapping word -> id and dict mapping id -> word.
    """
    stoi = {'<pad>': 0}
    itos = {0: '<pad>'}

    for sentence_tokens in tqdm(tokens_list):
        for token in sentence_tokens:
            if token in stoi:
                continue
            # Ids are dense, so the next free id equals the current table size.
            next_id = len(stoi)
            stoi[token] = next_id
            itos[next_id] = token

    return stoi, itos

In [36]:
# NOTE(review): the original note here read "remove '" -- presumably a reminder
# to strip apostrophes from the tokens, which is not done yet. TODO confirm.
stoi, itos = build_vocabulary(tokens_list)


100%|██████████| 8750/8750 [00:00<00:00, 219513.01it/s]


In [37]:
# Replace every token by its integer id; one list of ids per sentence.
numericalized = [[stoi[w] for w in sent] for sent in tokens_list]

In [38]:
# Hyper-parameters shared by the data pipeline and the model.
vocab_size = len(stoi)  # number of vocabulary entries, including '<pad>'
embed_size = 100  # embedding dimension (Embedding output_dim)
window_size = 2  # context half-width; model inputs hold window_size*2 ids

print(f"Vocab Size  : {vocab_size}")
print(f"Vocab Sample: {list(stoi.items())[:5]}")

Vocab Size  : 18856
Vocab Sample: [('<pad>', 0), ('sol', 1), ('yurick', 2), ('the', 3), ('writer', 4)]


In [44]:
# Ids of a few probe words (a KeyError is raised if a word is not in stoi).
stoi['king'], stoi['queen'], stoi['prince'], stoi['dog'] 

(2216, 3215, 9189, 2524)

In [15]:
def generate_context_target_pairs(corpus=numericalized, window_size=2, vocab_size=vocab_size):
    """Yield (context, target) training pairs for every word of every sentence.

    For each position the context is made of up to ``window_size`` ids on
    each side of the target word; shorter contexts (near sentence edges) are
    padded by ``pad_sequences`` to length ``2 * window_size``. The target is
    returned one-hot encoded over ``vocab_size`` classes.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of context words taken on each side of the target.
        vocab_size: number of classes used for the one-hot target.

    Yields:
        (x, y): padded context ids and one-hot encoded target.
    """
    padded_length = 2 * window_size

    for sentence in corpus:
        n_words = len(sentence)

        for position, target in enumerate(sentence):
            # Clamp the window to the sentence and leave out the target itself.
            first = max(position - window_size, 0)
            last = min(position + window_size + 1, n_words)
            neighbours = [sentence[j] for j in range(first, last) if j != position]

            x = pad_sequences([neighbours], maxlen=padded_length)
            y = to_categorical([target], vocab_size)
            yield (x[0], y[0])

In [16]:
# Preview the first 10 (context, target) pairs whose context needed no padding.
i = 0
for x, y in generate_context_target_pairs(numericalized, window_size, vocab_size):
    if 0 in x:  # id 0 is '<pad>': the window was cut short by a sentence edge
        continue
    if i == 10:
        break
    i += 1

    print(f'Target: {itos[np.argmax(y)]}, Context: {[itos[w] for w in x]}')

Target: before, Context: ['first', 'citizen', 'we', 'proceed']
Target: we, Context: ['citizen', 'before', 'proceed', 'any']
Target: proceed, Context: ['before', 'we', 'any', 'further']
Target: any, Context: ['we', 'proceed', 'further', 'hear']
Target: further, Context: ['proceed', 'any', 'hear', 'me']
Target: hear, Context: ['any', 'further', 'me', 'speak']
Target: you, Context: ['first', 'citizen', 'are', 'all']
Target: are, Context: ['citizen', 'you', 'all', 'resolved']
Target: all, Context: ['you', 'are', 'resolved', 'rather']
Target: resolved, Context: ['are', 'all', 'rather', 'to']


In [17]:
# Stream (context, one-hot target) pairs from the generator in batches of 100.
# Context ids are declared as int32: int16 cannot represent ids above 32767,
# which a larger vocabulary would exceed.
output_shapes = ([window_size*2],[vocab_size])
dataset = tf.data.Dataset.from_generator(generate_context_target_pairs, (tf.int32, tf.int16), output_shapes=output_shapes)
dataset = dataset.batch(100)

In [18]:
# Pull only the first batch out of the dataset to check its shapes.
for batch, (x, y) in enumerate(dataset):
    break
print("batch: ", batch)
x.shape, y.shape

batch:  0


(TensorShape([100, 4]), TensorShape([100, 12482]))

In [19]:
# CBOW model: embed the context ids, average the embeddings over the context
# axis, and predict the target word with a softmax over the whole vocabulary.
# Sequential / Embedding / Lambda / Dense come from the notebook's top-level
# tensorflow.keras imports, so the model, `keras.backend` and the callbacks
# all belong to the same Keras implementation.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2),
    # output_shape must be a tuple; `(embed_size)` is just the int itself.
    Lambda(lambda x: keras.backend.mean(x, axis=1), output_shape=(embed_size,)),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            1248200   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 12482)             1260682   
Total params: 2,508,882
Trainable params: 2,508,882
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Draw the model graph; this needs pydot and graphviz installed (see the
# message printed below when they are missing).
plot_model(model,show_shapes=True, show_layer_names=True, rankdir='TB', expand_nested=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [23]:
# Checkpoint file for the model (a relative local path, despite the name).
google_drive_path = 'models/shakespeare_local.h5'

In [25]:
# Stop training once the training loss has not improved for 10 epochs.
early_stopping_cb = callbacks.EarlyStopping(monitor='loss', patience=10, verbose=True)
# Save the model to google_drive_path whenever the training loss improves.
checkpoint_cb = callbacks.ModelCheckpoint(google_drive_path, monitor='loss', verbose=True, save_best_only=True)
# Write TensorBoard event files to the "logs" directory.
tensorboard_callback = callbacks.TensorBoard(log_dir="logs")

callbacks_list = [early_stopping_cb, checkpoint_cb, tensorboard_callback]

# NOTE(review): with epochs=1 the patience=10 early stopping can never trigger.
history = model.fit(dataset, epochs=1, callbacks=callbacks_list)


Epoch 00001: loss improved from inf to 6.39122, saving model to models/shakespeare_local.h5


In [21]:
# NOTE(review): the output recorded below (2369) disagrees with the earlier
# lookup of the same word (2216), so these cells ran against different
# vocabularies -- restart the kernel and run the notebook top to bottom.
stoi['king']

2369

In [22]:
# Id of 'queen' in the current vocabulary.
stoi['queen']

3633

In [20]:
# Id of 'citizen' in the current vocabulary.
stoi['citizen']

2

In [None]:
`