In [2]:
# Widen the notebook's `.container` element to the full page width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import tensorflow as tf

import numpy as np
import re
import string

In [4]:
# Fetch the Shakespeare corpus through Keras' download helper and keep the
# local path it returns.
path_to_file = keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [5]:
# Read the whole file and split it into lines (line terminators removed).
# The encoding is pinned so decoding does not depend on the platform's
# locale default.
with open(path_to_file, encoding='utf-8') as f:
    corpus = f.read().splitlines()
len(corpus)

40000

In [6]:
# Peek at the first five raw lines; blank separator lines are still present here.
corpus[:5]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 '',
 'All:',
 'Speak, speak.']

In [7]:
# Drop the empty strings left behind by blank lines in the file.
corpus = list(filter(None, corpus))

In [8]:
# Glue all lines into one string, then split it into a flat list of words
# (the preview below shows them lower-cased with punctuation removed).
joined_corpus = ' '.join(corpus)
corpus_text = text_to_word_sequence(joined_corpus)

In [9]:
# First ten lines after the empty strings have been filtered out.
corpus[:10]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 'All:',
 'Speak, speak.',
 'First Citizen:',
 'You are all resolved rather to die than to famish?',
 'All:',
 'Resolved. resolved.',
 'First Citizen:',
 'First, you know Caius Marcius is chief enemy to the people.']

In [10]:
# First five entries of the flattened word list.
corpus_text[:5]

['first', 'citizen', 'before', 'we', 'proceed']

In [11]:
# Keras Tokenizer created with its default settings; it is fitted on the word list afterwards.
tokenizer = Tokenizer()

In [12]:
# Build the vocabulary. `corpus_text` is a flat list of words, so every word
# is handed to the tokenizer as its own one-word text.
tokenizer.fit_on_texts(corpus_text)

In [13]:
# String -> id lookup. Work on a copy so that registering the padding token
# does not silently mutate the tokenizer's own `word_index`.
stoi = dict(tokenizer.word_index)
# Word ids start at 1 (see the vocab sample printed below), so id 0 is free
# to represent padding.
stoi['<pad>'] = 0
# Inverse lookup: id -> string.
itos = {index: word for word, index in stoi.items()}

In [14]:
# Encode every corpus line as the list of vocabulary ids of its words.
numericalized = []
for line in corpus:
    tokens = text_to_word_sequence(line)
    numericalized.append([stoi[token] for token in tokens])

In [15]:
# Model / data hyper-parameters.
vocab_size = len(stoi)   # number of entries in the string -> id map
embed_size = 100         # dimensionality of each word vector
window_size = 2          # context words taken on each side of the target

# Quick sanity check of the vocabulary.
vocab_sample = list(stoi.items())[:5]
print(f"Vocab Size  : {vocab_size}")
print(f"Vocab Sample: {vocab_sample}")

Vocab Size  : 12633
Vocab Sample: [('the', 1), ('and', 2), ('to', 3), ('i', 4), ('of', 5)]


In [16]:
def generate_context_target_paris(corpus, window_size, vocab_size):
    """Yield one CBOW training pair per word of every sentence in `corpus`.

    Args:
        corpus: iterable of sentences, each an indexable sequence of word ids.
        window_size: number of neighbouring words taken on each side of the
            target word.
        vocab_size: number of classes passed to `to_categorical` for the
            one-hot target.

    Yields:
        (x, y) where `x` is the result of `pad_sequences` on the single
        context list with `maxlen=2 * window_size`, and `y` is the result of
        `to_categorical` on the single target id. Context windows that are
        clipped by a sentence boundary are shorter than `maxlen` and are
        therefore padded by `pad_sequences` (Keras defaults).
    """
    padded_width = 2 * window_size

    for tokens in corpus:
        n_tokens = len(tokens)

        for center, target in enumerate(tokens):
            # Clamp the window to the sentence instead of testing every
            # candidate position individually.
            left = max(center - window_size, 0)
            right = min(center + window_size + 1, n_tokens)

            # Neighbours on both sides, skipping the target position itself.
            neighbours = [
                tokens[pos]
                for pos in (*range(left, center), *range(center + 1, right))
            ]

            # Both Keras helpers expect a batch, hence the one-element lists.
            x = pad_sequences([neighbours], maxlen=padded_width)
            y = to_categorical([target], vocab_size)
            yield x, y

In [17]:
# Print the first ten (target, context) pairs whose context window is
# completely filled, i.e. contains no padding id 0.
# The generator is driven by the shared `window_size` constant rather than a
# hard-coded 2, so this preview always matches what the model is built for.
max_examples = 10
shown = 0
for x, y in generate_context_target_paris(numericalized, window_size, vocab_size):
    if 0 in x[0]:
        continue  # window was clipped by a sentence boundary and got padded

    print(f'Target: {itos[np.argmax(y)]}, Context: {[itos[w] for w in x[0]]}')

    shown += 1
    if shown == max_examples:
        break  # stop right away instead of pulling one more pair first

Target: proceed, Context: ['before', 'we', 'any', 'further']
Target: any, Context: ['we', 'proceed', 'further', 'hear']
Target: further, Context: ['proceed', 'any', 'hear', 'me']
Target: hear, Context: ['any', 'further', 'me', 'speak']
Target: all, Context: ['you', 'are', 'resolved', 'rather']
Target: resolved, Context: ['are', 'all', 'rather', 'to']
Target: rather, Context: ['all', 'resolved', 'to', 'die']
Target: to, Context: ['resolved', 'rather', 'die', 'than']
Target: die, Context: ['rather', 'to', 'than', 'to']
Target: than, Context: ['to', 'die', 'to', 'famish']


In [17]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# CBOW network: embed the context word ids, average the embeddings over the
# context axis, then predict the target word with a softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2))
model.add(Lambda(lambda embedded: keras.backend.mean(embedded, axis=1), output_shape=(embed_size,)))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [18]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            1263300   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 12633)             1275933   
Total params: 2,539,233
Trainable params: 2,539,233
Non-trainable params: 0
_________________________________________________________________


In [None]:
# NOTE(review): this cell originally read `x, y = np.narray`, which raises
# AttributeError because NumPy has no attribute `narray`. Presumably the intent
# was to materialise (context, target) pairs as NumPy arrays -- confirm.
# Only a bounded number of pairs is stacked: every target row is a dense
# one-hot vector of length `vocab_size`, so stacking the whole corpus would be
# very large.
n_pairs = 256
pairs = [
    pair
    for _idx, pair in zip(
        range(n_pairs),
        generate_context_target_paris(numericalized, window_size, vocab_size),
    )
]
x = np.concatenate([context for context, _target in pairs])
y = np.concatenate([target for _context, target in pairs])
x.shape, y.shape

In [26]:
from tensorflow.keras.utils import plot_model 
# Render the model graph top-to-bottom with tensor shapes and layer names.
# Requires pydot and graphviz to be installed (see the message below).
plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=True,
)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')
