In [2]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

In [3]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

In [1]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


In [4]:
import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import gutenberg
from string import punctuation

In [30]:
from sklearn.metrics.pairwise import euclidean_distances

In [5]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [6]:
bible = gutenberg.sents('bible-kjv.txt') 
remove_terms = punctuation + '0123456789'

norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


In [7]:
len(norm_bible)

29251

In [8]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

In [9]:
word2id = tokenizer.word_index

In [10]:
word2id['PAD'] = 0

In [20]:
word2id

{'shall': 1,
 'unto': 2,
 'lord': 3,
 'thou': 4,
 'thy': 5,
 'god': 6,
 'ye': 7,
 'said': 8,
 'thee': 9,
 'upon': 10,
 'man': 11,
 'israel': 12,
 'king': 13,
 'son': 14,
 'hath': 15,
 'people': 16,
 'came': 17,
 'house': 18,
 'come': 19,
 'one': 20,
 'children': 21,
 'also': 22,
 'day': 23,
 'land': 24,
 'men': 25,
 'shalt': 26,
 'let': 27,
 'go': 28,
 'hand': 29,
 'saying': 30,
 'us': 31,
 'made': 32,
 'even': 33,
 'went': 34,
 'behold': 35,
 'saith': 36,
 'every': 37,
 'therefore': 38,
 'things': 39,
 'father': 40,
 'sons': 41,
 'hast': 42,
 'david': 43,
 'make': 44,
 'say': 45,
 'may': 46,
 'earth': 47,
 'jesus': 48,
 'great': 49,
 'name': 50,
 'thine': 51,
 'away': 52,
 'put': 53,
 'among': 54,
 'thereof': 55,
 'forth': 56,
 'give': 57,
 'neither': 58,
 'take': 59,
 'city': 60,
 'days': 61,
 'brought': 62,
 'moses': 63,
 'two': 64,
 'heart': 65,
 'pass': 66,
 'judah': 67,
 'jerusalem': 68,
 'according': 69,
 'know': 70,
 'took': 71,
 'thus': 72,
 'offering': 73,
 'bring': 74,
 'goo

In [11]:
len(word2id)

12425

In [12]:
# build vocabulary of unique words
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

In [13]:
wids[:10]

[[13, 1154, 5766],
 [154, 2450, 13, 1154, 5766],
 [132, 310, 63, 86, 8480],
 [582, 6, 1180, 94, 47],
 [47, 136, 1883, 1884, 396, 10, 144, 860],
 [111, 6, 759, 10, 144, 212],
 [6, 8, 27, 232, 232],
 [6, 101, 232, 75, 6, 826, 232, 396],
 [6, 86, 232, 23, 396, 86, 197],
 [926, 287, 132, 23]]

In [14]:
vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 12425
Vocabulary Sample: [('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5), ('god', 6), ('ye', 7), ('said', 8), ('thee', 9), ('upon', 10)]


In [15]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [16]:
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    print(x)
    if i == 3:
            break
    i += 1

[[   0    0 1154 5766]]
[[   0    0   13 5766]]
[[   0    0   13 1154]]
[[   0    0 2450   13]]


In [17]:
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    print(y)
    if i == 3:
            break
    i += 1

[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]


In [18]:
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses
Context (X): ['beginning', 'god', 'heaven', 'earth'] -> Target (Y): created
Context (X): ['earth', 'without', 'void', 'darkness'] -> Target (Y): form
Context (X): ['without', 'form', 'darkness', 'upon'] -> Target (Y): void
Context (X): ['form', 'void', 'upon', 'face'] -> Target (Y): darkness
Context (X): ['void', 'darkness', 'face', 'deep'] -> Target (Y): upon
Context (X): ['spirit', 'god', 'upon', 'face'] -> Target (Y): moved
Context (X): ['god', 'moved', 'face', 'waters'] -> Target (Y): upon
Context (X): ['god', 'said', 'light', 'light'] -> Target (Y): let
Context (X): ['god', 'saw', 'good', 'god'] -> Target (Y): light


In [19]:
# build CBOW architecture
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1242500   
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12425)             1254925   
Total params: 2,497,425
Trainable params: 2,497,425
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# visualize model structure
# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, 
#                  rankdir='TB').create(prog='dot', format='svg'))

In [27]:
for epoch in range(1, 2):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 1 	Loss: 4099958.122513528



In [29]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(12424, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
unto,-0.461066,-0.964601,0.436668,0.202142,-0.97475,0.839819,0.238647,-0.025719,0.822244,-0.875568,...,-0.839469,-0.533434,-1.075048,-0.882867,-0.769153,-0.420639,-0.253642,-0.645419,-0.892402,0.816119
lord,0.473712,-0.576151,0.14733,-0.013239,-1.615158,0.074599,0.224253,0.004996,0.455954,-0.17691,...,-0.386032,-0.054538,-1.606789,-0.229587,-0.40069,-0.40105,0.050411,0.225033,-0.365109,0.638631
thou,0.454939,-1.020027,0.196836,-0.788999,-0.453954,0.630345,0.370279,-0.3527,1.001256,-0.11228,...,-0.221915,-1.081493,-0.08998,-1.151293,-0.800675,-1.134245,-0.702329,-0.445486,-1.456105,0.829594
thy,-0.859935,-0.301802,-0.030513,0.179587,-0.960252,0.539878,0.021657,0.224857,0.945117,-0.237649,...,-0.738436,-0.053617,-0.949343,0.151139,-0.036391,-0.823198,-0.390817,-0.406609,-0.438572,0.788355
god,-0.117875,-0.172774,0.200754,-0.635625,-0.359195,0.023573,0.442141,-0.008112,0.531654,-0.720741,...,-0.782094,-0.40842,-0.605109,-0.507339,-0.609831,-0.057626,0.230168,-0.205688,-0.547646,0.375396


In [31]:
# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses','famine']}

similar_words

(12424, 12424)


{'god': ['also', 'things', 'therefore', 'might', 'even'],
 'jesus': ['christ', 'peter', 'john', 'disciples', 'pharisees'],
 'noah': ['enoch', 'sixty', 'mahalaleel', 'methuselah', 'ninety'],
 'egypt': ['wilderness', 'numbered', 'war', 'back', 'poor'],
 'john': ['peter', 'ship', 'pharisees', 'others', 'galilee'],
 'gospel': ['churches', 'church', 'preached', 'timotheus', 'resurrection'],
 'moses': ['loved', 'concerning', 'sought', 'died', 'gone'],
 'famine': ['strangers', 'empty', 'least', 'wonders', 'divided']}