In [14]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
# Widen pandas' column display limit to 200 so the longer documents are shown in full in the tables below.
pd.options.display.max_colwidth = 200
%matplotlib inline

In [15]:
# Toy corpus: each document is kept next to its category so the two cannot drift apart.
labelled_docs = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
]

# Parallel lists used by the rest of the notebook.
corpus = [document for document, category in labelled_docs]
labels = [category for document, category in labelled_docs]

# Two-column frame (Document, Category) for display.
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beautiful today,weather
7,The dog is lazy but the brown fox is quick!,animals


# Preprocessing

In [16]:
# Fetch NLTK's stopword lists; the English list is read through
# nltk.corpus.stopwords when stop_words is built in the next cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Module-level helpers read by normalize_document below.
wpt = nltk.WordPunctTokenizer()  # tokenizer applied to each cleaned document
stop_words = nltk.corpus.stopwords.words('english')  # English stopwords to drop (uses the 'stopwords' download above)

def normalize_document(doc):
    """Normalize one document into a space-joined string of content words.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lower-case and strip the text, tokenize it with the module-level ``wpt``
    tokenizer, and remove tokens found in the module-level ``stop_words``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so the previous positional `re.I|re.A` (== 258) was
    # silently treated as "replace at most 258 matches" and no flags were
    # applied, leaving special characters in longer documents.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# Element-wise wrapper so a whole collection of documents can be normalized in one call.
normalize_corpus = np.vectorize(normalize_document)

# Plain-Python pass over the toy corpus, giving a list of normalized documents.
norm_corpus = [normalize_document(document) for document in corpus]
norm_corpus

['sky blue beautiful',
 'love blue beautiful sky',
 'quick brown fox jumps lazy dog',
 'kings breakfast sausages ham bacon eggs toast beans',
 'love green eggs ham sausages bacon',
 'brown fox quick blue dog lazy',
 'sky blue sky beautiful today',
 'dog lazy brown fox quick']

In [18]:
# Same normalization through the np.vectorize wrapper. This rebinds norm_corpus
# from the list built in the previous cell to the wrapper's array result.
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

In [19]:
# Fetch the Gutenberg corpus; 'bible-kjv.txt' is read from it below via gutenberg.sents().
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [20]:
# Fetch the 'punkt' resource.
# NOTE(review): presumably needed by gutenberg.sents() for sentence splitting — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
from nltk.corpus import gutenberg
from string import punctuation

bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'

# Lower-case every token, drop those found in remove_terms (a substring test
# against the punctuation/digit string), and glue each sentence back together.
joined_sents = [
    ' '.join(word.lower() for word in sent if word not in remove_terms)
    for sent in bible
]

# Run the shared normalizer, then keep only non-empty sentences that still
# contain more than two words.
norm_bible = [
    tok_sent
    for tok_sent in normalize_corpus(joined_sents)
    if tok_sent and len(tok_sent.split()) > 2
]

print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


# Word2Vec: CBOW

## 1. Build the corpus vocabulary

In [22]:
# Flat list of every token in the normalized sentences, in corpus order
# (repeats are kept). Joining on a space and re-splitting on whitespace yields
# the same tokens as splitting each sentence separately.
vocabulary = ' '.join(norm_bible).split()
vocabulary

['king',
 'james',
 'bible',
 'old',
 'testament',
 'king',
 'james',
 'bible',
 'first',
 'book',
 'moses',
 'called',
 'genesis',
 'beginning',
 'god',
 'created',
 'heaven',
 'earth',
 'earth',
 'without',
 'form',
 'void',
 'darkness',
 'upon',
 'face',
 'deep',
 'spirit',
 'god',
 'moved',
 'upon',
 'face',
 'waters',
 'god',
 'said',
 'let',
 'light',
 'light',
 'god',
 'saw',
 'light',
 'good',
 'god',
 'divided',
 'light',
 'darkness',
 'god',
 'called',
 'light',
 'day',
 'darkness',
 'called',
 'night',
 'evening',
 'morning',
 'first',
 'day',
 'god',
 'said',
 'let',
 'firmament',
 'midst',
 'waters',
 'let',
 'divide',
 'waters',
 'waters',
 'god',
 'made',
 'firmament',
 'divided',
 'waters',
 'firmament',
 'waters',
 'firmament',
 'god',
 'called',
 'firmament',
 'heaven',
 'evening',
 'morning',
 'second',
 'day',
 'god',
 'said',
 'let',
 'waters',
 'heaven',
 'gathered',
 'together',
 'unto',
 'one',
 'place',
 'let',
 'dry',
 'land',
 'appear',
 'god',
 'called',
 'd

In [23]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

# Fit a Keras tokenizer on the normalized sentences to get the word -> integer id mapping.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
# NOTE(review): word2id is bound to tokenizer.word_index without a copy, so the
# 'PAD' insertion below presumably also lands in the tokenizer's own index — confirm.
word2id = tokenizer.word_index
print(word2id)  # printed before 'PAD' is added; dumps the whole mapping

# build vocabulary of unique words
# Reserve id 0 for padding; assumes the tokenizer never assigns id 0 to a real word — TODO confirm.
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}  # reverse lookup: id -> word
# Encode every normalized sentence as a list of word ids.
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print(wids)  # dumps every encoded sentence

vocab_size = len(word2id)  # measured after the 'PAD' insertion, so that entry is counted
embed_size = 100  # embedding width; used as output_dim of the Embedding layer in the model cell
window_size = 2 # number of context words taken on each side of the target

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

[[13, 1154, 5766], [154, 2450, 13, 1154, 5766], [132, 310, 63, 86, 8480], [582, 6, 1180, 94, 47], [47, 136, 1883, 1884, 396, 10, 144, 860], [111, 6, 759, 10, 144, 212], [6, 8, 27, 232, 232], [6, 101, 232, 75, 6, 826, 232, 396], [6, 86, 232, 23, 396, 86, 197], [926, 287, 132, 23], [6, 8, 27, 2351, 161, 212, 27, 1096, 212, 212], [6, 32, 2351, 826, 212, 2351, 212, 2351], [6, 86, 2351, 94], [926, 287, 363, 23], [6, 8, 27, 212, 94, 237, 117, 2, 20, 76, 27, 796, 24, 1011], [6, 86, 796, 24, 47, 3075, 117, 212, 86, 1827, 6, 101, 75], [6, 8, 27, 47, 74, 56, 891, 2199, 3942, 223, 308, 318, 3942, 308, 1181, 193, 223, 10, 47], [47, 62, 56, 891, 2199, 3942, 223, 1181, 318, 3942, 308, 193, 223, 1181, 6, 101, 75], [926, 287, 343, 23], [6, 8, 27, 3235, 2351, 94, 1096, 23, 197, 27, 1039, 2930, 61, 106, 27, 3235, 2351, 94, 57, 232, 10, 47], [6, 32, 64, 49, 3235, 740, 232, 845, 23, 5767, 232, 845, 197, 32, 1055, 22], [6, 78, 2351, 94, 57, 232, 10, 47, 845, 23, 197, 1096, 232, 396, 6, 101, 75], [926, 287,

## 2. Build a CBOW (context, target) generator

In [24]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one CBOW training pair per word of every sentence in ``corpus``.

    Parameters
    ----------
    corpus : iterable of sequences of int
        Sentences already encoded as word ids.
    window_size : int
        Number of neighbours taken on each side of the target word.
    vocab_size : int
        Number of classes used when one-hot encoding the target.

    Yields
    ------
    (x, y)
        ``x`` is the neighbouring ids of the target passed through
        ``sequence.pad_sequences`` with ``maxlen=window_size*2`` (a batch of
        one row); ``y`` is the target id encoded by
        ``np_utils.to_categorical`` with ``vocab_size`` classes.
    """
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            # Clamp the window to the sentence so out-of-range positions are skipped.
            first = max(index - window_size, 0)
            last = min(index + window_size + 1, sentence_length)
            # Neighbours on both sides of the target, excluding the target itself.
            neighbours = list(words[first:index]) + list(words[index + 1:last])

            x = sequence.pad_sequences([neighbours], maxlen=context_length)
            y = np_utils.to_categorical([word], vocab_size)
            yield (x, y)
            
            
# Preview the first 11 (context, target) pairs, mapped back from ids to words.
sample_pairs = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
for i, (x, y) in enumerate(sample_pairs):
    context = [id2word[w] for w in x[0]]
    # The position of the non-zero entry in the one-hot row is the target's id.
    target = id2word[np.argwhere(y[0])[0][0]]
    print('Context (X):', context, '-> Target (Y):', target)

    if i == 10:
        break

Context (X): ['PAD', 'PAD', 'james', 'bible'] -> Target (Y): king
Context (X): ['PAD', 'PAD', 'king', 'bible'] -> Target (Y): james
Context (X): ['PAD', 'PAD', 'king', 'james'] -> Target (Y): bible
Context (X): ['PAD', 'PAD', 'testament', 'king'] -> Target (Y): old
Context (X): ['PAD', 'old', 'king', 'james'] -> Target (Y): testament
Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['PAD', 'testament', 'king', 'bible'] -> Target (Y): james
Context (X): ['PAD', 'PAD', 'king', 'james'] -> Target (Y): bible
Context (X): ['PAD', 'PAD', 'book', 'moses'] -> Target (Y): first
Context (X): ['PAD', 'first', 'moses', 'called'] -> Target (Y): book
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses


## 3. Build the CBOW model architecture

In [28]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# Build the CBOW architecture: embed the context word ids, average the
# embeddings, and predict the target word with a softmax over the vocabulary.
cbow = Sequential()
# Each sample carries window_size*2 context word ids (left and right neighbours).
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
# Average over axis 1 (the context positions) to get one embed_size vector per sample.
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# View the model summary. summary() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
cbow.summary()



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1242500   
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12425)             1254925   
Total params: 2,497,425
Trainable params: 2,497,425
Non-trainable params: 0
_________________________________________________________________
None


## 4. Train the model

In [None]:
# Train for 5 epochs, feeding one (context, target) pair per train_on_batch call
# and accumulating the returned loss over the epoch.
for epoch in range(1, 6):
    loss = 0.0
    pair_stream = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for i, (x, y) in enumerate(pair_stream, start=1):
        loss += cbow.train_on_batch(x, y)
        # Progress message every 100,000 pairs.
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()