In [9]:
# IPython shell escape: list the contents of the local data directory
!ls ~/workspace/ipython_notebooks/data

GoogleNews-vectors-negative300.bin [1m[36mproduct_research[m[m
[1m[36mcats_n_dogs[m[m                        [1m[36msentiment_labelled_sentences[m[m
housing.data.txt                   [1m[36mstocks[m[m


In [10]:
from gensim.models import KeyedVectors
# Pre-trained word embeddings model based on the Google News corpus.
# Note: this is a 3.4G file, so loading it takes a while.
google_news_vectors_file = '~/workspace/ipython_notebooks/data/GoogleNews-vectors-negative300.bin'

word2vec = KeyedVectors.load_word2vec_format(
    google_news_vectors_file,
    binary=True)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def compare(a, b):
    """Return the cosine similarity between the word2vec vectors of two words.

    a, b -- single words, each looked up with ``word2vec.word_vec``.
    """
    def as_row(word):
        # reshape the 1-D word vector into a single-row 2-D array
        return word2vec.word_vec(word).reshape(1, -1)

    similarity = cosine_similarity(as_row(a), as_row(b))
    # both inputs have exactly one row, so the result has a single entry;
    # summing it collapses the matrix to a scalar
    return similarity.sum()

In [16]:
# sanity check: cosine similarity between the vectors of two related words
compare('king', 'queen')

0.65109581

In [18]:
import numpy as np

def average_vec(sentence):
    """Return the average word2vec vector of the words in ``sentence``.

    The sentence is split on whitespace and each word is looked up in the
    module-level ``word2vec`` model.  Words the model does not contain are
    skipped rather than aborting the whole sentence with a KeyError.

    Returns a 2-D array with a single row (shape ``(1, dim)``), which is the
    form ``cosine_similarity`` is called with elsewhere in this notebook.

    Raises ValueError if the sentence is empty or none of its words is known
    to the model (otherwise the average would be a division by zero).
    """
    word_vecs = [word2vec.word_vec(w) for w in sentence.split() if w in word2vec]
    if not word_vecs:
        raise ValueError('no known words in sentence: %r' % (sentence,))
    # element-wise sum of all word vectors divided by their count
    return (np.array(word_vecs).sum(axis=0) / len(word_vecs)).reshape(1, -1)

def compare(a, b):
    """Return the cosine similarity between two sentences.

    Each sentence is reduced to its average word vector with ``average_vec``
    before the similarity is computed.  This definition replaces the
    single-word ``compare`` from the earlier cell.
    """
    # both arguments are single-row arrays, so the similarity matrix has one
    # entry; .sum() collapses it to a scalar
    return cosine_similarity(average_vec(a), average_vec(b)).sum()

# print(...) with a single argument gives the same output on Python 2 and 3
print(compare('Quick fox jumps over dog', 'Fast fox jumps over puppy'))
# 0.89086312 - basically the same

print(compare('Quick fox jumps over dog', 'Fast animal jumps over another one'))
# 0.71557522 - quite similar

print(compare('Fruit fell from the tree', 'An apple has fallen'))
# 0.51716453 - still significant similarity - even though there is not a single shared word

print(compare('Quick fox jumps over dog', 'The judge entered the courtroom'))
# 0.199475 - unrelated sentences score much lower

0.890863
0.715575
0.517165
0.199475


In [19]:
# word analogy: which word is closest to king + woman - man?
word2vec.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=1)

[(u'queen', 0.7118192315101624)]

In [20]:
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding
from keras.models import Model
import numpy as np

# Step 1: tokenize the text, i.e. turn each sequence of words into a
# sequence of numbers.

vocabulary_size = 10000  # number of supported words, size of our vocabulary
MAX_SENTENCE_LENGTH = 10

texts = [
    'This is our super simple',
    'Corpus of texts we will process',
    'IRL you would load this data from somewhere'
]

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts)  # fit the tokenizer on the texts we will process
sequences = tokenizer.texts_to_sequences(texts)  # the conversion to tokens happens here

word_index = tokenizer.word_index  # word -> word id, used below to fill the embedding matrix

# pad the sequences so that all of them have equal size
data = pad_sequences(sequences, maxlen=MAX_SENTENCE_LENGTH)

# let's use pre-trained word2vec again
word2vec = KeyedVectors.load_word2vec_format(
    '../input/GoogleNews-vectors-negative300.bin',
    binary=True)
VECTOR_DIMENSION = 300

# word to vec - maps word id (from tokenizer) into vector space;
# rows of words without a pre-trained vector stay all-zero
embedding_matrix = np.zeros((vocabulary_size, VECTOR_DIMENSION))

# fill this matrix with values from pre-trained word2vec
for word, i in word_index.items():
    if i >= vocabulary_size:
        # word_index is not capped at vocabulary_size, but the matrix only
        # has vocabulary_size rows - skip ids that would index past its end
        continue
    # `in` on the model itself works across gensim versions
    # (the `.vocab` attribute is not available in gensim 4)
    if word in word2vec:
        embedding_matrix[i] = word2vec.word_vec(word)


# Embedding layer: looks up the pre-trained vector for every word id.
embedding_layer = Embedding(
    vocabulary_size,                   # how many words are mapped into vectors
    VECTOR_DIMENSION,                  # size of output vector dimension (pre-trained vectors have 300 values)
    weights=[embedding_matrix],        # weights are initialized from the pre-trained model
    input_length=MAX_SENTENCE_LENGTH,  # how many words in the sentence we process
    trainable=False)                   # this layer will not be updated

lstm_output_size = 30  # number of outputs
lstm_layer = LSTM(lstm_output_size)

# the input takes one sentence as MAX_SENTENCE_LENGTH int32 word ids
sentence_input = Input(shape=(MAX_SENTENCE_LENGTH,), dtype='int32')
embedded_sentence = embedding_layer(sentence_input)
# note: from here on `lstm_layer` names the result of applying the LSTM to
# the embedded sentence, no longer the layer object itself
lstm_layer = lstm_layer(embedded_sentence)

# all deep layers are added here - let's say we have a single one
size_of_dense = 10
deep_layer = Dense(size_of_dense, activation='sigmoid')(lstm_layer)

# output layer, assuming a binary classification task
prediction = Dense(1, activation='sigmoid')(deep_layer)

model = Model(inputs=[sentence_input], outputs=prediction)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'])

Using Theano backend.
