In [2]:
#!/usr/bin/python3

#
#  Copyright 2016-2018 Peter de Vocht
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import spacy
import math
from sentence2vec import Word, Sentence, sentence_to_vec

# Semantic relatedness is tested with the word vectors of spaCy's large English model.
# This assumes the large model is already installed; if it is not, download the
# release archive (the archive itself, not the release tag page) and pip install it:
# wget https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
# pip install en_core_web_lg-2.0.0.tar.gz
nlp = spacy.load('en_core_web_lg')


# euclidean distance between two vectors
def l2_dist(v1, v2):
    """Return the euclidean (L2) distance between two equally long vectors.

    :param v1: first vector, any indexable sequence of numbers with a len()
    :param v2: second vector, same kind of sequence as v1
    :return: the square root of the summed squared component differences,
             or None when the two vectors differ in length
    """
    # vectors of different length cannot be compared component by component
    if len(v1) != len(v2):
        return None

    total = 0.0
    for a, b in zip(v1, v2):
        diff = a - b
        total += diff * diff
    return math.sqrt(total)


if __name__ == '__main__':

    # spacy's word embeddings have 300 dimensions
    embedding_size = 300

    # read the reference sentences used for the similarity test:
    # one sentence per non-blank line, tokenised on single spaces
    sentences = []
    with open('semantic_test_text.txt') as reader:
        sentences.extend(
            text.split(' ')
            for text in (raw_line.strip() for raw_line in reader)
            if len(text) > 0
        )

    # wrap every token that has a vector in spacy's large model as a Word;
    # out-of-vocabulary tokens are dropped
    sentence_list = []
    for tokens in sentences:
        known_words = []
        for text in tokens:
            lexeme = nlp.vocab[text]
            if lexeme.has_vector:
                known_words.append(Word(text, lexeme.vector))
        if known_words:  # skip sentences in which no word had a vector
            sentence_list.append(Sentence(known_words))

    # map: text of the sentence -> vector
    sentence_vector_lookup = dict()

    # all sentences are converted to vectors together, in a single call
    sentence_vectors = sentence_to_vec(sentence_list, embedding_size)

    # only fill the map when exactly one vector came back per sentence
    if len(sentence_vectors) == len(sentence_list):
        for reference_sentence, reference_vector in zip(sentence_list, sentence_vectors):
            sentence_vector_lookup[str(reference_sentence)] = reference_vector
# Disabled code: the pairwise comparison of the reference sentences is parked inside
# a bare string literal, so none of it is executed. Because the literal is the last
# expression of the cell, the notebook echoes its text as the cell's output.
"""
    # display similarity between each of the sentences
    sentence_seen = set()
    # go through each sentence and compare it with each other sentence
    for text1, vector1 in sentence_vector_lookup.items():
        for text2, vector2 in sentence_vector_lookup.items():
            if text1 < text2:  # don't repeat combinations already seen
                unique = text1 + ':' + text2
            else:
                unique = text2 + ':' + text1

            if not unique in sentence_seen:
                sentence_seen.add(unique)
                print(text1 + ' :: ' + text2 + ' => distance = ' + str(l2_dist(vector1, vector2)))
"""


"\n    # display similarity between each of the sentences\n    sentence_seen = set()\n    # go through each sentence and compare it with each other sentence\n    for text1, vector1 in sentence_vector_lookup.items():\n        for text2, vector2 in sentence_vector_lookup.items():\n            if text1 < text2:  # don't repeat combinations already seen\n                unique = text1 + ':' + text2\n            else:\n                unique = text2 + ':' + text1\n\n            if not unique in sentence_seen:\n                sentence_seen.add(unique)\n                print(text1 + ' :: ' + text2 + ' => distance = ' + str(l2_dist(vector1, vector2)))\n"

In [None]:

# read the test sentences: one sentence per non-blank line, tokenised on single spaces
sentences_test = []
with open('test.txt') as reader:
    sentences_test.extend(
        text.split(' ')
        for text in (raw_line.strip() for raw_line in reader)
        if len(text) > 0
    )

# wrap every token that has a vector in spacy's large model as a Word;
# out-of-vocabulary tokens are dropped
sentence_list_test = []
for tokens in sentences_test:
    word_list_test = []
    for text in tokens:
        lexeme = nlp.vocab[text]
        if lexeme.has_vector:
            word_list_test.append(Word(text, lexeme.vector))
    if word_list_test:  # skip sentences in which no word had a vector
        sentence_list_test.append(Sentence(word_list_test))

# map: text of the test sentence -> vector (created empty here)
sentence_vector_lookup_test = dict()

In [4]:
# Convert all test sentences to vectors together, in a single call.
# NOTE(review): the output recorded under this cell is the source line
# `explained_variance_ = (S ** 2) / (n_samples - 1)` - presumably the tail of a
# runtime warning from a PCA step inside sentence_to_vec, which would fit test.txt
# yielding a single sentence (n_samples - 1 == 0). TODO confirm, and check the
# resulting vectors for NaN values.
sentence_vectors_test = sentence_to_vec(sentence_list_test, embedding_size)  

  explained_variance_ = (S ** 2) / (n_samples - 1)


In [5]:
# map: text of the test sentence -> vector
sentence_vector_lookup_test = dict()

# only fill the map when exactly one vector came back per test sentence
if len(sentence_vectors_test) == len(sentence_list_test):
    for test_sentence, test_vector in zip(sentence_list_test, sentence_vectors_test):
        sentence_vector_lookup_test[str(test_sentence)] = test_vector


In [8]:
from collections import OrderedDict

# Rank the reference sentences by their euclidean distance to the test sentence(s)
# and show the three closest ones.
sentence_seen = set()
result = dict()  # map: text of the reference sentence -> distance

# go through each test sentence and compare it with each reference sentence
for text1, vector1 in sentence_vector_lookup_test.items():
    for text2, vector2 in sentence_vector_lookup.items():
        # order-independent key, so a pair of texts already seen is not repeated
        if text1 < text2:
            unique = text1 + ':' + text2
        else:
            unique = text2 + ':' + text1

        if unique not in sentence_seen:
            sentence_seen.add(unique)
            distance = l2_dist(vector1, vector2)
            # NOTE(review): the key is the reference text only, so with more than one
            # test sentence a later distance overwrites an earlier one - presumably
            # test.txt is meant to hold a single sentence; confirm.
            result[text2] = distance

# smallest distance (most similar sentence) first
d_sorted_by_value = OrderedDict(sorted(result.items(), key=lambda x: x[1]))

# Slice the sorted items directly instead of calling take(): take() is only defined
# in a cell further down the notebook, so relying on it raises a NameError when the
# notebook is run top to bottom on a fresh kernel.
n_items = list(d_sorted_by_value.items())[:3]
n_items

[('mouse and becomes inoperable from newegg item is refurb day warranty m mouse with key board keyboard works fine just mouse',
  0.565709043966099),
 ('mouse freezes and becomes paired with a keyboard and is refurbish item with a day warranty from newegg',
  1.025416444666237),
 ('mouse up and is inoperable item with a on keyboard works fine a refurb item with warranty',
  1.119718564138407)]

In [7]:
from itertools import islice
 
def take(n, iterable):
    """Return the first n items of the iterable as a list.

    Fewer than n items are returned when the iterable is exhausted earlier.
    """
    head = islice(iterable, n)
    return [item for item in head]