In [18]:
import pandas as pd
from gensim.models import Word2Vec
from termcolor import colored
import warnings
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

In [2]:
# Pre-trained Google News word2vec vectors (300 dimensions).
# The binary is too large to commit; download it from:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
try:
    # gensim >= 1.0 moved the word2vec-format loader onto KeyedVectors and
    # deprecated (later removed) Word2Vec.load_word2vec_format.
    from gensim.models import KeyedVectors as _VectorLoader
except ImportError:
    # Very old gensim releases only expose the loader on Word2Vec.
    _VectorLoader = Word2Vec
model = _VectorLoader.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Index of the query (and of the matching results_<n>.txt file) to visualise.
query_number = 0

In [3]:
# The queries come from the first data folder (data/user_studies).
# The context manager closes the file as soon as its lines have been read.
with open("data/user_studies/all.txt", "r") as queries_file:
    lines = [l.strip() for l in queries_file]
# Keep only entries of at least 3 characters; shorter lines are dropped.
queries = [line for line in lines if len(line) >= 3]

In [10]:
#helper functions
def get_query_vector(q):
    """Return the sum of the word vectors of the words in ``q``.

    Punctuation is stripped, the text is split on any run of whitespace
    (spaces, tabs, newlines), and the vector of every word found in the
    module-level ``model`` is added up. Words missing from the model are
    skipped.

    Parameters
    ----------
    q : str
        The query or sentence to embed.

    Returns
    -------
    numpy.ndarray
        A 1-D float vector; all zeros when no word of ``q`` is in the model.
    """
    # Remove everything that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', q)
    # Use the model's own dimensionality when it exposes one; 300 matches
    # the vectors this notebook was written for.
    query_vec = np.zeros(getattr(model, "vector_size", 300))
    # split() without arguments splits on any whitespace and never yields
    # empty tokens, so tab/newline separated words are not fused together.
    for w in cleaned.split():
        try:
            query_vec += model[w]
        except KeyError:
            # Out-of-vocabulary word: deliberately ignored.
            pass
    return query_vec

#Get results based on query number
def get_results(q_num):
    """Read the scored result sentences stored for query ``q_num``.

    Each non-blank line of ``./data/user_studies/results_<q_num>.txt`` is
    expected to look like ``<score>--<sentence>``.

    Parameters
    ----------
    q_num : int or str
        Query number used to build the results file name.

    Returns
    -------
    (list of float, list of str)
        The scores and the matching sentences, in file order. Both lists
        are empty when the file has no non-blank lines.

    Raises
    ------
    ValueError
        If a non-blank line has no ``--`` separator or its score is not a
        number.
    """
    filename = "./data/user_studies/results_"+str(q_num)+".txt"
    scores = []
    results = []
    # The context manager guarantees the file handle is released.
    with open(filename, "r") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no result.
                continue
            # Split on the FIRST "--" only, so a sentence that itself
            # contains "--" is kept whole.
            score_text, separator, sentence = entry.partition("--")
            if not separator:
                raise ValueError(
                    "Malformed line in " + filename + ": " + repr(entry))
            scores.append(float(score_text.strip()))
            results.append(sentence.strip())
    return scores, results

#Highlights a word for terminal output
def underline_word(word):
    """Return ``word`` wrapped by ``colored`` with the colour "yellow"."""
    highlighted = colored(word, "yellow")
    return highlighted

def similarity(vec1, vec2):
    """Return the cosine similarity between two vectors as a scalar.

    Parameters
    ----------
    vec1, vec2 : array-like
        Either 1-D vectors or already 2-D arrays of shape (1, n_features).

    Returns
    -------
    float
        Element ``[0][0]`` of the pairwise cosine-similarity matrix.
    """
    # cosine_similarity expects 2-D (n_samples, n_features) input; recent
    # scikit-learn versions reject bare 1-D vectors. atleast_2d turns a 1-D
    # vector into a single-row matrix and leaves 2-D input untouched.
    row1 = np.atleast_2d(vec1)
    row2 = np.atleast_2d(vec2)
    return cosine_similarity(row1, row2)[0][0]

def max_similarity_word(q_vec, result):
    """Return the word of ``result`` most similar to the query vector.

    Punctuation is stripped from ``result``, the text is split on any run of
    whitespace, and each distinct in-vocabulary word is scored with
    ``similarity`` against ``q_vec``.

    Parameters
    ----------
    q_vec : array-like
        Vector of the query, e.g. as produced by ``get_query_vector``.
    result : str
        The result sentence to search.

    Returns
    -------
    str
        The first word with the highest similarity strictly greater than 0,
        or ``''`` when no word is in the model or none scores above 0.
    """
    best_word = ''
    best_score = 0
    # Remove everything that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', result)
    # split() handles tabs/newlines and yields no empty tokens;
    # dict.fromkeys keeps first-occurrence order while scoring each distinct
    # word only once (the strict ">" below already favours the first one).
    for word in dict.fromkeys(cleaned.split()):
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary word: deliberately ignored.
            continue
        score = similarity(q_vec, word_vec)
        if score > best_score:
            best_score = score
            best_word = word
    return best_word

**Visualization 1:** Highlighting, in each result sentence, the word that is most similar to the query

In [11]:
# Pick the query to visualise; `query_number` is set in the model-loading cell.
query = queries[query_number]
# Scores and result sentences read from results_<query_number>.txt.
scores, results = get_results(query_number)
# Sum of the word vectors of the query's in-vocabulary words.
q_vec = get_query_vector(query)

In [17]:
# Print every result as "<score>--<sentence>", colouring each whole-word
# occurrence of the word most similar to the query.
for score,sentence in zip(scores, results):
    similar_word = max_similarity_word(q_vec, sentence)
    # max_similarity_word returns '' when nothing qualifies; the pattern
    # r"\b\b" would then match at every word boundary and scatter colour
    # codes through the sentence, so only substitute for a real word.
    if similar_word:
        my_regex = r"\b" + re.escape(similar_word) + r"\b"
        highlighted = underline_word(similar_word)
        # A callable replacement is inserted verbatim, so nothing in the
        # coloured text is interpreted as a regex template escape.
        sentence = re.sub(my_regex, lambda _match: highlighted, sentence)
    print(str(score)+"--"+sentence)

0.847--played outside with elijah , [33mneighbor[0m dog , [33mneighbor[0m sheep , and [33mneighbor[0m kids .
0.696--that includes a [33mneighbor[0m , a friend , a mother , father , sibling , spouse , lover .
0.696--if your friend , girlfriend , roommate , sister , mom , cousin , aunt , [33mneighbor[0m , aquaintance , etc .
0.694--stepdad : : : is holding the puppy , when the [33mneighbor[0m comes out : : [33mneighbor[0m : you !
0.687--he was a good father , husband , teacher , [33mneighbor[0m , and friend .
0.686--he was a [33mneighbor[0m , a collegue , and a very close family friend .
0.684--: pm go to [33mneighbor[0m s house to teach [33mneighbor[0m s niece ... call center articulation .
0.684--i was shuffled from [33mneighbor[0m to [33mneighbor[0m to my aunt and uncle s house and back again .
0.682--my mom , our next door [33mneighbor[0m , and our next door [33mneighbor[0m s wife .
0.679--she was the former nextdoor [33mneighbor[0m of a mother of a fr