In [1]:
%load_ext Cython

In [7]:
%%time
%%cython
from gensim.models import doc2vec
import nltk
import re

def split_sentence(sentence):
    """Lower-case ``sentence`` and split it into a list of word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore), and the empty strings that the split
    leaves at the edges of the text are discarded.

    Args:
        sentence: the text to tokenize (a string).

    Returns:
        A list of lower-cased tokens; empty when the text has no word
        characters at all.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    words = re.split(r'\W+', sentence.lower())
    return [word for word in words if word]

class MyDocs(object):
    """Iterable corpus yielding one labeled document per line of a text file.

    Every call to ``__iter__`` re-opens the file, so the same instance can be
    iterated more than once.

    Args:
        filename: path of the corpus file, one document per line. Defaults to
            "revtxt10k.txt", the file this notebook trains on.
    """

    def __init__(self, filename="revtxt10k.txt"):
        self.filename = filename

    def __iter__(self):
        # `with` closes the handle when iteration finishes or the generator is
        # discarded; a bare open() left one file open per pass.
        with open(self.filename) as corpus:
            for i, text in enumerate(corpus):
                # The tag is the zero-based line number, as a string.
                yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i])

# Train the doc2vec model on the documents streamed by MyDocs, then save it to disk.
# NOTE(review): `mydocs` is declared with `cdef`, so it is a Cython-level variable of
# this compiled cell rather than an ordinary Python global -- confirm nothing outside
# this cell needs it.
cdef mydocs = MyDocs()
# NOTE(review): size/window/min_count/workers are passed straight to gensim's Doc2Vec
# constructor; check their meaning against the gensim version installed.
model = doc2vec.Doc2Vec(mydocs, size = 200, window = 8, min_count = 5, workers = 4)
# Written to 'revtxt10k.model', the same path the following cell loads from.
model.save('revtxt10k.model')

CPU times: user 9.41 ms, sys: 3.77 ms, total: 13.2 ms
Wall time: 12.9 ms


In [11]:
model = doc2vec.Doc2Vec.load('revtxt10k.model')
print model.most_similar(positive=["dirty", "bathroom"], negative=["clean"], topn=3)

[('shower', 0.8263002634048462), ('toilet', 0.7940653562545776), ('carpet', 0.7919286489486694)]


In [14]:
import numpy as np
def cossim(v1, v2):
    """Return the cosine similarity between two vectors.

    Computed as dot(v1, v2) / |v1| / |v2|.

    Args:
        v1, v2: 1-D numeric sequences or arrays of equal length.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero length (the ratio is undefined there).
    """
    norm1 = np.sqrt(np.dot(v1, v1))
    norm2 = np.sqrt(np.dot(v2, v2))
    # Guard the division: a zero-norm vector would otherwise yield NaN, and
    # np.argmax treats NaN as the maximum when similarities are ranked.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(v1, v2) / norm1 / norm2

def get_line(filename, n):
    """Return line ``n`` (zero-based) of the text file ``filename``.

    The line is returned exactly as read from the file, including its
    line ending. If the file has no line at that index, None is returned.
    """
    with open(filename) as handle:
        # Lazy scan: reading stops as soon as the requested index is reached.
        wanted = (line for index, line in enumerate(handle) if index == n)
        return next(wanted, None)
def argmaxn(l, n):
    """Return the positions of the ``n`` largest values of ``l``, best first.

    Args:
        l: iterable of numeric scores; it is copied, never modified.
        n: how many positions to return.

    Returns:
        A list of at most ``n`` integer positions ordered from the largest
        score downwards. Ties go to the lowest position (np.argmax returns
        the first maximum). When ``l`` has fewer than ``n`` items, only
        ``len(l)`` positions are returned; an empty input gives ``[]``.
    """
    # Convert once to a private float array instead of re-converting a Python
    # list on every np.argmax call.
    scores = np.array(list(l), dtype=float)
    # Never ask for more winners than there are scores: past that point every
    # entry is masked and np.argmax would keep returning position 0.
    count = min(n, len(scores))
    top = []
    for _ in range(count):
        best = int(np.argmax(scores))
        top.append(best)
        # Mask the winner so the next pass picks the runner-up.
        scores[best] = -np.inf
    return top

# Calculate the vector of input text according to our training model
input_text = "good breakfast, clean bathroom"
input_vec = model.infer_vector(split_sentence(input_text))

# Compute the cosine similarity values between the input text and all archived reviews
cossims_with_input = map(lambda v: cossim(input_vec, v), model.docvecs)

# Print the reviews with the highest cosine similarity values
sim_ids = argmaxn(cossims_with_input, 3)
for i in range(3):
    print "(Review-%s, Similarity:%.4f):" % (sim_ids[i] + 1, cossims_with_input[sim_ids[i]]),\
          get_line('revtxt10k.txt', sim_ids[i])

(Review-5371, Similarity:0.4611): 	It had a nice pool. It was cheap. It had free breakfast, but it also had ants--in the bathroom.

(Review-5447, Similarity:0.4607): 	Great hotel not area we needed but for price great. Rooms are very clean. Beds comfortable. Breakfast not good. Would stay there again but not eat breakfast.

(Review-5931, Similarity:0.4566): 	My family and I have been coming since 2003 to this location.We come every summer. It's always clean, the best staff, and the closest location to Disneyland and the nightlife. I enjoy the breakfast as well.



In [15]:
# Construct a data frame with the hotel IDs and document-vectors of all archived reviews
hotel_textvecs = pd.read_csv('hotel_info.txt')
hotel_textvecs['Textvec'] = model.docvecs

# Compute similarity scores between the input text and the hotels (defined here as the cosine similarity between
# the document-vector of the input text and the sum/mean of the document-vectors of all the reviews of a hotel --
# no difference between sum and mean)
hotel_cossims = hotel_textvecs.groupby('HotelID').agg(lambda v: cossim(np.sum(v), input_vec))

# Print the reviews with the highest similarity scores
sim_ids = argmaxn(hotel_cossims['Textvec'], 3)

for i in range(3):
    hotel = hotel_cossims.index[sim_ids[i]]
    print "HotelID is: %s" % hotel, "and the similarity is equal to: %.4f," % hotel_cossims.Textvec[hotel], \
    'also you can find the link here %s' % hotel_textvecs.ix[hotel_textvecs['HotelID']==hotel]['PageUrl'].iloc[0]

HotelID is: 75697 and the similarity is equal to: 0.1238, also you can find the link here http://www.tripadvisor.com/ShowUserReviews-g29092-d75697-r177328255-Park_Vue_Inn-Anaheim_California.html#CHECK_RATES_CONT
HotelID is: 78781 and the similarity is equal to: 0.1131, also you can find the link here http://www.tripadvisor.com/ShowUserReviews-g29092-d78781-r176917523-Ramada_Plaza_Anaheim-Anaheim_California.html#CHECK_RATES_CONT
HotelID is: 75732 and the similarity is equal to: 0.1007, also you can find the link here http://www.tripadvisor.com/ShowUserReviews-g29092-d75732-r177153429-Super_8_Anaheim_Disneyland_Drive-Anaheim_California.html#CHECK_RATES_CONT
