In [1]:
import cPickle as pickle
import time, math
import numpy as np

In [2]:
class ProcessedTitle(object):
    
    def __init__(self, index_title, url, pageView):
        self.index_title = index_title
        self.url = url
        self.pageView = pageView
        title_array = map(ProcessedTitle.reverse_token_dict.get, self.index_title)
        self.title = " ".join(title_array) 
        
    def create_seq2seq_model_embeddings(self, model):
        embedded_input_sets, encode_ouput_sets, hidden_state_sets = model.eval_by_batch([self.index_title])
        self.embeddings = []
        
        # mean_embedded_inputs_, max_embedded_inputs_, min_embedded_inputs_
        for embedding in embedded_input_sets:
            self.embeddings.append(embedding[0])
        # mean_encoder_outputs, max_encoder_outputs, min_encoder_outputs
        for embedding in encode_ouput_sets:
            self.embeddings.append(embedding[0])
        # final_cell_state_, final_hidden_state_
        for embedding in hidden_state_sets:
            self.embeddings.append(embedding[0])
            
def get_embedding_vector(sorted_titles, article_index):
    sample_vector = sorted_titles[article_index].embeddings[6][:]
    sample_vector = np.append(sample_vector, sorted_titles[article_index].embeddings[0][:])    
    return sample_vector

def cosine_similarity(x, y):
    numerator = sum(a*b for a, b in zip(x,y))
    denominator = square_rooted(x) * square_rooted(y)
    return numerator / denominator

def square_rooted(x):
    return math.sqrt(sum([a*a for a in x]))

In [3]:
processed_titles_pickle_file = 'processed_titles_data.pkl'

with open(processed_titles_pickle_file, 'rb') as output_stream:
    sorted_titles = pickle.load(output_stream)

In [4]:
tot_counts = len(sorted_titles)
print 'total {} articles are processed...'.format(tot_counts)

total 200000 articles are processed...


### choose the articles index, between [0, 200000)

In [7]:
index = 135

In [8]:
print '\n', sorted_titles[index].title
print sorted_titles[index].url

embeddings = sorted_titles[index].embeddings
print "embedding size: ", len(embeddings), '\n'
#embeddings[2]


espn announcer makes insensitive deondre francois reference
http://www.msn.com/en-us/sports/ncaafb/espn-announcer-makes-insensitive-deondre-francois-reference/ar-AArAF6O
embedding size:  8 



#### compare the target article with all others

In [10]:
cosine_results, euclidean_results = [], []
start_time = time.time()
article_index = index
print 'URL: ', sorted_titles[article_index].url
print 'title: ', sorted_titles[article_index].title
print 'pageView: ', sorted_titles[article_index].pageView

sample_vector = get_embedding_vector(sorted_titles, article_index)
expected_length = len(sample_vector)
print "\n"

for i in xrange(min(len(sorted_titles), 50000)):
    cur_vector = get_embedding_vector(sorted_titles, i)
    assert len(cur_vector) == expected_length
    
    cosine_result = cosine_similarity(cur_vector, sample_vector)
    sorted_titles[i].cosine_similarity = cosine_result
    cosine_results.append(cosine_result)
print 'all the process takes {:.2f} seconds...'.format(time.time() - start_time)

URL:  http://www.msn.com/en-us/sports/ncaafb/espn-announcer-makes-insensitive-deondre-francois-reference/ar-AArAF6O
title:  espn announcer makes insensitive deondre francois reference
pageView:  350269


all the process takes 17.17 seconds...


#### show articlse ranked by the similarity

In [11]:
cosine_threshold = 0.85
euclidean_threshold = 4

total_counts = 0
relevant_titles = []
for i in xrange(len(cosine_results)):
    if sorted_titles[i].cosine_similarity > cosine_threshold:
        total_counts += 1
        relevant_titles.append(sorted_titles[i])

# sort the titles by the `cosine_similarity`
selected_titles = sorted(relevant_titles, key=lambda elem: elem.cosine_similarity, reverse=True)

for title in selected_titles:
    print title.cosine_similarity
    print title.url
    print title.title
    print "\n"

1.0
http://www.msn.com/en-us/sports/ncaafb/espn-announcer-makes-insensitive-deondre-francois-reference/ar-AArAF6O
espn announcer makes insensitive deondre francois reference


0.858626555105
http://floridastate.247sports.com/ContentGallery/Media-reacts-to-FSU-QB-Deondre-Francois-injury-106872662/2
media reacts to fsu qb deondre francois injury


