In [1]:
import json
import os
import re

import numpy as np
import webhoseio
from gensim.models import KeyedVectors
from simhash import Simhash, SimhashIndex

In [2]:
# Authenticate the Webhose client; the token comes from the environment so it
# never appears in the notebook itself.
webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])

# Parameters of the query that is sent to Webhose further down.
query_params = dict(
    q="organization:Tesla",
    ts="1523748602856",  # presumably an epoch-millisecond timestamp - TODO confirm against the Webhose API docs
    sort="crawled",
)

### Load Word Vector Model

In [3]:
# Directory holding the pre-trained word-vector files. Set the WORDVEC_MODEL_DIR
# environment variable to point somewhere else; the default is the original location.
model_path = os.environ.get('WORDVEC_MODEL_DIR', '/Users/javidbeck/projects/data/model/')


def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word-vector file in word2vec format from ``model_path``.

    Args:
        modelName: Human-readable model name, used only in the progress messages.
        modelFile: File name of the model inside ``model_path``.
        flagBin: Forwarded as ``binary=`` to ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print('Loading ' + modelName + ' model...')
    # os.path.join works whether or not model_path ends with a path separator.
    model = KeyedVectors.load_word2vec_format(os.path.join(model_path, modelFile), binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model


model_w2v_AP    = load_wordvec_model('Word2Vec Google News', 'GoogleNews-vectors-negative300.bin', True)
# Alternative model, currently disabled:
#model_fasttext = load_wordvec_model('FastText', 'fastText_wiki_en.vec', False)

Loading Word2Vec Google News model...
Finished loading Word2Vec Google News model...


In [4]:
def vocab_check(vectors, words):
    """Return the words that are present in the model's vocabulary, in input order.

    Args:
        vectors: Word-vector model. Its vocabulary mapping is read from the
            ``key_to_index`` attribute when it exists, otherwise from ``vocab``.
        words: Iterable of candidate words.

    Returns:
        A list with the whitespace-stripped words found in the vocabulary.
        Duplicates in ``words`` are kept.
    """
    # gensim 4 replaced KeyedVectors.vocab with key_to_index; support both layouts.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    output = []
    for word in words:
        # Strip before the lookup so the word that is tested is the word that is returned.
        candidate = word.strip()
        if candidate in vocabulary:
            output.append(candidate)
    return output

In [5]:
def calc_similarity(input1, input2, vectors):
    """Score the similarity of two strings with a word-vector model.

    Each string is split on whitespace, reduced to its distinct in-vocabulary
    words (via ``vocab_check``) and the two word lists are compared with
    ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        The value returned by ``vectors.n_similarity``, or ``0.0`` when either
        string contains no in-vocabulary word.
    """
    # Sorted lists instead of sets: set iteration order for strings can change
    # between kernel sessions, which would change the floating-point summation order.
    s1words = sorted(set(vocab_check(vectors, input1.split())))
    s2words = sorted(set(vocab_check(vectors, input2.split())))

    # n_similarity cannot score an empty word list (gensim raises for it), so
    # treat "nothing to compare" as no similarity.
    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [6]:
def cleanup(input):
    """Expand a few English contractions and strip punctuation from a string.

    The straight (') and the typographic (’) apostrophe are treated alike:
    "'s" becomes a space, "n't" becomes " not" and "'ve" becomes " have".
    Afterwards every character that is not an ASCII letter, a digit or a
    space is removed. No stopword removal is performed.

    Args:
        input: Text to normalise. (The name shadows the built-in but is kept
            so keyword callers continue to work.)

    Returns:
        The normalised string.
    """
    # Normalise the apostrophe first so each contraction needs a single rule.
    text = input.replace("’", "'")
    text = text.replace("'s", " ").replace("n't", " not").replace("'ve", " have")
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)

### Query Webhose for a set of 100 posts/feeds (be careful how many times you make this call)

In [7]:
# Be careful how many times you run this cell: each run sends a new
# "filterWebContent" request to Webhose.
# Fetch the first batch for query_params; the response is kept in `output`,
# whose 'posts' entry is read by the cell that prints the titles.
output = webhoseio.query("filterWebContent", query_params)

### Print original dataset titles only

In [23]:
# Collect the returned posts into `feeds`, giving each one a sequential integer
# 'id' (its position in the response) and printing "<id> <title>" per post.
feeds = []
for post_id, post in enumerate(output['posts']):
    # Store a shallow copy so the API response in `output` is not edited in place.
    feed = dict(post, id=post_id)
    print(feed['id'], str(feed['title']))
    feeds.append(feed)

0 Why Tesla, Apogee Enterprises, and General Mills Slumped Today
1 Both sides are smoking dope
2 Tesla shares fall after report of criminal investigation
3 Tesla faces Justice Department criminal investigation over Elon Musk's tweets | kiiitv.com
4 Tesla under investigation by Justice Department over CEO Elon Musk's comments about taking company private
5 Audi unveils the eTron with an eye on Tesla- The New Indian Express
6 Tesla shares fall after report of criminal investigation
7 Tesla confirms criminal probe into Musk talk of going private
8 Tesla shares fall after report of criminal investigation
9 The DOJ's inquiry into Tesla has 'raised the temperature' on the company, an SEC lawyer says (TSLA)
10 Elon Musk brother Kimbal Musk delivers Tesla Model 3
11 British cave diver sues Elon Musk over 'pedo' comments
12 Tesla shares fall after report of criminal investigation | KRXI
13 Tesla Faces Justice Department Investigation Over Musk's Tweets
14 Saudi Arabia invests $1B to build elect

### Define distance, index using SimHash, calculate duplicates subset using SimHash only

In [38]:
# Value handed to SimhashIndex as k (its near-duplicate tolerance).
distance = 25

# One (id-as-string, title fingerprint) pair per collected feed.
objs = []
for feed in feeds:
    fingerprint = Simhash(str(feed['title']))
    objs.append((str(feed['id']), fingerprint))

index = SimhashIndex(objs, k=distance)

In [39]:
# Position in `feeds` of the reference post.
# (27 appears to be an alternative index tried earlier.)
SELECTED_INDEX = 44
feed_sel = feeds[SELECTED_INDEX]
selected_title = feed_sel['title']
print(selected_title)

# Fingerprint the reference title, then ask the index for the ids of its near duplicates.
feed_hash = Simhash(str(selected_title))
dup_indices = index.get_near_dups(feed_hash)
print("Number of duplicates (SimHash): ", len(dup_indices), sep="")

Tesla Is Under Investigation By The Justice Department
Number of duplicates (SimHash): 36


### Print SimHash-based duplicate titles

In [40]:
# The index returns ids as strings; convert each back to an int position in `feeds`.
dup_feeds = (feeds[int(dup_id)] for dup_id in dup_indices)
for dup_feed in dup_feeds:
    print(dup_feed['title'])

Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla under investigation by Justice Department over CEO Elon Musk's comments about taking company private
Tesla confirms it is under probe by US Justice Department
Tesla confirms it is under probe by US Justice Department
Tesla confirms it is under probe by US Justice Department
Tesla is cooperating with the DOJ investigation
Tesla Is Under Investigation By The Justice Department
Tesla confirms it is under probe by US Justice Department
Tesla confirms it is under probe by US Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla confirms criminal probe into Musk talk of going private
Tesla is facing a criminal investigation by the U.S. Justice
Tesla Is Under Investigation By The Justice Department
Tesla to be investigated by US Justice Department over Elon Musk tweets | The Independent
Tesla Faces Justice Department Investigation Over Musk's Tweets
Elo

### Calculate and print SimHash + Word2Vec similarity based duplicate titles

In [41]:
# Minimum word-vector similarity for a SimHash candidate to count as a duplicate.
SIMILARITY_THRESHOLD = 0.9

count_dup = 0
for dupi in dup_indices:
    candidate_title = feeds[int(dupi)]['title']
    try:
        score = calc_similarity(feed_sel['title'], candidate_title, model_w2v_AP)
    except (ZeroDivisionError, ValueError):
        # Raised when a title has no in-vocabulary word to compare; such a
        # candidate simply scores 0. Any other error is a real problem and is
        # allowed to surface instead of being hidden as "no duplicates".
        score = 0
    if score > SIMILARITY_THRESHOLD:
        count_dup += 1
        print(candidate_title)
print("Number of Dupies (SimHash + WordVectors): " + str(count_dup))

Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Tesla Is Under Investigation By The Justice Department
Number of Dupies (SimHash + WordVectors): 13
