In [43]:
import json_lines
import _pickle as pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

Using TensorFlow backend.


In [48]:
# Parse the JSON-lines corpus once, keeping the lower-cased headline and body
# text of every record in two parallel lists (same position = same article).
titles, contents = [], []
with open('sample-1M.jsonl', 'rb') as jsonl_file:
    for record in json_lines.reader(jsonl_file):
        titles.append(record['title'].lower())
        contents.append(record['content'].lower())

In [None]:
# Cache the lower-cased titles/contents (pickle protocol 2) so the JSONL parse
# does not have to be repeated on the next run.
with open('./data/tokens.pkl', 'wb') as cache_file:
    pickle.dump((titles, contents), cache_file, protocol=2)

In [None]:
# with open('data/tokens.pkl', 'rb') as fp:
#     titles, contents = pickle.load(fp)

In [49]:
# Spot-check one loaded headline (titles are lower-cased by the loading cell).
titles[1]

'jumpshot gives marketers renewed visibility into paid and organic keywords with launch of jumpshot elite'

In [13]:
# Spot-check the article body stored at the same position as titles[1].
# NOTE(review): the saved output of this cell still shows capital letters even
# though the loading cell lower-cases every body -- it looks stale; re-run to refresh.
contents[1]

'New Product Gives Marketers Access to Real Keywords, Conversions and Results Along With 13 Months of Historical Data \n\nSAN FRANCISCO, CA -- (Marketwired) -- 09/17/15 -- Jumpshot, a marketing analytics company that uses distinctive data sources to paint a complete picture of the online customer journey, today announced the launch of Jumpshot Elite, giving marketers insight into what their customers are doing the 99% of the time they\'re not on your site. For years, marketers have been unable to see what organic and paid search terms users were entering, much less tie those searches to purchases. Jumpshot not only injects that user search visibility back into the market, but also makes it possible to tie those keywords to conversions -- for any web site. \n\n"Ever since search engines encrypted search results, marketers have been in the dark about keywords, impacting not only the insight into their own search investments, but also their ability to unearth high converting keywords for 

In [15]:
# Number of article bodies loaded from the corpus.
len(contents)

1000000

In [59]:
from collections import Counter
def get_vocab(lst):
    """Count whitespace-separated tokens over a collection of texts.

    Args:
        lst: iterable of strings; each string is split on whitespace.

    Returns:
        A ``(vocab, vocabcount)`` tuple. ``vocabcount`` is a ``Counter``
        mapping token -> number of occurrences. ``vocab`` is the list of
        distinct tokens ordered from most to least frequent; tokens with the
        same count stay in the order they were first seen.
    """
    vocabcount = Counter()
    for txt in lst:
        vocabcount.update(txt.split())
    # most_common() with no argument is a stable descending sort by count.
    vocab = [token for token, _ in vocabcount.most_common()]
    return vocab, vocabcount

In [60]:
# Build one frequency-sorted vocabulary over headlines and article bodies together.
# `titles+contents` creates a temporary concatenated list of both inputs.
vocab, vocabcount = get_vocab(titles+contents)

## Indexing words

In [62]:
# Reserved ids: `empty` is the id given to '<empty>' and `eos` the id given to
# '<eos>'; real vocabulary words are numbered from `start_idx` upwards.
empty, eos = 0, 1
start_idx = eos + 1

In [63]:
# Number the vocabulary words in frequency order, starting after the reserved ids.
word2idx = {word: idx for idx, word in enumerate(vocab, start_idx)}
# Register the two reserved tokens last, so they win over any identical corpus token.
word2idx.update({'<empty>': empty, '<eos>': eos})
# Reverse lookup: id -> token.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [64]:
# Total number of distinct tokens, including the reserved '<empty>' and '<eos>' entries.
len(word2idx)

4900336

## Word Embedding


### GloVe

In [67]:
# Path of the pre-trained GloVe vectors in text format.
glove_name = './data/glove.6B.100d.txt'
# Length of each word vector; must match the number of values per line in
# `glove_name` (the loader assigns each parsed line into a row of this width).
embedding_dim = 100

In [66]:
# Count the lines of the GloVe file; the loader stores one token per line, so
# this is the number of rows to allocate. The file is opened in a `with` block
# so the handle is closed right away (the bare `open()` in a generator leaked
# it), and it is decoded explicitly as UTF-8 -- the encoding of the published
# GloVe text files -- instead of the platform default.
with open(glove_name, 'r', encoding='utf-8') as glove_file:
    glove_n_symbols = sum(1 for _ in glove_file)
print()
print('{:,} GloVe symbols'.format(glove_n_symbols))


400,000 GloVe symbols


In [68]:
# One row per GloVe token; np.empty leaves the rows uninitialised until the
# loader below writes them.
glove_embedding_weights = np.empty(shape=(glove_n_symbols, embedding_dim))
# token -> row number inside `glove_embedding_weights`.
glove_index_dict = dict()

In [69]:
# Factor every GloVe weight is multiplied by right after loading.
# (The name is spelled `globale_scale` wherever it is used; keep it in sync if renamed.)
globale_scale = .1

In [70]:
# Parse the GloVe text file: every line is `<token> <v1> ... <vN>`.
# The row number of a token is its (0-based) line number in the file.
# The encoding is given explicitly: relying on the platform default can raise
# UnicodeDecodeError or silently mangle the non-ASCII tokens, which would then
# never match the corpus words.
with open(glove_name, 'r', encoding='utf-8') as fp:
    for i, line in enumerate(fp):
        fields = line.strip().split()
        glove_index_dict[fields[0]] = i
        glove_embedding_weights[i, :] = list(map(float, fields[1:]))
# Rescale all vectors in place. This must run exactly once per load, which is
# why it lives in the same cell as the parsing loop.
glove_embedding_weights *= globale_scale

In [71]:
# Standard deviation over all GloVe weights as currently stored (i.e. after the
# rescaling done by the loader); the random initialisation is sized from it.
print('GloVe std dev: {:.4f}'.format(glove_embedding_weights.std()))

GloVe std dev: 0.0408


In [72]:
# Give every GloVe token a lower-cased alias pointing at the same row, unless
# that lower-cased spelling already has a row of its own.
# Iterate over a snapshot of the items: inserting keys while looping over the
# live `dict.items()` view raises "RuntimeError: dictionary changed size during
# iteration" as soon as the first alias is added.
for w, i in list(glove_index_dict.items()):
    glove_index_dict.setdefault(w.lower(), i)

### Construct embedding matrix

In [77]:
# generate random embedding with same scale as glove
# Number of leading vocabulary ids (the reserved ids plus the most frequent
# words) that get their own embedding row. It was not defined anywhere in the
# notebook, so a fresh "Restart & Run All" failed with NameError; 40000 is the
# value shown by the saved outputs (normed_embedding.shape == (40000, 100)).
vocab_size = 40000
# Fixed seed so the randomly initialised rows are the same on every run
# (a bare np.random.seed() re-seeds from OS entropy).
np.random.seed(42)
shape = (vocab_size, embedding_dim)
# A uniform distribution on [-a, a] has std a / sqrt(3) = 2a / sqrt(12), so this
# half-width gives the random rows the same std as the GloVe weights.
scale = glove_embedding_weights.std() * np.sqrt(12) / 2  # uniform and not normal
embedding = np.random.uniform(low=-scale, high=scale, size=shape)
print('random-embedding/glove scale: {:.4f} std: {:.4f}'.format(scale, embedding.std()))

random-embedding/glove scale: 0.0707 std: 0.0408


In [78]:
# Replace the random row of every in-matrix word that GloVe knows with its
# GloVe vector. Lookup order: exact spelling, lower-cased spelling, then the
# same two again without a leading '#' (hashtags are assumed absent from GloVe).
n_copied = 0
for row in range(vocab_size):
    token = idx2word[row]
    glove_row = glove_index_dict.get(token)
    if glove_row is None:
        glove_row = glove_index_dict.get(token.lower())
    if glove_row is None and token.startswith('#'):
        bare = token[1:]
        glove_row = glove_index_dict.get(bare)
        if glove_row is None:
            glove_row = glove_index_dict.get(bare.lower())
    if glove_row is None:
        # Not in GloVe under any spelling: keep the random initialisation.
        continue
    embedding[row, :] = glove_embedding_weights[glove_row, :]
    n_copied += 1
print('number of tokens, in small vocab: {:,} found in glove and copied to embedding: {:.4f}'.format(n_copied, n_copied / vocab_size))

number of tokens, in small vocab: 25,982 found in glove and copied to embedding: 0.6495


In [79]:
# Minimum cosine score a candidate must reach to be accepted as a substitute
# for an out-of-matrix word (lower-scoring candidates are rejected).
glove_thr = 0.5


# In[25]:

# Map every corpus token that GloVe knows to the GloVe spelling it matches.
# Spellings are tried in order: as-is, lower-cased, and -- for tokens starting
# with '#' -- the same two without the leading '#'. Tokens with no match are
# left out of the mapping.
word2glove = {}
for token in word2idx:
    spellings = [token, token.lower()]
    if token.startswith('#'):
        spellings += [token[1:], token[1:].lower()]
    matched = next((form for form in spellings if form in glove_index_dict), None)
    if matched is not None:
        word2glove[token] = matched


# For every word outside the embedding matrix, find the closest word inside the embedding matrix.
# Use the cosine similarity of GloVe vectors.
# Allow for the last `nb_unknown_words` words inside the embedding matrix to be considered to be outside.
# Don't accept matches whose cosine similarity is below `glove_thr`.

# In[26]:

# Unit-length copy of every embedding row, so that a dot product with another
# unit vector is their cosine similarity.
row_norms = np.array([np.sqrt(np.dot(row, row)) for row in embedding])
normed_embedding = embedding / row_norms[:, None]

# The last `nb_unknown_words` rows of the matrix are treated as if they were
# outside it: they are searched for substitutes and never offered as one.
nb_unknown_words = 100
inside_limit = vocab_size - nb_unknown_words

glove_match = []
for w, idx in word2idx.items():
    # Only alphabetic, GloVe-known words at or beyond the inside limit need a substitute.
    if idx < inside_limit or not w.isalpha() or w not in word2glove:
        continue
    query = glove_embedding_weights[glove_index_dict[word2glove[w]], :].copy()
    query /= np.sqrt(np.dot(query, query))
    # Cosine score of the query against every candidate row inside the limit.
    score = np.dot(normed_embedding[:inside_limit], query)
    while True:
        best = score.argmax()
        similarity = score[best]
        if similarity < glove_thr:
            # Even the best remaining candidate is too far away: no substitute.
            break
        if idx2word[best] not in word2glove:
            # That row was not taken from GloVe; mask it and try the next best.
            score[best] = -1
            continue
        glove_match.append((w, best, similarity))
        break
# Best matches first.
glove_match.sort(key=lambda match: -match[2])
print()
print('# of GloVe substitutes found: {:,}'.format(len(glove_match)))



# of GloVe substitutes found: 145,438


In [80]:
# Sanity check: the normalised matrix has the same shape as `embedding`,
# i.e. (vocab_size, embedding_dim).
normed_embedding.shape

(40000, 100)

In [81]:
# Eyeball the ten lowest-scoring accepted substitutions (the list is sorted
# best-first, so they sit at the end) to judge whether the threshold is sane.
for word, replacement_idx, similarity in glove_match[-10:]:
    print('{:.4f}'.format(similarity), word, '=>', idx2word[replacement_idx])

0.5000 zyrtec => immunotherapy
0.5000 colonizing => migrating
0.5000 cheapskate => dude
0.5000 reoffend => 2014/15
0.5000 toadfish => braf
0.5000 maia => joanna
0.5000 jammy => fruity
0.5000 neurogenetics => mhealth
0.5000 capsa => astra
0.5000 tesa => fitbit


In [86]:
# Base name shared by the output files written from here on.
FN = 'vocabulary-embedding'
# Lookup table: id of an out-of-matrix word -> embedding row of its substitute.
glove_idx2idx = {word2idx[w]: embedding_idx for w, embedding_idx, _ in glove_match}

# Persist the embedding matrix together with all lookup tables (pickle protocol 2).
with open('./data/{}.pkl'.format(FN), 'wb') as out_file:
    pickle.dump((embedding, idx2word, word2idx, glove_idx2idx), out_file, protocol=2)

# Data
# Encode each headline (Y) and each article body (X) as the list of ids of its
# whitespace-separated tokens; a token missing from word2idx raises KeyError.
Y = [list(map(word2idx.__getitem__, headline.split())) for headline in titles]

X = [list(map(word2idx.__getitem__, body.split())) for body in contents]

# Store the encoded corpus next to the embedding file (pickle protocol 2).
with open('./data/{}.data.pkl'.format(FN), 'wb') as out_file:
    pickle.dump((X, Y), out_file, protocol=2)