# Visualize documents and their differences

In [18]:
import ipykernel
# Display the path of the running kernel's connection file
# (e.g. to attach an external console to this kernel).
ipykernel.get_connection_file()

'/run/user/1000/jupyter/kernel-57ac752b-6d1a-4975-881f-26db45a2b774.json'

In [None]:
# NOTE(review): looks like a scratch assignment; `foo` is rebound to a set
# of neighbour indices further down in the notebook.
foo = "string"

## Utility functions

In [None]:
from sklearn.manifold import TSNE
import gensim
from loadCorpus import loadModel
from random import randint
import matplotlib

def drawEmbeddingSamples(num=1000, model=None):
    """Draw ``num`` distinct document embeddings uniformly at random.

    Args:
        num: Number of documents to sample (without replacement).
        model: A gensim ``Doc2Vec`` model. If anything else is passed
            (including ``None``), the model returned by
            ``loadModel(dim=600)`` is used instead.

    Returns:
        dict mapping each sampled doctag to its vector in ``model.docvecs``.

    Raises:
        ValueError: if ``num`` is negative or larger than the number of
            doctags in the model.
    """
    from random import sample

    if not isinstance(model, gensim.models.doc2vec.Doc2Vec):
        model = loadModel(dim=600)
    doctags = list(model.docvecs.doctags)
    # random.sample draws without replacement and uniformly. The previous
    # pop()-based loop subtracted the iteration counter from a list that
    # had already shrunk, which biased the draw towards early doctags and
    # failed as soon as num exceeded roughly half of the corpus.
    return {tag: model.docvecs[tag] for tag in sample(doctags, num)}

def tsne(docs, metric='euclidean'):
    """Project ``docs`` onto two dimensions with scikit-learn's t-SNE.

    Args:
        docs: The vectors to embed, passed straight to ``fit_transform``.
        metric: Distance metric forwarded to ``TSNE``.

    Returns:
        Whatever ``TSNE.fit_transform`` returns for ``docs``.
    """
    # random_state is fixed so repeated runs give the same layout
    reducer = TSNE(n_components=2, random_state=0, metric=metric)
    embedding = reducer.fit_transform(docs)
    return embedding

def randomColors(N=5, candidates=None):
    """Pick ``N`` colours at random, with replacement.

    Args:
        N: Number of colours to return.
        candidates: Sequence of colour names to draw from. When omitted,
            all names in ``matplotlib.colors.cnames`` are used.

    Returns:
        list with ``N`` entries taken from ``candidates``.

    Raises:
        ValueError: if ``candidates`` is empty and ``N`` is positive.
    """
    # Resolve the default lazily: a list built in the signature is created
    # once at definition time and shared between all calls.
    if candidates is None:
        import matplotlib.colors
        candidates = list(matplotlib.colors.cnames.keys())
    return [candidates[randint(0, len(candidates)-1)] for _ in range(N)]

def myCosine(X, Y=None, metric='cosine', n_jobs=1, **kwds):
    """Pairwise distances with negative entries clipped to zero.

    Thin wrapper around ``sklearn.metrics.pairwise_distances``; all
    arguments are forwarded unchanged.

    Args:
        X: First collection of vectors.
        Y: Optional second collection of vectors.
        metric: Metric name or callable understood by ``pairwise_distances``.
        n_jobs: Number of parallel jobs.
        **kwds: Extra keyword arguments for ``pairwise_distances``.

    Returns:
        The distance matrix with every value below zero replaced by zero.
    """
    # Imported locally because no cell of the notebook imports it; the
    # leftover pdb breakpoint that used to sit here has been removed.
    from sklearn.metrics import pairwise_distances

    dists = pairwise_distances(X, Y=Y, metric=metric, n_jobs=n_jobs, **kwds)
    # presumably guards against tiny negative values caused by round-off
    dists[dists < 0] = 0
    return dists

## Load the model and pick the desired documents

In [None]:
from loadCorpus import loadModel
# Load the model once and keep a handle on its document vectors; the
# following cells read both `model` and `docvecs`.
model = loadModel(dim=600)
docvecs = model.docvecs
#X,y = loadCorpus(dim=100, model=model)

# Embeddings of the four documents compared below, looked up by file name.
fuest2007 = docvecs['550246665.pdf']
fuest1999 =  docvecs['cesifo_wp215.pdf']
sinn2007 = docvecs['555918033.PDF']
sinn2000 = docvecs['cesifo_wp307.pdf']

### Fuest's difference +/- Sinn's documents

In [None]:
from scipy.spatial.distance import cosine

# size of the random background sample plotted behind the selected vectors
numSamples = 5000

# difference vector between Fuest's 1999 and 2007 documents
fuestDiff = fuest1999 - fuest2007
# snm -> Sinn, New, Minus fuestDiff
# som -> Sinn, Old, Minus fuestDiff
# snp -> Sinn, New, Plus fuestDiff
# sop -> Sinn, Old, Plus fuestDiff
snm = sinn2007 - fuestDiff
som = sinn2000 - fuestDiff
snp = sinn2007 + fuestDiff
sop = sinn2000 + fuestDiff

# The three lists below are parallel: vector, legend label, colour.
docVecsSmF = [snm, som, snp, sop, fuestDiff, fuest2007, fuest1999,
              sinn2007, sinn2000]
# The label of the difference vector has to match how fuestDiff is
# computed above (1999 minus 2007), otherwise the legend is misleading.
docVecsSmFMapping = ['snm', 'som', 'snp', 'sop', 'fuest1999 - fuest2007',
                     'fuest2007', 'fuest1999', 'sinn2007', 'sinn2000']
docVecsSmFColors = ['red'] * 4 + ['yellow',
                                  '#00ccff', # light blue
                                  '#0000cc', # blue
                                  '#66ff33', # light green
                                  '#39ac73'] # green

randomSamples = list(drawEmbeddingSamples(numSamples, model=model).values())

# selected vectors first, random samples after them; the plotting cell
# slices the t-SNE result at len(docVecsSmF)
docs = docVecsSmF + randomSamples
docColors = docVecsSmFColors + ['black'] * numSamples

In [None]:
# Embed the selected vectors together with the random samples in 2-D.
tsneDocs = tsne(docs)

In [None]:
# t-SNE plot: selected vectors as large coloured dots on top of a faint
# cloud of random documents
%pylab inline
import matplotlib.pyplot as plt

# the first len(docVecsSmF) rows belong to the selected vectors,
# everything after that to the random samples
tsneIfo = tsneDocs[:len(docVecsSmF), :]
tsneSamples = tsneDocs[len(docVecsSmF):, :]

handles = []
plt.figure(figsize=(15, 12), dpi=100)
# one scatter call per selected vector so each gets its own legend handle;
# zip stops after the selected vectors, so only their colours are consumed
for docVecs, c in zip(tsneIfo, docColors):
    handles.append(plt.scatter(docVecs[0], docVecs[1], s=250,\
                   c=c, alpha=.5, cmap=plt.cm.Spectral))

# NOTE(review): this handle has no matching entry in docVecsSmFMapping,
# so the random samples presumably do not show up in the legend.
handles.append(plt.scatter(tsneSamples[:, 0], tsneSamples[:, 1], s=50, c='black',\
                           alpha=.05, cmap=plt.cm.Spectral))
plt.title("t-SNE")
plt.legend(handles, docVecsSmFMapping)

plt.show()

### Sinn's difference +/- Fuest's documents

In [None]:
# difference vector between Sinn's 2000 and 2007 documents
sinnDiff = sinn2000 - sinn2007
# fnm -> Fuest, New, Minus sinnDiff
# fom -> Fuest, Old, Minus sinnDiff
# fnp -> Fuest, New, Plus sinnDiff
# fop -> Fuest, Old, Plus sinnDiff
fnm = fuest2007 - sinnDiff
fom = fuest1999 - sinnDiff
fnp = fuest2007 + sinnDiff
fop = fuest1999 + sinnDiff

# The three lists below are parallel: vector, legend label, colour.
# The fifth entry is sinnDiff: this section is about Sinn's difference and
# the legend labels it 'sinn2000 - sinn2007' (it used to hold fuestDiff,
# a copy-paste leftover from the previous section).
docVecsFmS = [fnm, fom, fnp, fop, sinnDiff, fuest2007, fuest1999,
              sinn2007, sinn2000]
docVecsFmSMapping = ['fnm', 'fom', 'fnp', 'fop', 'sinn2000 - sinn2007',
                     'fuest2007', 'fuest1999', 'sinn2007', 'sinn2000']
docVecsFmSColors = ['red'] * 4 + ['yellow',
                                  '#00ccff', # light blue
                                  '#0000cc', # blue
                                  '#66ff33', # light green
                                  '#39ac73'] # green

randomSamples = list(drawEmbeddingSamples(numSamples, model=model).values())

# selected vectors first, random samples after them; the plotting cell
# slices the t-SNE result at len(docVecsFmS)
docs = docVecsFmS + randomSamples
docColors = docVecsFmSColors + ['black'] * numSamples

In [None]:
# Embed the selected vectors together with the random samples in 2-D.
tsneDocs = tsne(docs)

In [None]:
# t-SNE plot: selected vectors as large coloured dots on top of a faint
# cloud of random documents
%pylab inline
import matplotlib.pyplot as plt

# the first len(docVecsFmS) rows belong to the selected vectors,
# everything after that to the random samples
tsneIfo = tsneDocs[:len(docVecsFmS), :]
tsneSamples = tsneDocs[len(docVecsFmS):, :]

handles = []
plt.figure(figsize=(15, 12), dpi=100)
# one scatter call per selected vector so each gets its own legend handle;
# zip stops after the selected vectors, so only their colours are consumed
for docVecs, c in zip(tsneIfo, docColors):
    handles.append(plt.scatter(docVecs[0], docVecs[1], s=250,\
                   c=c, alpha=.5, cmap=plt.cm.Spectral))

# NOTE(review): this handle has no matching entry in docVecsFmSMapping,
# so the random samples presumably do not show up in the legend.
handles.append(plt.scatter(tsneSamples[:, 0], tsneSamples[:, 1], s=50, c='black',\
                           alpha=.05, cmap=plt.cm.Spectral))
plt.title("t-SNE")
plt.legend(handles, docVecsFmSMapping)

plt.show()

# t-SNE by authors

In [None]:
topUSAuthors = {
    'James J. Heckman': {
        'color': 'red',
        'tsne': [],
        'docs': ["791344649.pdf.json",
            "wp03-04.pdf.json",
            "664338003.pdf.json",
            "dp8827.pdf.json",
            "728401312.pdf.json",
            "dp8027.pdf.json",
            "cesifo_wp1031.pdf.json",
            "dp8711.pdf.json",
            "dp7750.pdf.json",
            "732416388.pdf.json",
            "558854818.pdf.json",
            "wp03-13.pdf.json",
            "632184426.pdf.json",
            "632185252.pdf.json",
            "490361323.pdf.json",
            "cesifo_wp1014.pdf.json",
            "dp7550.pdf.json",
            "732553164.pdf.json",
            "66523810X.pdf.json",
            "362526656.pdf.json",
            "374816565.pdf.json",
            "dp7628.pdf.json",
            "490471781.pdf.json",
            "687928893.pdf.json",
            "484623192.pdf.json",
            "801006074.pdf.json",
            "717449475.pdf.json",
            "dp8200.pdf.json",
            "675944805.pdf.json",
            "dp9476.pdf.json",
            "wp02-02.pdf.json",
            "wp03-09.pdf.json",
            "801004721.pdf.json",
            "715979043.pdf.json",
            "665267223.pdf.json",
            "749691735.pdf.json",
            "632183454.pdf.json",
            "idb-wp_430.pdf.json",
            "716017407.pdf.json",
            "dp7552.pdf.json",
            "wp03-17.pdf.json",
            "pp17.pdf.json",
            "dp9247.pdf.json",
            "558856314.pdf.json",
            "727557017.pdf.json",
            "dp8548.pdf.json",
            "757433537.pdf.json",
            "dp8424.pdf.json",
            "659505134.pdf.json",
            "dp8338.pdf.json",
            "dp8696.pdf.json",
            "664348866.pdf.json",
            "663112346.pdf.json",
            "wp1408.pdf.json",
            "607538317.pdf.json",
            "362626855.pdf.json",
            "638495555.pdf.json"]
    },
    'Barry Julian Eichengreen': {
        'color': 'fuchsia',
        'tsne': [],
        'docs': ["730571912.pdf.json",
            "wp-004.pdf.json",
            "idb-wp_558.pdf.json",
            "360795722.pdf.json",
            "510276172.pdf.json",
            "796867208.pdf.json",
            "77656370X.pdf.json",
            "ewp-262.pdf.json",
            "656407387.pdf.json",
            "590225650.PDF.json",
            "61801697X.pdf.json",
            "642338310.pdf.json",
            "729180689.pdf.json"],
    },
    'Daron Acemoglu': {
        'color': 'gold',
        'tsne': [],
        'docs': ["477687644.pdf.json",
            "686844475.pdf.json",
            "833124862.pdf.json",
            "dp9068.pdf.json",
            "612963969.pdf.json",
            "567037134.pdf.json",
            "dp7906.pdf.json",
            "cesifo_wp5366.pdf.json",
            "VfS_2010_pid_558.pdf.json"]
    },
    'Joseph E. Stiglitz': {
        'color': 'sienna',
        'tsne': [],
        'docs': ["640462014.pdf.json",
            "576782793.pdf.json",
            "309202949.pdf.json",
            "51214043X.pdf.json",
            "771928769.pdf.json",
            "826742238.pdf.json"]
    },
    'Christopher F Baum': {
        'color': 'blue',
        'tsne': [],
        'docs': ["dp633.pdf.json",
            "dp634.pdf.json",
            "dp638.pdf.json",
            "dp410.pdf.json",
            "623004666.pdf.json",
            "772388652.pdf.json",
            "dp635.pdf.json",
            "dp443.pdf.json",
            "dp0410.pdf.json",
            "diw_finess_03030.pdf.json",
            "595251412.PDF.json",
            "606801979.pdf.json"]
    },
    'Carmen M. Reinhart': {
        'color': 'darkgoldenrod',
        'tsne': [],
        'docs': ["cesifo_wp5422.pdf.json",
            "idb-wp_457.pdf.json",
            "idb-wp_458.pdf.json",
            "732720230.pdf.json",
            "lmu-mdp_2014-49.pdf.json",
            "idb-wp_302.pdf.json",
            "687820979.pdf.json"]
    },
    'Thomas J. Sargent': {
        'color': 'white',
        'tsne': [],
        'docs': ["wp2003-14.pdf.json",
            "200528dkp.pdf.json",
            "wp2003-25.pdf.json",
            "383913152.PDF.json",
            "505119463.pdf.json",
            "wp481.pdf.json",
            "505119412.pdf.json",
            "wp2005-09.pdf.json",
            "82835975X.pdf.json",
            "wp2004-22.pdf.json",
            "591928027.pdf.json",
            "572292899.pdf.json"]
    },
    'M Hashem Pesaran': {
        'color': 'green',
        'tsne': [],
        'docs': ["736674640.pdf.json",
            "cesifo1_wp1599.pdf.json",
            "734622074.pdf.json",
            "cesifo_wp5428.pdf.json",
            "660761904.pdf.json",
            "666546231.pdf.json",
            "715801236.pdf.json",
            "59283526X.PDF.json",
            "cesifo_wp4508.pdf.json",
            "cesifo_wp995.pdf.json",
            "559866755.pdf.json",
            "604523742.pdf.json",
            "559459076.pdf.json",
            "555510999.pdf.json",
            "cesifo_wp4834.pdf.json",
            "720705274.pdf.json",
            "515328278.PDF.json",
            "538177373.PDF.json",
            "66205587X.pdf.json",
            "517049996.PDF.json",
            "568422740.PDF.json",
            "200627dkp.pdf.json",
            "cesifo_wp4371.pdf.json",
            "720581133.pdf.json",
            "cesifo_wp5434.pdf.json",
            "cesifo_wp990.pdf.json",
            "dp1196.pdf.json",
            "cesifo1_wp1425.pdf.json",
            "727117904.pdf.json",
            "548147493.pdf.json",
            "564825875.PDF.json",
            "cesifo_wp869.pdf.json",
            "IDB-WP-510.pdf.json",
            "551460059.pdf.json",
            "68520281X.pdf.json",
            "cesifo1_wp1650.pdf.json",
            "538347554.PDF.json",
            "715937049.pdf.json",
            "cesifo1_wp1477.pdf.json",
            "51578544X.pdf.json",
            "715931490.pdf.json",
            "555968669.PDF.json",
            "cesifo_wp4592.pdf.json",
            "544121406.pdf.json",
            "55799005X.PDF.json",
            "717914593.pdf.json",
            "627338070.pdf.json",
            "559866593.pdf.json",
            "cesifo1_wp1416.pdf.json",
            "economics_2007-3.pdf.json",
            "dp1108.pdf.json",
            "617482330.pdf.json",
            "665579225.pdf.json",
            "551074620.pdf.json",
            "532017501.pdf.json",
            "715366505.pdf.json",
            "619071087.pdf.json",
            "cesifo1_wp1233.pdf.json",
            "570161258.PDF.json",
            "732540674.pdf.json",
            "dp1236.pdf.json",
            "cesifo1_wp1438.pdf.json",
            "626619505.pdf.json",
            "200542dkp.pdf.json",
            "517025035.PDF.json",
            "cesifo1_wp1548.pdf.json",
            "538034203.PDF.json",
            "cesifo1_wp1565.pdf.json",
            "cesifo1_wp1237.pdf.json",
            "cesifo_wp1176.pdf.json",
            "661602680.pdf.json",
            "514746963.pdf.json",
            "560540418.PDF.json",
            "529380676.PDF.json",
            "cesifo1_wp1531.pdf.json",
            "715726412.pdf.json",
            "cesifo_wp5410.pdf.json",
            "615085288.pdf.json",
            "cesifo1_wp1358.pdf.json",
            "71745228X.pdf.json",
            "cesifo1_wp1659.pdf.json",
            "cesifo_wp4433.pdf.json",
            "518893278.PDF.json",
            "cesifo1_wp1331.pdf.json",
            "52873816X.PDF.json",
            "cesifo_wp4983.pdf.json",
            "559087853.PDF.json",
            "538378069.pdf.json",
            "cesifo_wp4871.pdf.json",
            "587537787.pdf.json",
            "cesifo_wp4736.pdf.json",
            "cesifo_wp4822.pdf.json",
            "612935043.pdf.json",
            "cesifo_wp5367.pdf.json",
            "631014306.pdf.json",
            "559090684.PDF.json",
            "62965543X.pdf.json",
            "cesifo_wp1169.pdf.json",
            "cesifo_wp4232.pdf.json",
            "kap1366.pdf.json",
            "cesifo1_wp1229.pdf.json",
            "516953184.PDF.json",
            "dp1240.pdf.json",
            "690002343.pdf.json",
            "631381295.pdf.json",
            "685277747.pdf.json",
            "557255945.PDF.json",
            "615344860.pdf.json",
            "538294833.PDF.json",
            "cesifo_wp374.pdf.json",
            "548147248.pdf.json",
            "cesifo_wp4807.pdf.json",
            "659395266.pdf.json",
            "dp1313.pdf.json",
            "669983144.pdf.json",
            "669863831.pdf.json",
            "dp2007-7.pdf.json",
            "cesifo1_wp1308.pdf.json",
            "558342329.PDF.json"]
    }    
}

In [None]:
import itertools
from loadCorpus import loadModel

# Reuse the model from an earlier cell when there is one, otherwise load it.
try:
    if not isinstance(model, gensim.models.doc2vec.Doc2Vec):
        model = loadModel(dim=600)
except NameError:
    model = loadModel(dim=600)
# Take the document vectors from the model actually in use. Relying on a
# `docvecs` left behind by another cell fails with a NameError (or reads a
# stale object) whenever the model had to be loaded right here.
docvecs = model.docvecs

# random background documents
corpus = list(drawEmbeddingSamples(num=9000, model=model).values())

# flatten the datastructure to obtain all documents' file names;
# the doctags carry no ".json" suffix, so it is stripped
topUSAuthorsDocs = [doc.replace(".json", "")
                    for doc in itertools.chain.from_iterable(
                        author['docs'] for author in topUSAuthors.values())]
topUSAuthorsDocEmbeddings = [docvecs[doc] for doc in topUSAuthorsDocs]

# merging samples and the selected docs: authors' documents first, so the
# plotting cell can slice them off the front of the t-SNE result
corpus = topUSAuthorsDocEmbeddings + corpus

In [None]:
# Embed the authors' documents together with the random samples in 2-D.
tsneCorpus = tsne(corpus)

In [None]:
# plot the result
%pylab inline
import matplotlib.pyplot as plt
import numpy as np

# the first len(topUSAuthorsDocs) rows are the authors' documents,
# the remaining rows are the random samples
tsneUSAuthors = tsneCorpus[:len(topUSAuthorsDocs), :]
tsneSamples = tsneCorpus[len(topUSAuthorsDocs):, :]
# only referenced by the commented-out random colour assignment below
preferedColors = ['green', 'mediumorchid', 'darkgoldenrod', 'blue', \
                  'sienna', 'gold', 'fuchsia', 'red', 'white']
# assign color to author and
# assign tsne representations to documents
# (walks the dict in the same order in which the documents were flattened,
# handing each author the next len(values['docs']) rows)
docsProcessed = 0
for author, values in topUSAuthors.items():
    #values['color'] = randomColors(N=1, candidates=preferedColors)
    
    numDocs = len(values['docs'])
    values['tsne'] = tsneUSAuthors[docsProcessed:(docsProcessed+numDocs)]
    docsProcessed += numDocs
    
handles = []
plt.figure(figsize=(20, 20), dpi=100)
# one scatter call per author so every author gets one legend entry
for author, values in topUSAuthors.items():
    handles.append(plt.scatter(values['tsne'][:, 0], values['tsne'][:, 1], s=250,\
                   c=values['color'], alpha=.5, cmap=plt.cm.Spectral))

# NOTE(review): the legend below lists author names only, so this handle
# for the random samples has no label.
handles.append(plt.scatter(tsneSamples[:, 0], tsneSamples[:, 1], s=50, c='black',\
                           alpha=.05, cmap=plt.cm.Spectral))
plt.title("t-SNE")
plt.legend(handles, [author for author in topUSAuthors.keys()])


plt.show()

## Notes

## cluster @-8,.5
### statistical models
M Hashem Pesaran -> cesifo1_wp1599.pdf -> models of expectations formation, survey data, heterogeneity, tests of rational expectations
M Hashem Pesaran -> 734622074.pdf -> dynamic discrete choice, fixed effects, panel data, initial values, GMM, CMLE
James J. Heckman -> 632185252.pdf -> Correlated random coefficient, testing, instrumental variables, power of tests based on IV

## cluster @0,3
### risk management/natural resources/oil and gas
M Hashem Pesaran -> cesifo_wp995.pdf -> risk management, economic interlinkages, loss forecasting, default correlation 
M Hashem Pesaran -> dp2007-7.pdf -> Global VAR, interdependencies, Fisher relationship, Uncovered Interest Rate Parity, Purchasing Power Parity, persistence profile, error variance decomposition
M Hashem Pesaran -> 690002343.pdf -> growth models, long run and error correcting relations, major oil exporters, OPEC member countries, oil exports and foreign output shocks.

## cluster @6.5,-4
### financial risk/ currencies
Barry Julian Eichengreen -> 729180689.pdf -> Tackling systemic financial risk
Barry Julian Eichengreen -> 642338310.pdf -> The Federal Reserve, the Bank of England, and the Rise of the Dollar as an International Currency, 1914-1939

# Doc2Vec arithmetic

In [None]:
def indicesToDoctags(indices, model=None):
    """Map document indices to their doctags.

    Args:
        indices: Iterable of indices into ``model.docvecs``.
        model: Object whose ``docvecs.index_to_doctag`` resolves an index.

    Returns:
        dict ``{index: doctag}`` with one entry per distinct index.

    Raises:
        TypeError: if no model is given.
    """
    # identity check: `==` would dispatch to the model's own __eq__
    if model is None:
        raise TypeError('A model is required')
    return {i: model.docvecs.index_to_doctag(i) for i in indices}

In [None]:
from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors

# load the model only if no earlier cell has defined it
try:
    model
except NameError:
    model = loadModel(dim=600)

# NOTE(review): `docvecs` is not (re)bound in this cell, so it has to
# exist from an earlier cell even when the model was loaded just above.
# The neighbour search uses NearestNeighbors' default metric; the `cosine`
# import of this cell is not used here.
nbrs = NearestNeighbors(n_neighbors=10)
nbrs.fit(list(docvecs))
    
X = docvecs['68320193X.pdf']
Y = docvecs['525031359.pdf']

# difference and sum of the two document vectors
Z1 = X - Y
Z2 = X + Y

# the ten documents closest to each synthetic vector
# (kneighbors expects a 2-D array, hence the reshape)
disZ1, indZ1 = nbrs.kneighbors(Z1.reshape(1,-1))
disZ2, indZ2 = nbrs.kneighbors(Z2.reshape(1,-1))

In [None]:
# neighbour indices of the difference and of the sum as sets;
# neither set is used again in this cell
foo = set(list(indZ1[0]))
bar = set(list(indZ2[0]))

# NOTE(review): list() over the returned dict yields its keys, i.e. the
# indices again; if the doctags are wanted this presumably needs .values().
list(indicesToDoctags(indZ1.flat, model=model))

# pick random docs and compute distances

In [None]:
from random import randrange, sample
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cdist, cosine

sampleSize = 1000

# Draw distinct document indices. Sampling with replacement could pick the
# same document twice, and then the "second nearest neighbour" lookup in
# the next cell may return a duplicate of the query document itself.
numDocs = len(docvecs)
randRange = sample(range(numDocs), min(sampleSize, numDocs))
randDocs = docvecs[randRange]
# full pairwise cosine distance matrix in one vectorised call instead of
# sampleSize**2 separate cosine() calls
distances = cdist(randDocs, randDocs, metric='cosine')

# Find two close and one distant doc

In [None]:
# far distance docs: row/column of the largest entry in the distance
# matrix, i.e. positions within randDocs of the two most distant documents
X, A = np.unravel_index(distances.argmax(), distances.shape)

# nearby docs: with two neighbours requested, the second hit is taken as
# the closest other document
# NOTE(review): this assumes the first hit is the query document itself
nbrs = NearestNeighbors(n_neighbors=2)
nbrs.fit(list(randDocs))
_, indices = nbrs.kneighbors(randDocs[X].reshape(1,-1))
Y = indices[0][1]


# translate positions within the sample into (index in docvecs, doctag);
# X, Y and A are rebound from plain positions to these tuples
X = (randRange[X], docvecs.index_to_doctag(randRange[X]))
Y = (randRange[Y], docvecs.index_to_doctag(randRange[Y]))
A = (randRange[A], docvecs.index_to_doctag(randRange[A]))

# Explain difference between X and Y using wordvectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def getSimilarities(a, model=None):
    '''
    Lazily yield (word, cosine similarity to a) for every word in
    model.vocab.

    a is reshaped to a single row; each word vector is read via
    model[word].

    Raises TypeError right at call time if no model is passed. (As a
    generator function the check used to run only once the first item
    was requested.)
    '''
    if model is None:
        raise TypeError('You need to pass a model')

    query = a.reshape(1, -1)
    # TODO: Can I get rid of the for loop?
    return ((word,
             cosine_similarity(query, model[word].reshape(1, -1)).flatten()[0])
            for word in model.vocab.keys())

        
def findClosestWordVec(X, Y, model=None):
    '''
    Rank the model's vocabulary by cosine similarity to Y - X, i.e. by how
    well adding the word vector to X would move it towards Y.

    Returns a list of (word, similarity) tuples, most similar first.
    Raises TypeError if no model is passed.
    '''
    if model is None:
        raise TypeError('You need to pass a model')

    diff = Y - X
    # tqdm only wraps the generator to show progress
    sims = list(tqdm(getSimilarities(diff, model=model)))
    # descending by similarity; the sort is stable, so ties keep their
    # vocabulary order
    sims.sort(key=lambda pair: -pair[1])
    return sims

In [None]:
from scipy.spatial.distance import cosine
from tqdm import tqdm
import logging

logging.getLogger().setLevel(logging.INFO)

# Greedy search: repeatedly add the word vector that brings X closer to Y.
# NOTE(review): X and Y are rebound here from (index, doctag) tuples to
# vectors, so running this cell a second time without re-running the cell
# that builds the tuples will not work as intended.
X = docvecs[X[0]]
Y = docvecs[Y[0]]
newX = X
path = [] # keeps track of the vectors that make up the difference

logging.info('initial distance is: {}'.format(cosine(X, Y)))
# at most 20 words are added
for i in tqdm(range(20)):
    # all vocabulary words ranked by similarity to (Y - newX)
    candidates = findClosestWordVec(newX, Y, model=model)
    
    # is the performance better now?
    # if not, check the next candidate
    secondChances = 10
    newWord = ""
    for j in range(0, secondChances):
        newWord = model[candidates[j][0]]
        if cosine(newX + newWord, Y) < cosine(newX, Y):
            logging.info('better performance with: {}'.format(candidates[j][0]))
            newWord = model[candidates[j][0]]
            break
        else:
            logging.info('worse performance with: {}'.format(candidates[j][0]))
    else:
        # for/else: reached only when none of the candidates improved the
        # distance; this ends the whole search
        logging.info('Optimization stopped after {} attempts'.format(secondChances))
        break
    
    
    # j still points at the candidate that was accepted above
    logging.info('{} best word: {}'.format(i, candidates[j][0]))
    
    # path stores the word vector itself, not the word
    path.append(newWord)
    newX = newX + newWord
    logging.info('cosine distance: {}'.format(cosine(newX, Y)))
    

# artificial example/manual algorithm

In [None]:
%pylab inline
import matplotlib.pyplot as plt

# two documents and the vector between them
X = docvecs['VfS_2010_pid_1003.pdf']
A = docvecs['02027.pdf']
diff = A - X

# embed just these three vectors, using the clipped distance helper as metric
grok = tsne(np.array([X,A,diff]), metric=myCosine)
# scale the coordinates up
grok = grok * 10000

# NOTE(review): docColors is not defined in this cell; it is whatever an
# earlier cell left behind and presumably does not match the three points
# plotted here.
plt.scatter(grok[:, 0], grok[:, 1], s=50, c=docColors, alpha=.7, cmap=plt.cm.Spectral)
plt.title("t-SNE")

plt.show()

In [None]:
%pylab inline
import matplotlib.pyplot as plt

def getColorRange(d=4):
    '''
    Return d + 1 grey levels from black to white as '#rrggbb' strings.

    The first entry is always '#000000'; the remaining d entries are
    evenly spaced up to '#ffffff'. d is expected to be a non-negative
    integer.
    '''
    # Each channel is written with two hex digits. A single digit such as
    # 'c' would produce '#ccc', which is shorthand for the much lighter
    # '#cccccc' rather than the intended '#0c0c0c'.
    return ['#000000'] + ['#' + '{:02x}'.format((i * 255) // d) * 3
                          for i in range(1, d + 1)]

X = docvecs['VfS_2010_pid_1003.pdf']
A = docvecs['02027.pdf']

# diff[0] is the full difference A - X; every further entry adds or
# subtracts one hand-picked word vector from the previous entry
diff = []
diff.append(A - X)
diff.append(diff[-1] - model['jobcareer'])
diff.append(diff[-1] - model['thatcarbon'])
diff.append(diff[-1] + model['empire'])
diff.append(diff[-1] + model['goldsmithpinkhamand'])
diff.append(diff[-1] - model['countryimmigrant'])
diff.append(diff[-1] + model['interestsi'])

# embed both documents and all intermediate difference vectors
grok = tsne([X,A,*diff], metric='euclidean')
grok = grok * 10000
# X is red, A is green; the difference vectors get a grey ramp that is
# reversed so that the last step is drawn in black
docColors = ['red', 'green'] + list(reversed(getColorRange(len(diff)-1)))

plt.scatter(grok[:, 0], grok[:, 1], s=500, c=docColors, alpha=1, cmap=plt.cm.Spectral)
plt.title("t-SNE")

plt.show()

# Using Wikipedia Corpus

Inspired by the [gensim tutorial on the English Wikipedia corpus](https://radimrehurek.com/gensim/wiki.html).

In [1]:
import logging
import gensim
import bz2
from os.path import join
# log timestamp, level and message for everything at INFO level and above
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', \
                    level=logging.INFO)

## loading corpus from disk

In [2]:
# directory that holds the wikipedia corpus files
corpus_dir = 'data/wiki_corpus'
# load id->word mapping (the dictionary)
# NOTE(review): the BZ2File handle is never closed explicitly.
id2word = gensim.corpora.Dictionary.load_from_text(
    bz2.BZ2File(join(corpus_dir, '._wordids.txt.bz2')))

In [3]:
from gensim.models.doc2vec import TaggedDocument
from gensim.corpora import WikiCorpus
import json
from tqdm import tqdm
from os.path import isfile
import random

class Document_Generator:
    """
    Reads a wikipedia dump and generates TaggedDocuments as used
    by gensim.

    The dump is converted once into a line-oriented JSON cache file;
    iterating the object afterwards only reads that cache.
    """
    def __init__(self, corpus_file=None, cache_file=None, \
                 p=1, dictionary=None):
        """
        Reads the wikipedia dump passed by corpus_file and, if cache_file
        does not exist yet, writes each article as one JSON line
        ({<name>: {'id': ..., 'text': ...}}) to cache_file, using the
        dictionary provided in dictionary.

        p is the probability with which each cached document is yielded
        during iteration (p=1 yields every document).

        Raises AttributeError if corpus_file, cache_file or dictionary
        is missing.
        """
        # Explicit check instead of assert: asserts are stripped when
        # Python runs with -O. The exception type is kept for callers.
        if corpus_file is None or cache_file is None or dictionary is None:
            raise AttributeError('corpus_file, cache_file and ' +
                                 'dictionary must be provided')

        self.p = p
        self.dictionary = dictionary
        self.cache_file = cache_file
        wiki_corpus = WikiCorpus(corpus_file, lemmatize=False,
                                 dictionary=dictionary)
        wiki_corpus.metadata = True # to get the articles name and ID

        # if the cache file doesn't exist, build it from the dump
        # NOTE(review): an interrupted run leaves a partial cache behind
        # that will be treated as complete next time.
        if not isfile(cache_file):
            with open(cache_file, 'a') as fh:
                for article in tqdm(wiki_corpus.get_texts()):
                    # article is (tokens, (ID, name)) since metadata is on
                    tokens, (ID, name) = article[0], article[1][:2]
                    text = ' '.join(a.decode('utf8') for a in tokens)
                    entry = {name: {'id': ID, 'text': text}}
                    fh.write(json.dumps(entry) + '\n')

    def __iter__(self):
        """ Yields one TaggedDocument per selected cache line; the words
        are the whitespace-split text, the only tag is the article name.
        """
        with open(self.cache_file, 'r') as fh:
            for line in fh:
                entry = json.loads(line)
                if self._true_false_gen():
                    for k in entry.keys():
                        words = entry[k]['text'].split()
                        yield TaggedDocument(words=words, tags=[k])
        # The generator simply ends here. An explicit `raise StopIteration`
        # inside a generator is turned into a RuntimeError by PEP 479.

    def _true_false_gen(self, p = .5):
        """
        Generates True or False with the probability self.p and
        1 - self.p, respectively. The p argument is accepted for
        backward compatibility but not used.
        """
        return random.random() < self.p

In [13]:
# write subset of the corpus to disk: every line of the full cache is
# copied with a probability of 0.1
cache = join(corpus_dir, 'raw', 'all_articles.json')
cache_subset = join(corpus_dir, 'raw', 'all_articles_subset_0.1.json')

def true_false_gen(p = .5):
    """Return True with probability p and False otherwise."""
    return random.random() < p

with open(cache, 'r') as fh_r, open(cache_subset, 'w+') as fh_w:
    for line in tqdm(fh_r):
        if true_false_gen(p=0.1):
            # write(), not writelines(): a single string handed to
            # writelines() is iterated character by character
            fh_w.write(line)



In [5]:
from gensim.models import Doc2Vec
import os

def load_wikipedia_model(path=None):
    """Load a persisted Doc2Vec model if it exists.

    Args:
        path: Location of the saved model.

    Returns:
        The model loaded with ``Doc2Vec.load``, or ``None`` when ``path``
        is ``None`` or does not exist.
    """
    # The default path=None used to crash inside os.path.exists().
    if path is None or not os.path.exists(path):
        return None
    return Doc2Vec.load(path)

In [None]:
import logging
from os.path import join, exists

logging.getLogger().setLevel(logging.INFO)
dim = 1000
corpus_dir = 'data/wiki_corpus/'
model_file_name = '{}_dim_wiki_doc2vec.model'.format(dim)
model_path = join(corpus_dir, model_file_name)
corpus = join(corpus_dir, 'raw',
              'enwiki-20160701-pages-articles-multistream.xml.bz2')
# cache = join(corpus_dir, 'raw', 'all_articles.json')
cache = join(corpus_dir, 'raw', 'all_articles_subset_0.1.json')
# reuse a previously trained model when one has been saved
model = load_wikipedia_model(model_path)

# train from scratch only when no saved model was found
if model is None:
    model = Doc2Vec(size=dim, window=10, min_count=3, workers=4,
                    alpha=0.025, min_alpha=0.025)

    # p=1: every cached document is used in every pass
    doc_iter = Document_Generator(corpus_file=corpus, cache_file=cache,
                                  dictionary=id2word, p=1)
    logging.info('Start building vocabulary')
    model.build_vocab(doc_iter)
    logging.info('Vocabulary built successfully')

    # three passes over the corpus with a manually lowered learning rate
    for i in range(3):
        logging.info('beginning interation #' + str(i) + '\n')
        model.train(doc_iter)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay

    logging.info('Persisting model')
    # save to the same path the model is loaded from above
    model.save(model_path)
    logging.info('done')

INFO:root:Start building vocabulary
INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 16649919 words (3150941/s), 306061 word types, 10000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #20000, processed 29761415 words (3138122/s), 472884 word types, 20000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #30000, processed 40081111 words (3055727/s), 583788 word types, 30000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #40000, processed 48635635 words (3006482/s), 670771 word types, 40000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #50000, processed 56442760 words (2978155/s), 749942 word types, 50000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #60000, processed 63949256 words (3021977/s), 820641 word types, 60000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #70000, p

In [60]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from tqdm import tqdm
import logging

class Wordvec_Approximator:
    '''
    Approximates the difference between two vectors by a sequence of
    word vectors taken from a model.
    '''
    def __init__(self, model=None):
        '''
        model must provide model.vocab and model[word].
        Raises TypeError if no model is passed.
        '''
        if model is None:
            raise TypeError('You need to pass a model')
        self.model = model
        # Optional restricted vocabulary; None means "use model.vocab".
        # Initialised here so word_similarities() and most_similar_words()
        # also work before approximate_difference() has been called.
        self.vocab = None

    def word_similarities(self, a):
        '''
        Get similarities for each word in the corpus

        Yields (word, cosine similarity between a and the word's vector).
        '''
        # if vocab is None, the default vocab of this instance's model is
        # used (not a global variable that happens to be called model)
        vocab = self.model.vocab if self.vocab is None else self.vocab
        query = a.reshape(1, -1)

        for word in vocab.keys():
            sim = cosine_similarity(query, self.model[word]
                                    .reshape(1, -1)).flatten()[0]
            yield word, sim

    def most_similar_words(self, X, Y, top=30):
        '''
        Rank the vocabulary by similarity to Y - X and return the top
        (word, similarity) tuples, most similar first.
        '''
        diff = Y - X
        # tqdm only wraps the generator to show progress
        sims = sorted(tqdm(self.word_similarities(diff)),
                      key=lambda pair: -pair[1])
        return sims[0:top]

    def approximate_difference(self, X, Y, vocab=None):
        '''
        Compute the word2vec similarity path from X to Y

        Greedily adds word vectors to X for as long as that increases the
        cosine similarity to Y (at most 20 words). Returns the list of
        words that were added, in order.

        Side effect: stores vocab on the instance.
        '''
        # if vocab isn't passed, model.vocab will be used
        self.vocab = vocab

        X = X.reshape(1, -1)
        Y = Y.reshape(1, -1)
        newX = X
        path = [] # keeps track of the words that make up the difference

        logging.info('initial distance is: {}'.format(cosine_similarity(X, Y)))
        for i in tqdm(range(20)):
            candidates = self.most_similar_words(newX, Y)
            current = cosine_similarity(newX, Y)

            # is the performance better now?
            # if not, check the next candidate
            secondChances = 10
            # the slice keeps small vocabularies from raising IndexError
            for word, _ in candidates[:secondChances]:
                newWord = self.model[word]
                if cosine_similarity(newX + newWord, Y) > current:
                    logging.info('better performance with: {}'
                                 .format(word))
                    break
                logging.info('worse performance with: {}'
                             .format(word))
            else:
                # no candidate improved the similarity: stop the search
                logging.info('Optimization stopped after {} attempts'
                             .format(secondChances))
                break

            logging.info('{} best word: {}'.format(i, word))
            path.append(word)
            newX = newX + newWord
            logging.info('cosine similarity: {}'.format(cosine_similarity(newX, Y)))
        return path

NameError: name 'X' is not defined