In [153]:
import numpy as np
from scipy.spatial.distance import pdist, cdist
from gensim import utils
from gensim.models import TfidfModel

In [227]:
from gensim.corpora import TextCorpus
# Build a gensim TextCorpus (with its `dictionary`) from the job-description file.
# NOTE(review): TextCorpus reads its input as plain text, presumably one document
# per line -- confirm the CSV has one description per line and no header row.
corpus = TextCorpus('./jobspicker/jobspicker-descriptions.csv')
# Prune the vocabulary in place. Per the gensim docs for `filter_extremes`: drop
# tokens that occur in fewer than 4 documents or in more than 90% of documents,
# then keep at most the 100000 most frequent remaining tokens.
corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000)

In [45]:
from gensim.models.word2vec import Word2Vec
# Load the already-trained Word2Vec model saved as "profiles.model".
# NOTE: gensim's `load` unpickles the file, so only load model files that come
# from a trusted source.
model = Word2Vec.load("profiles.model")

In [266]:
# Default number of word draws per document; used as the `size` default of
# doc_vec and corpus_vec.
DEFAULT_SAMPLE_SIZE = 50

def sample(w, size, pwr = 1.5):
    """Draw `size` items with replacement, weighted by a powered score.

    Args:
        w: 2-D array whose column 0 holds the items to draw (word ids in
            doc_vec) and whose column 1 holds their scores (TF-IDF in doc_vec).
        size: number of draws.
        pwr: exponent applied to the scores before they are normalised.

    Returns:
        Array of `size` values taken from column 0 of `w`.
    """
    items = w[:,0]
    weights = w[:,1]**pwr
    # Dividing by the L1 norm turns the powered scores into probabilities.
    probabilities = weights/np.linalg.norm(weights, 1)
    return np.random.choice(items, size = size, replace = True, p = probabilities)

def get_wv(model, w):
    """Return the vector stored for word `w`, or None if the word is unknown.

    Args:
        model: a model exposing its vectors as `model.wv` (as corpus_vec
            already assumes), or any object that maps words to vectors
            directly (e.g. keyed vectors or a dict).
        w: the word to look up.

    Returns:
        The stored vector, or None when the lookup raises KeyError.
    """
    # Go through `.wv` when it exists: indexing a Word2Vec model directly is
    # deprecated in gensim, and the rest of this file already uses `model.wv`.
    vectors = getattr(model, "wv", model)
    try:
        return vectors[w]
    except KeyError:
        return None
        
def doc_vec(doc, model, corpus, size = DEFAULT_SAMPLE_SIZE, tfidf = None, count = 1):
    """Create a document vector by averaging TF-IDF-sampled word vectors.

    Word ids are drawn `size` times with `sample`, weighted by the document's
    TF-IDF scores. Draws whose word has no vector in `model` are dropped and
    the remaining vectors are averaged.

    Args:
        doc: the document as a list of tokens.
        model: word-vector model, looked up through `get_wv`.
        corpus: object exposing `.dictionary` (used for `doc2bow` and for
            id -> token lookup); also used to fit a TfidfModel when `tfidf`
            is None.
        size: number of word draws per attempt.
        tfidf: fitted TF-IDF transformation, or None to fit one on `corpus`.
        count: number of the first sampling attempt. Attempts are numbered up
            to 5, so the default allows five attempts in total.

    Returns:
        The mean of the sampled word vectors, taken along axis 0.

    Raises:
        KeyError: if the document's TF-IDF bag-of-words is empty, or if on
            every attempt fewer than half of the draws had a vector in `model`.
    """
    max_attempts = 5

    # Test against None explicitly so a supplied model is never replaced just
    # because it happens to evaluate as falsy.
    if tfidf is None:
        tfidf = TfidfModel(corpus)

    scored = tfidf[corpus.dictionary.doc2bow(doc)]

    # With no scored words the array below would be 1-D and `sample` would
    # fail with an unrelated IndexError; report it as the usual KeyError.
    if len(scored) == 0:
        raise KeyError("Cannot find any of these words in the vocabulary: " + " ".join(doc))

    # w is Dx2 array with word id and tfidf score
    w = np.array(scored)

    attempt = count
    while True:
        # sample according to tfidf scores and get vectors,
        # filter all not-found words
        vecs = [get_wv(model, corpus.dictionary[x]) for x in sample(w, size)]
        vecs = [v for v in vecs if v is not None]

        # Just take the mean of the vec of all the sampled words from the document
        if len(vecs) >= .5*size:
            return np.mean(vecs, 0)

        # Handling the cases when we find very few words from a document
        # in our externally trained model vocabulary: resample (the TF-IDF
        # weights above are reused), then give up after the last attempt.
        if attempt >= max_attempts:
            raise KeyError("Cannot find any of these words in the vocabulary: " + " ".join(doc))
        attempt += 1

def corpus_vec(docs, model, corpus, size = DEFAULT_SAMPLE_SIZE):
    """Build an N x D array holding one document vector per entry of `docs`.

    Args:
        docs: sequence of tokenized documents.
        model: word-vector model; D is taken from `model.wv.syn0`.
        corpus: corpus used to fit the TF-IDF model and passed on to doc_vec.
        size: number of word draws per document, passed on to doc_vec.

    Returns:
        Array of shape (len(docs), D); row i is doc_vec of docs[i].
    """
    # Fit TF-IDF once and share it across all documents.
    weights = TfidfModel(corpus)
    n_docs = len(docs)
    dim = model.wv.syn0.shape[1]
    vectors = np.empty((n_docs, dim))
    for row, tokens in enumerate(docs):
        vectors[row,:] = doc_vec(tokens, model, corpus, size, weights)
    return vectors

def get_closest_doc(v, cv, docs):
    """Return the entry of `docs` whose vector is closest to the query document.

    Args:
        v: the query document as a list of tokens. It is embedded with
            `doc_vec`, using the module-level `model` and `corpus`.
        cv: 2-D array of document vectors, where row i belongs to docs[i]
            (as produced by corpus_vec).
        docs: the documents that the rows of `cv` were built from.

    Returns:
        The element of `docs` with the smallest `cdist` distance (scipy's
        default metric is Euclidean) to the query vector.
    """
    # Embed the query that was passed in. This used to read the global `doc`,
    # which silently ignored the `v` argument.
    query = np.array([doc_vec(v, model, corpus)])
    ranking = np.argsort(cdist(query, cv)[0,:])
    return docs[ranking[0]]

In [267]:
# Materialise each corpus document as a list of tokens, then embed them all.
sentences = [list(tokens) for tokens in corpus.get_texts()]
corp_vecs = corpus_vec(sentences, model, corpus)

In [268]:
# Free-text query, split into tokens with gensim's `utils.tokenize`.
doc = list(utils.tokenize("outdoor independent in the wilderness working alone and self-motivated to work with animals wildlife"))
# Look up the corpus document returned for this query and show it as one string.
a = get_closest_doc(doc, corp_vecs, sentences)
" ".join(a)

u'mba or graduate degree proactive and solutions oriented entrepreneurial spirit ability to try different things with minimal direction established track record of creativity and innovation fluency in french experience marketing in canada amazon marketing experience benefitswe have a comprehensive benefits package but here are some of the fun perks free catered lunch daily all the snacks you can eat and happy hour on fridays free onsite gym and fitness reimbursement ruthless ping pong tournaments and seasonal sports clubs volleyball basketball soccer etc endless company sponsored events transit reimbursement paid parking and shuttle service tuition assistance annual amazon discount and much more audible inc is the world s largest seller and producer of spoken audio entertainment information and educational programming since inventing and commercializing the first portable digital audio player in our focus on technological innovation and superior programming has earned us millions of su