In [1]:
from gensim.models import word2vec, Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, LabeledSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import re
import time
import jobs, applicants
import topic_model, nlp_utils

# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [2]:
a = topic_model.get_stop_words()

In [3]:
# Vetorizer params
min_df=20
max_df=.25
max_vocab_size=50000
dim_size=500
ngram_range=(1, 1)

# Vectorizer to be used throughout
vectorizer = TfidfVectorizer(token_pattern=topic_model.get_token_pattern(),
                                              min_df=min_df,
                                              max_df=max_df,
                                              max_features=max_vocab_size,
                                              stop_words=topic_model.get_stop_words(),
                                              ngram_range=ngram_range)

In [8]:
# Get job postings, vectorizer, token generator
df_jobs = jobs.get_job_posting_data()
df_jobs = df_jobs[20000:].reset_index()
print df_jobs.shape
docs = df_jobs.description

## Dont need a token generator anymore if we get implement the doc2vec model 
## with calls to build vocab and train using labeled sentence tokens
# docgen = topic_model.TokenGenerator_skl(vectorizer, docs, topic_model.get_stop_words())
# docgen

Getting job posting data...
- Time: 0.529675960541

(4015, 6)


In [9]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 500)
df_jobs.iloc[0]

index                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        20000
id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [10]:
# Set values for various parameters for doc2vec
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# ----
# Doc2Vec
# ----
In word2vec, the parameter “continuous bag of words” (cbow) and “skip-gram” (sg); 
in the doc2vec architecture, the corresponding algorithms are “distributed memory” (dm) and “distributed bag of words” (dbow)

In [11]:
# Tokenize our docs
d2v_ut = nlp_utils.D2V_Utils(vectorizer)
docs_as_tokens = d2v_ut.get_tokenized_docs(docs)
# Label our doc tokens
labeled_docs = d2v_ut.get_tagged_docs(docs_as_tokens, 'job_posting')

Tokenizing docs...
- Time: 5.44096589088

Creating LabeledSentences...
- Time: 0.018s.


In [24]:
# labeled_docs[0]

In [25]:
print "\nCreating Doc2Vec model...\n"
t1 = time.time()
d2v_model = Doc2Vec(size=300, 
                    dbow_words= 1, 
                    window=10,
                    min_count=5, 
                    workers=4)
d2v_model.build_vocab(labeled_docs)

t1 = time.time()
d2v_model.train(labeled_docs, 
                total_examples=len(labeled_docs),
                epochs=20)
print "- Time: %0.3fs." % (time.time() - t1)

2018-01-09 04:42:18,743 : INFO : collecting all words and their counts
2018-01-09 04:42:18,745 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags



Creating Doc2Vec model...



2018-01-09 04:42:19,072 : INFO : PROGRESS: at example #10000, processed 1053349 words (3228063/s), 26006 word types, 10000 tags
2018-01-09 04:42:19,382 : INFO : PROGRESS: at example #20000, processed 2076333 words (3315229/s), 36904 word types, 20000 tags
2018-01-09 04:42:19,504 : INFO : collected 40557 word types and 24015 unique tags from a corpus of 24015 examples and 2464929 words
2018-01-09 04:42:19,505 : INFO : Loading a fresh vocabulary
2018-01-09 04:42:19,557 : INFO : min_count=5 retains 14061 unique words (34% of original 40557, drops 26496)
2018-01-09 04:42:19,558 : INFO : min_count=5 leaves 2422403 word corpus (98% of original 2464929, drops 42526)
2018-01-09 04:42:19,672 : INFO : deleting the raw counts dictionary of 40557 items
2018-01-09 04:42:19,675 : INFO : sample=0.001 downsamples 11 most-common words
2018-01-09 04:42:19,677 : INFO : downsampling leaves estimated 2402599 word corpus (99.2% of prior 2422403)
2018-01-09 04:42:19,678 : INFO : estimated required memory for

2018-01-09 04:43:31,260 : INFO : PROGRESS: at 39.27% examples, 268658 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:43:32,283 : INFO : PROGRESS: at 39.82% examples, 268626 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:43:33,305 : INFO : PROGRESS: at 40.40% examples, 268747 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:43:34,306 : INFO : PROGRESS: at 40.97% examples, 268821 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:43:35,344 : INFO : PROGRESS: at 41.57% examples, 268630 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:43:36,358 : INFO : PROGRESS: at 42.09% examples, 268883 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:43:37,417 : INFO : PROGRESS: at 42.66% examples, 268870 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:43:38,427 : INFO : PROGRESS: at 43.26% examples, 268896 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:43:39,456 : INFO : PROGRESS: at 43.87% examples, 268978 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:43:40,465 : INFO : PROGRESS: at 44.45% examples, 268996 wor

2018-01-09 04:44:53,353 : INFO : PROGRESS: at 85.41% examples, 270923 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:44:54,357 : INFO : PROGRESS: at 85.99% examples, 270997 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:44:55,396 : INFO : PROGRESS: at 86.61% examples, 270945 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:44:56,432 : INFO : PROGRESS: at 87.12% examples, 271081 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:44:57,462 : INFO : PROGRESS: at 87.73% examples, 271111 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:44:58,497 : INFO : PROGRESS: at 88.36% examples, 271136 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:44:59,530 : INFO : PROGRESS: at 88.94% examples, 271085 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:45:00,566 : INFO : PROGRESS: at 89.51% examples, 271037 words/s, in_qsize 8, out_qsize 0
2018-01-09 04:45:01,578 : INFO : PROGRESS: at 90.09% examples, 271093 words/s, in_qsize 7, out_qsize 0
2018-01-09 04:45:02,582 : INFO : PROGRESS: at 90.64% examples, 271040 wor

- Time: 178.872s.


# Make Recommendation

In [26]:
id_str = 'job_posting_10'
docsim = d2v_model.docvecs.most_similar(id_str)
docsim

2018-01-09 04:45:19,147 : INFO : precomputing L2-norms of doc weight vectors


[('job_posting_6917', 0.7508301734924316),
 ('job_posting_13270', 0.7499496340751648),
 ('job_posting_12357', 0.7456464767456055),
 ('job_posting_5852', 0.7208945155143738),
 ('job_posting_1962', 0.7184603214263916),
 ('job_posting_12246', 0.716362714767456),
 ('job_posting_17451', 0.714232325553894),
 ('job_posting_11426', 0.7122789621353149),
 ('job_posting_8534', 0.7092502117156982),
 ('job_posting_8414', 0.7085880041122437)]

In [11]:
d2v_model.similar_by_word('chef')

  """Entry point for launching an IPython kernel.
2018-01-09 03:34:06,193 : INFO : precomputing L2-norms of word weight vectors


[(u'mediterranean', 0.514112114906311),
 (u'steak', 0.5054657459259033),
 (u'michelin', 0.5049415826797485),
 (u'fish', 0.5048666000366211),
 (u'florentine', 0.5015060305595398),
 (u'opentable', 0.4838712513446808),
 (u'bistro', 0.4608152210712433),
 (u'pasta', 0.4505152702331543),
 (u'sauce', 0.4236881136894226),
 (u'cocktail', 0.41957467794418335)]

# Applicant Recommender

## Applicant resume data query on jobs

In [12]:
# Get applicant data, vectorizer, token generator
df_appl = applicants.get_applicant_data()
print df_appl.shape

df_appl.head()

Getting user and applicant data...
- Time: 4.78302788734

Processing user resume data...
Tokenize failed for user id 16408, error: Failed to parse QName 'http:', line 675, column 179 (line 675)
- Time: 50.5367798805

Done.
(183872, 14)


Unnamed: 0,id,first_name,last_name,parsed_resume_xml,passions,introduction,job_tag_id,tag_type,resume_elements,res_executiveSummary,res_description,res_title,res_competency,res_degree
0,87468,Angelica,Espinoza,"<Resume xmlns=""http://ns.hr-xml.org/2006-02-28...",[],,,,"{u'Competency': u'ANSWERING, CASH, CASHIER, CA...",,"Massage Therapist/Receptionist\n* 30,60 and 80...",,"ANSWERING, CASH, CASHIER, CASHIERING, CREDIT, ...",certification
1,88235,Julie,Nguyen,"<Resume xmlns=""http://ns.hr-xml.org/2006-02-28...",,,,,"{u'Competency': u'BILLING, BIOLOGICAL SCIENCES...",,- Work on an interdisciplinary team to provide...,,"BILLING, BIOLOGICAL SCIENCES, LIAISON, NEUROSC...",bachelors
2,88237,Aparna,Upadhyaya,"<Resume xmlns=""http://ns.hr-xml.org/2006-02-28...","[Cooking, travel, dance, yoga, pets]",,,,"{u'Competency': u'APEX, BENEFITS, CAM, CATHETE...",,● Assisting in providing a comprehensive plan ...,,"APEX, BENEFITS, CAM, CATHETER, CATHETERS, CHEM...",bachelors
3,88249,Janet,Donohue,"<Resume xmlns=""http://ns.hr-xml.org/2006-02-28...",,,,,"{u'Competency': u'A/R, ACCOUNT RECONCILIATION,...",,* Monitor and review credit limit and payment ...,,"A/R, ACCOUNT RECONCILIATION, ACCOUNT RECONCILI...",bachelors
4,150252,,,,[],"I am a hard worker like to learn,teachable,alw...",,,{},,,,,


In [53]:
u = df_appl.iloc[1]
print u

u_res_desc = u.res_description
print u_res_desc

# convert a user resume to a doc2vec TaggedDocument to run on the jobs model
words = u_res_desc.split()
proc_words = [lemmer.lemmatize(w) for w in list(set(analyzer(doc)) - set(sw)) if pattern.match(w)]
labeled_doc = LabeledSentence(words=proc_words, tags=['user_1'])


id                                                                  88235
first_name                                                          Julie
last_name                                                          Nguyen
parsed_resume_xml       <Resume xmlns="http://ns.hr-xml.org/2006-02-28...
passions                                                             None
introduction                                                         None
job_tag_id                                                            NaN
tag_type                                                              NaN
resume_elements         {u'Competency': u'BILLING, BIOLOGICAL SCIENCES...
res_executiveSummary                                                 None
res_description         - Work on an interdisciplinary team to provide...
res_title                                                            None
res_competency          BILLING, BIOLOGICAL SCIENCES, LIAISON, NEUROSC...
res_degree                            

  # Remove the CWD from sys.path while we load stuff.


In [57]:
# d2v_model.similar_by_word(u)
tokens = "Massage Therapist esthetician Receptionist store greeter".split()
# tokens = proc_words

new_vector = d2v_model.infer_vector(tokens)
#new_vector = d2v_model.infer_vector(labeled_doc)
sims = d2v_model.docvecs.most_similar([new_vector]) #gives you top 10 document tags and their cosine similarity
sims

[('SENT_6309', 0.5231666564941406),
 ('SENT_2860', 0.4562077522277832),
 ('SENT_19929', 0.42022114992141724),
 ('SENT_10245', 0.36824649572372437),
 ('SENT_17375', 0.3603281080722809),
 ('SENT_9134', 0.35061541199684143),
 ('SENT_7316', 0.3451305627822876),
 ('SENT_16119', 0.34335872530937195),
 ('SENT_15130', 0.3404138684272766),
 ('SENT_2244', 0.3388136923313141)]

In [61]:
d2v_model.similar_by_word('massage')

  """Entry point for launching an IPython kernel.


[(u'esthetician', 0.6112874150276184),
 (u'lash', 0.5931062698364258),
 (u'cosmetologist', 0.5857604146003723),
 (u'camtc', 0.5329710841178894),
 (u'mani', 0.521687924861908),
 (u'eyebrow', 0.5181446671485901),
 (u'facial', 0.514720618724823),
 (u'manicure', 0.5129488706588745),
 (u'grooming', 0.5102207660675049),
 (u'complimentary', 0.49629977345466614)]

In [62]:
df_jobs.iloc[19929].description

'Helen Miller Bridal flagship store located in San Francisco is looking for a Bridal Stylist/Sales to join our team.  \nHere, at Helen Miller Bridal, we strive to make each experience with every bride one to remember. Customer Service & Sales Experience are not required but greatly appreciated. This is a part-time position starting out and mostly on weekends, but has the potential to grow into a full-time position.'

## Applicant to applicant

In [87]:
# Preprocess the appicant resumes as labeled doc tokens
docs = df_appl.res_description
docs_as_tokens = get_tokenized_docs(docs)
labeled_docs = get_labeled_sentences(docs_as_tokens, 'appl_')

Tokenizing docs
- Time: 14.8597640991

Creating LabeledSentences...
- Time: 0.302s.


  


In [90]:
print len(labeled_docs)
labeled_docs[0]

13869


TaggedDocument(words=[u'operation', u'material', u'managed', u'office', u'money', u'organized', u'year', u'drawer', u'therapist', u'receptionist', u'entering', u'manner', u'counting', u'technology', u'exam', u'angelica', u'information', u'clerical', u'monitoring', u'answering', u'mid', u'establishment', u'payment', u'form', u'going', u'minute', u'objective', u'consultation', u'administrative', u'correct', u'booking', u'greeting', u'file', u'timely', u'customer', u'patient', u'complete', u'match', u'completed', u'enter', u'paper', u'phone', u'appointment', u'management', u'determination', u'documented', u'beginning', u'appears', u'change', u'exposure', u'concentrated', u'massage', u'lower', u'debit', u'room', u'call', u'level', u'receive', u'client', u'message', u'cash', u'espinoza', u'specific', u'credit', u'filling', u'contact', u'ensure', u'guiding', u'contain', u'card', u'did', u'automatic', u'massage', u'software'], tags=['appl__0'])

In [None]:
print "\nCreating Doc2Vec model...\n"
t1 = time.time()
d2v_appl_model = Doc2Vec(size=300, 
                    dbow_words= 1, 
                    window=10,
                    min_count=5, 
                    workers=4)
d2v_appl_model.build_vocab(labeled_docs)
print "- Time: %0.3fs." % (time.time() - t1)

print "\nCreating Doc2Vec model...\n"
t1 = time.time()
d2v_model.train(labeled_docs, 
                total_examples=len(labeled_docs),
                epochs=20)
print "- Time: %0.3fs." % (time.time() - t1)

2018-01-09 04:22:15,400 : INFO : collecting all words and their counts
2018-01-09 04:22:15,405 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags



Creating Doc2Vec model...



2018-01-09 04:22:15,780 : INFO : PROGRESS: at example #10000, processed 1210693 words (3252550/s), 49509 word types, 10000 tags
2018-01-09 04:22:15,924 : INFO : collected 59845 word types and 13869 unique tags from a corpus of 13869 examples and 1672789 words
2018-01-09 04:22:15,925 : INFO : Loading a fresh vocabulary
2018-01-09 04:22:15,983 : INFO : min_count=5 retains 13722 unique words (22% of original 59845, drops 46123)
2018-01-09 04:22:15,984 : INFO : min_count=5 leaves 1608114 word corpus (96% of original 1672789, drops 64675)
2018-01-09 04:22:16,023 : INFO : deleting the raw counts dictionary of 59845 items
2018-01-09 04:22:16,027 : INFO : sample=0.001 downsamples 22 most-common words
2018-01-09 04:22:16,030 : INFO : downsampling leaves estimated 1551943 word corpus (96.5% of prior 1608114)
2018-01-09 04:22:16,031 : INFO : estimated required memory for 13722 words and 300 dimensions: 59210400 bytes
2018-01-09 04:22:16,074 : INFO : resetting layer weights
2018-01-09 04:22:16,460

# TEST

In [None]:
docs = df_appl.description[0:20000]
docs = df_appl.description

docgen = topic_model.TokenGenerator_skl(vectorizer, docs, topic_model.get_stop_words())
docgen
type(docgen)

In [None]:
# Interesting:
# Check that the document vector after training is different than the one before training. 

# prepare documents by TaggedDocument
# docs = ...

# an article tag
id_str = 'SENT_3'

# initialize a model
model = Doc2Vec(size=300, window=20, min_count=2, workers=8, alpha=0.025, min_alpha=0.01, dm=0)

# build vocabulary
model.build_vocab(labeled_docs)

In [23]:
# get the initial document vector
docvec1 = model.docvecs[0]
docvecsyn1 = model.docvecs.doctag_syn0[0]

# calculate most similar documents
# (the model is not trained, so the results should be wrong)
docsim1 = model.docvecs.most_similar(id_str)

print 'docvec1: %s' % docvec1[:20]
print 'docvecsyn1: %s' % docvecsyn1[:20]
docsim1

docvec1: [ -5.78184205e-04  -1.61719881e-03   1.56266044e-03  -2.00020822e-05
   7.59142451e-04   5.63495560e-04  -3.32428259e-04  -9.13201016e-04
  -1.43482396e-03   9.33754782e-04  -1.60543714e-03   6.80618919e-04
  -1.38343510e-03  -1.50158152e-03   1.46248657e-03  -1.34379754e-03
   6.69001078e-04  -9.09951050e-04  -1.17913017e-03   1.42662937e-03]
docvecsyn1: [ -5.78184205e-04  -1.61719881e-03   1.56266044e-03  -2.00020822e-05
   7.59142451e-04   5.63495560e-04  -3.32428259e-04  -9.13201016e-04
  -1.43482396e-03   9.33754782e-04  -1.60543714e-03   6.80618919e-04
  -1.38343510e-03  -1.50158152e-03   1.46248657e-03  -1.34379754e-03
   6.69001078e-04  -9.09951050e-04  -1.17913017e-03   1.42662937e-03]


[('SENT_22468', 0.2260284423828125),
 ('SENT_16903', 0.21625030040740967),
 ('SENT_13529', 0.2105707973241806),
 ('SENT_9124', 0.21006619930267334),
 ('SENT_23702', 0.20848970115184784),
 ('SENT_2737', 0.2053980678319931),
 ('SENT_6350', 0.2043028175830841),
 ('SENT_9955', 0.2029171884059906),
 ('SENT_13556', 0.19606398046016693),
 ('SENT_13329', 0.19407835602760315)]

In [24]:
# train this model
# model.train(docs, 
#             total_examples=len(docs), 
#             epochs=20)
model.train(labeled_docs, 
            total_examples=len(labeled_docs),
            epochs=20)

2018-01-08 06:16:08,169 : INFO : training model with 8 workers on 23856 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=20
2018-01-08 06:16:09,194 : INFO : PROGRESS: at 0.70% examples, 342635 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:16:10,253 : INFO : PROGRESS: at 1.62% examples, 381370 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:16:11,263 : INFO : PROGRESS: at 2.42% examples, 403432 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:16:12,286 : INFO : PROGRESS: at 3.38% examples, 408861 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:16:13,289 : INFO : PROGRESS: at 4.28% examples, 413580 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:16:14,291 : INFO : PROGRESS: at 5.19% examples, 418564 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:16:15,301 : INFO : PROGRESS: at 6.04% examples, 420089 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:16:16,325 : INFO : PROGRESS: at 6.98% examples, 421666 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:16:17,334 :

2018-01-08 06:17:28,838 : INFO : PROGRESS: at 70.93% examples, 431666 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:17:29,857 : INFO : PROGRESS: at 71.84% examples, 431469 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:17:30,880 : INFO : PROGRESS: at 72.75% examples, 431974 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:17:31,891 : INFO : PROGRESS: at 73.65% examples, 431967 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:17:32,899 : INFO : PROGRESS: at 74.56% examples, 431817 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:17:33,901 : INFO : PROGRESS: at 75.41% examples, 431834 words/s, in_qsize 15, out_qsize 1
2018-01-08 06:17:34,900 : INFO : PROGRESS: at 76.31% examples, 431750 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:17:35,925 : INFO : PROGRESS: at 77.14% examples, 431873 words/s, in_qsize 15, out_qsize 0
2018-01-08 06:17:36,943 : INFO : PROGRESS: at 78.11% examples, 432043 words/s, in_qsize 16, out_qsize 0
2018-01-08 06:17:37,951 : INFO : PROGRESS: at 79.00% examples, 4

49059199

In [28]:
# get the trained document vector
docvec2 = model.docvecs[0]
docvecsyn2 = model.docvecs.doctag_syn0[0]

# calculate most similar documents
# (we expect the results to be correct)
docsim2 = model.docvecs.most_similar(id_str)

## THIS NOW SHOWS THE BETTER RESULTS AS THE MODEL IS NOW TRAINED
print 'docvec2: %s' % docvec2[:20]
print 'docvecsyn2: %s' % docvecsyn2[:20]
docsim2

docvec2: [-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344 -0.01681295
  0.02698558 -0.18099998  0.23837043  0.08042304 -0.01240299  0.24280906
  0.18474372  0.09306788 -0.28878078  0.06376255  0.08662205  0.16764431
  0.11876337  0.15184912]
docvecsyn2: [-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344 -0.01681295
  0.02698558 -0.18099998  0.23837043  0.08042304 -0.01240299  0.24280906
  0.18474372  0.09306788 -0.28878078  0.06376255  0.08662205  0.16764431
  0.11876337  0.15184912]


[('SENT_17542', 0.9056956768035889),
 ('SENT_2676', 0.8908694982528687),
 ('SENT_21091', 0.8898348808288574),
 ('SENT_18791', 0.8878933191299438),
 ('SENT_13271', 0.8822087049484253),
 ('SENT_14911', 0.881842851638794),
 ('SENT_22648', 0.8797429800033569),
 ('SENT_11599', 0.877663254737854),
 ('SENT_21024', 0.8768661022186279),
 ('SENT_21117', 0.8742849230766296)]

In [32]:
# choose one document
doc = labeled_docs[0].words

# infer vectors with different 'steps' parameters
infervec1 = model.infer_vector(doc, alpha=0.025, min_alpha=0.01, steps=1)
infervec2 = model.infer_vector(doc, alpha=0.025, min_alpha=0.01, steps=10)
infervec3 = model.infer_vector(doc, alpha=0.025, min_alpha=0.01, steps=100)
infervec4 = model.infer_vector(doc, alpha=0.025, min_alpha=0.01, steps=1000)

# print results

# document vector
print('\nDocument vector:')

# we can see that the document vectors do not change after training.
print(docvec1[:5])
print(docvec2[:5])
print(docvecsyn1[:5])
print(docvecsyn2[:5])

# most similar documents
print('\nMost similar:')

# before training, the result is wrong. after training, correct. good.
print(docsim1[:2])
print(docsim2[:2])

# infered vectors with different 'steps' parameters
print('\nInfered vector:')

# we can see that, they are quite different.
print(infervec1[:5])
print(infervec2[:5])
print(infervec3[:5])
print(infervec4[:5])

# norm of inferred vectors
print('\nNorm of infered vector:')

# it seems that, the norm of inferred vectors would be larger for bigger steps
print(np.linalg.norm(infervec1))
print(np.linalg.norm(infervec2))
print(np.linalg.norm(infervec3))
print(np.linalg.norm(infervec4))


Document vector:
[-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344]
[-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344]
[-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344]
[-0.07173967 -0.00501047  0.07555088  0.16436286 -0.36748344]

Most similar:
[('SENT_22468', 0.2260284423828125), ('SENT_16903', 0.21625030040740967)]
[('SENT_17542', 0.9056956768035889), ('SENT_2676', 0.8908694982528687)]

Infered vector:
[ 0.03901744  0.04886983 -0.03233307  0.06236155 -0.16637127]
[-0.01654357  0.08957674  0.05339311  0.14227058 -0.30839854]
[-0.23123467  0.13793825  0.32103819  0.19679461 -0.34883693]
[-0.43890652  0.61093193  0.54345572  0.24288487 -0.09800234]

Norm of infered vector:
1.48904
2.93619
5.82037
10.7574


In [35]:
d = labeled_docs[0]
docsim2

[('SENT_17542', 0.9056956768035889),
 ('SENT_2676', 0.8908694982528687),
 ('SENT_21091', 0.8898348808288574),
 ('SENT_18791', 0.8878933191299438),
 ('SENT_13271', 0.8822087049484253),
 ('SENT_14911', 0.881842851638794),
 ('SENT_22648', 0.8797429800033569),
 ('SENT_11599', 0.877663254737854),
 ('SENT_21024', 0.8768661022186279),
 ('SENT_21117', 0.8742849230766296)]

In [36]:
docs[0]

"Le Marais Bakery is expanding to Ghirardelli Square, and is seeking motivated, enthusiastic baristas to join their team. We offer training and support from Stumptown Coffee Roasters. Part-time and full-time positions available.\n\nCandidates should have a genuine sense of hospitality, and a love and knowledge of baked items and coffee. All positions offer amazing tips and the opportunity to work with a great team, along with appreciative customers and a welcoming atmosphere.\n\nJob Requirements:\nExcel in customer service\nThe ability to thrive in a fast paced environment\nExisting CA Food Handler's Certification or certification within 30 days post-hire\nWork well in a team setting\n\nWith our upcoming expansion to Ghirardelli Square, there are many opportunities for growth and advancement.\n"

In [52]:
docs[21024]

'Line cook/prep'

In [53]:
df_jobs.iloc[21117].description

'Hiring experienced server, line cook, server, and dishwasher'