In [1]:
from bs4 import BeautifulSoup  # For HTML parsing
import re  # Regular expressions
from time import sleep  # To prevent overwhelming the server between connections
import requests
import pickle
from pymongo import MongoClient, errors
import numpy as np

In [2]:
try:
    client = MongoClient()
    print ("Connected successfully!!!")
except pymongo.errors.ConnectionFailure:
    print ("Could not connect to MongoDB: %s" % e)
client

Connected successfully!!!


MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [3]:
db = client.jobs_database

In [4]:
coll = db.job_collection_all

In [5]:
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
for job in coll.find():
    print(job['title'])
    break

Echoing Green - Jobs: Fundraising Intern - Apply online


In [6]:
desc = []
for job in coll.find():
    desc.append(job['description'])

In [8]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(analyzer='word',
                                  ngram_range=(1, 8), stop_words='english',
                                  token_pattern='\\b[a-z][a-z]+\\b')
count_vectorizer.fit(desc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 8), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
ng_vecs = count_vectorizer.transform(desc).transpose()
ng_vecs.shape

(1327288, 6641)

In [10]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(ng_vecs)

In [11]:
for item in count_vectorizer.vocabulary_.items():
    print(item)
    break

('graduation instruction news events calendars personal info', 505290)


In [12]:
count = 0
for item in corpus:
    count += 1
count

6641

In [14]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

NameError: name 'count_vectorizer' is not defined

## Topic modeling

#### LDA
At this point we can simply plow ahead in creating an LDA model. It requires our corpus of word counts, mapping of row ids to words, and the number of topics (3).

In [28]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus, id2word=id2word, num_topics=5, passes=10)

In [29]:
lda.print_topics(num_words=20, num_topics=5)

[(0,
  '0.009*linguist + 0.007*language + 0.007*data + 0.006*earnest + 0.006*self + 0.005*research + 0.005*job + 0.004*service + 0.004*caesars + 0.004*hart + 0.004*linguistics + 0.004*careers + 0.003*site + 0.003*career + 0.003*ntid + 0.003*sign + 0.003*st + 0.003*nmlsconsumeraccess + 0.003*nmls + 0.003*list'),
 (1,
  '0.039*loading + 0.015*wait + 0.012*content + 0.010*page + 0.002*requirements + 0.002*resume + 0.002*navy + 0.002*experience + 0.002*aperture + 0.002*documents + 0.002*data + 0.002*youll + 0.002*usajobs + 0.002*gov + 0.002*information + 0.002*application + 0.002*scientist + 0.002*www + 0.002*experienced + 0.001*apply'),
 (2,
  '0.011*data + 0.008*glassdoor + 0.006*job + 0.006*roche + 0.005*analysis + 0.005*memorial + 0.004*sequencing + 0.004*analytics + 0.003*machine + 0.003*healthcare + 0.003*santa + 0.003*design + 0.003*analytical + 0.003*share + 0.003*experience + 0.003*learning + 0.003*mining + 0.003*work + 0.003*search + 0.003*software'),
 (3,
  '0.013*ihs + 0.010*mi

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Create a CountVectorizer for parsing/counting words
tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1,1), stop_words='english',
                                  token_pattern='\\b[a-z][a-z]+\\b')

In [24]:
tfidf_vectorizer.fit(desc)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
ng_vecs = tfidf_vectorizer.transform(desc).transpose()
ng_vecs.shape

(18271, 6641)

In [26]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(ng_vecs)

In [27]:
id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())

#### ------------------------------------------------------------

## TFIDF
to include weights in words based on their frequency.

this is directly proportional to term count within documents

inverse proportional to term count over all docs

In [16]:
# Create a TFIDF transformer from our word counts (equivalent to "fit" in sklearn)
tfidf = models.TfidfModel(corpus)

In [17]:
# Create a TFIDF vector for all documents from the original corpus ("transform" in sklearn)
tfidf_corpus = tfidf[corpus]

In [30]:
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary.from_corpus(corpus,
                                    id2word=dict((id, word) for word, id in tfidf_vectorizer.vocabulary_.items()))

In [31]:
# Build an LSI space from the input TFIDF matrix, mapping of row id to word, and num_topics
# num_topics is the number of dimensions to reduce to after the SVD
# Analagous to "fit" in sklearn, it primes an LSI space
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=300)

In [33]:
# Retrieve vectors for the original tfidf corpus in the LSI space ("transform" in sklearn)
lsi_corpus = lsi[corpus]

In [34]:
# Dump the resulting document vectors into a list so we can take a look
doc_vecs = [doc for doc in lsi_corpus]

In [35]:
# Create an index transformer that calculates similarity based on our space
index = similarities.MatrixSimilarity(doc_vecs)



In [126]:
len(index[doc_vecs[1]])

6641

In [133]:
# Return the sorted list of cosine similarities to the first document
sims = sorted(enumerate(index[doc_vecs[1]]), key=lambda item: -item[1])
sims

[(1, 1.0),
 (534, 0.81351137),
 (785, 0.54778659),
 (451, 0.41783684),
 (10, 0.37823147),
 (15, 0.37576747),
 (229, 0.37576747),
 (44, 0.35867202),
 (317, 0.32694039),
 (232, 0.31805056),
 (153, 0.3164387),
 (37, 0.31203187),
 (69, 0.302993),
 (11, 0.27897072),
 (287, 0.26831579),
 (178, 0.26621825),
 (359, 0.25923824),
 (578, 0.25619298),
 (104, 0.25546476),
 (111, 0.25461599),
 (658, 0.23985961),
 (353, 0.23848091),
 (100, 0.22831631),
 (25, 0.22110189),
 (164, 0.22003724),
 (560, 0.21809895),
 (607, 0.21809895),
 (661, 0.21699187),
 (453, 0.21452421),
 (433, 0.21380045),
 (29, 0.21338686),
 (31, 0.21338686),
 (3414, 0.2116355),
 (3445, 0.2116355),
 (3451, 0.2116355),
 (3491, 0.2116355),
 (3505, 0.2116355),
 (3534, 0.2116355),
 (3556, 0.2116355),
 (3563, 0.2116355),
 (3568, 0.2116355),
 (3574, 0.2116355),
 (3728, 0.2116355),
 (3752, 0.2116355),
 (3845, 0.2116355),
 (3883, 0.2116355),
 (3897, 0.2116355),
 (4328, 0.2116355),
 (4415, 0.2116355),
 (4476, 0.2116355),
 (4489, 0.2116355),
 

In [218]:
desc[10]

'bluecore   jobs  data scientist   apply online all jobs  data scientist data scientist new york  united states  data science descriptionwe re looking for a highly motivated data scientist to join bluecore at the intersection of two teams  data science and marketing  your job would be to mine  explore  analyze  and distill into insights terabytes of behavioral data covering user behavior across 100s of website as well as their engagement with digital and email marketing campaigns  we re looking for someone with a methodological approach that respects the integrity of the data  is meticulous in their approach  and can be their own devils advocate  your findings will be used to create marketing and sales content that will have a direct impact on increasing bluecores visibility  generating sales leads  closing deals  and ensuring retention requirements bs in technical fields such as engineering  computer science  mathematics  physics  economics  statistics  business 1 3 years of analytics

In [135]:
desc[1]

'data scientist for startup   averity view all jobs visit our website data scientist for startup averity   new york  ny we are a rapidly growing startup in the marketing tech world looking for a data scientist to join our newly created data science team  what s the job we can promise you that your work with us will never be boring  as a data scientist in our data science team  you will be using scientific techniques  such as machine learning  predictive modeling  and data mining to analyze user behavior  you will report your finding to management and ultimately  your work will impact our company s future success who are we we are a marketing tech start up based in new york city  we are backed by major venture firms and rapidly growing  with office across the country  our newly created data science team works out of our new york city office located in the flatiron area of manhattan compensation 100 000    135 000full benefitswhat skills do you needmachine learning techniquesknowledge of

In [212]:
text_blobs = [desc[1], desc[229], desc[10], desc[534]]

In [205]:
len(text_blobs)

3

In [211]:
count = 0
for j in coll.find():
    print(count, j['url'])
    count += 1
    if count > 563:
        break

0 http://www.indeed.com/rc/clk?jk=3a736307116dd50a&fccid=d724b507863bb5b1
1 http://www.indeed.com/rc/clk?jk=82385b59d57c2166&fccid=39407476605079e0
2 http://www.indeed.com/rc/clk?jk=ca5a2c136eaccc1d&fccid=181ae1a6390e21b5
3 http://www.indeed.com/rc/clk?jk=4121a8a5ec79dbfc&fccid=f65aedcd2de292bf
4 http://www.indeed.com/rc/clk?jk=cde086027099b03b&fccid=4e9c4ad9d973d979
5 http://www.indeed.com/rc/clk?jk=ba072e94de1a5dba&fccid=e127f4594cdf24f4
6 http://www.indeed.com/rc/clk?jk=762dcebab0a10de3&fccid=61bb2b2cdd4507ae
7 http://www.indeed.com/rc/clk?jk=38761288da163e7f&fccid=ad9a355de43f962a
8 http://www.indeed.com/rc/clk?jk=1be48ee4132670bf&fccid=be3b11aa573faee7
9 http://www.indeed.com/rc/clk?jk=c9961fe7f2f0bdcc&fccid=b445cea762cc7d0d
10 http://www.indeed.com/rc/clk?jk=ad9f0dbd2739c2d8&fccid=e032d49a262c01c8
11 http://www.indeed.com/rc/clk?jk=251ce788e98a5403&fccid=9b4cf72eb101bebf
12 http://www.indeed.com/rc/clk?jk=eba548b4f6efc67b&fccid=c9760fb6d1cce259
13 http://www.indeed.com/rc/clk?jk=

In [99]:
import webarticle2text

In [219]:
len(new_list)

910

In [213]:
# Create some test text blobs to compare pairwise
#text_blobs = ['startup']
# Get matrix of counts
test_vecs = tfidf_vectorizer.transform(text_blobs).transpose()
# Convert to gensim corpus
test_corpus = matutils.Sparse2Corpus(test_vecs)
# TFIDF transformation
test_tfidf = tfidf[test_corpus]
# LSI transformation
test_lsi = lsi[test_tfidf]

In [214]:
# Index our test text blobs
test_index = similarities.MatrixSimilarity(test_lsi)



In [215]:
text_blobs[1][:70]

'data scientist   well funded startup job in new york  ny   jobspring p'

In [216]:
# For each test text blob that we're looking at
for i, sims in enumerate(test_index):
    # We get a list of similarities to all indexed text blobs
    # Print the text blob we're currently examining
    print ("Similarities to {}:".format(text_blobs[i][:70]))
    # Print the similarities of the current blob to all others with labels
    sims_with_labels = [(score, text_blobs[j][:70]) for j, score in enumerate(sims)]
    # Sort the results by decreasing similarity and print them out
    sorted_sims_with_labels = sorted(sims_with_labels, reverse=True)
    print (sorted_sims_with_labels)
    print ('\n')

Similarities to data scientist for startup   averity view all jobs visit our website d:
[(0.99999976, 'data scientist for startup   averity view all jobs visit our website d'), (0.794743, 'senior data scientist  client facing    averity view all jobs visit ou'), (0.18097329, 'bluecore   jobs  data scientist   apply online all jobs  data scientis'), (0.082620971, 'data scientist   well funded startup job in new york  ny   jobspring p')]


Similarities to data scientist   well funded startup job in new york  ny   jobspring p:
[(0.99999958, 'data scientist   well funded startup job in new york  ny   jobspring p'), (0.082620971, 'data scientist for startup   averity view all jobs visit our website d'), (0.056406535, 'senior data scientist  client facing    averity view all jobs visit ou'), (-0.016373284, 'bluecore   jobs  data scientist   apply online all jobs  data scientis')]


Similarities to bluecore   jobs  data scientist   apply online all jobs  data scientis:
[(1.0, 'bluecore   jobs