In [54]:
from bs4 import BeautifulSoup  # For HTML parsing
import re  # Regular expressions
from time import sleep  # To prevent overwhelming the server between connections
import requests
import pickle
from pymongo import MongoClient, errors
import numpy as np

In [55]:
# Connect to the MongoDB server on the default host/port.
# MongoClient() connects lazily, so constructing it alone does not prove the
# server is reachable; a cheap "ping" forces a round trip so that an
# unreachable server is reported here instead of at the first query.
try:
    client = MongoClient()
    client.admin.command('ping')
    print("Connected successfully!!!")
except errors.ConnectionFailure as e:
    # `errors` is the name imported from pymongo at the top of the notebook;
    # the bare `pymongo` module itself is never imported.
    print("Could not connect to MongoDB: %s" % e)
client

Connected successfully!!!


MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [56]:
# Handle to the database that holds the job postings
db = client["jobs_database"]

In [57]:
# Collection from which the job documents are read below
coll = db["job_collection_all"]

In [58]:
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# Route log records at INFO level and above to the console so gensim's
# progress messages are visible; each record shows time, level and message.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [76]:
# Sanity check: show the title of a single job posting.
# find_one() retrieves just one document (projected down to its title)
# instead of opening a cursor over the whole collection and abandoning it.
first_job = coll.find_one({}, {'title': 1})
if first_job is not None:  # an empty collection prints nothing
    print(first_job['title'])

Echoing Green - Jobs: Fundraising Intern - Apply online


In [59]:
# Collect the description text of every job posting, in cursor order.
# The projection asks MongoDB for the `description` field only, since no
# other field of the documents is read here.
desc = [
    job['description']
    for job in coll.find({}, {'description': 1, '_id': 0})
]

In [60]:
# Word-level n-gram counter: tokens are runs of two or more letters a-z,
# English stop words are dropped, and n-grams span 1 to 8 tokens.
count_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    token_pattern='\\b[a-z][a-z]+\\b',
    ngram_range=(1, 8),
)
# Learn the vocabulary from the job descriptions (the fitted vectorizer is displayed)
count_vectorizer.fit(desc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 8), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [61]:
# Count n-grams per description, then flip the matrix so that rows are
# n-gram terms and columns are documents.
ng_vecs = count_vectorizer.transform(desc).T
ng_vecs.shape

(1327288, 6641)

In [62]:
# Wrap the sparse term-by-document count matrix as a gensim corpus;
# each column of ng_vecs is treated as one document.
corpus = matutils.Sparse2Corpus(ng_vecs, documents_columns=True)

In [63]:
# Show one (n-gram, index) pair from the fitted vocabulary
vocab_entries = iter(count_vectorizer.vocabulary_.items())
sample_entry = next(vocab_entries, None)
if sample_entry is not None:
    print(sample_entry)

('employment authorization program need special assistance', 389012)


In [69]:
# Number of documents in the corpus, found by walking it once
count = sum(1 for _ in corpus)
count

6641

In [64]:
# Invert the vectorizer's vocabulary (n-gram -> index) into index -> n-gram
id2word = {idx: term for term, idx in count_vectorizer.vocabulary_.items()}

## Topic modeling

#### LDA
At this point we can simply plow ahead and create an LDA model. It requires our corpus of n-gram counts, a mapping of row ids to words, and the number of topics (5 here).

In [73]:
# Create lda model (equivalent to "fit" in sklearn).
# LDA training is stochastic, so the seed is pinned with random_state to make
# the learned topics reproducible when the notebook is re-run.
lda = models.LdaModel(
    corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    random_state=42,
)

In [77]:
# Display all 5 topics, each summarized by its 7 highest-weighted terms
lda.print_topics(num_topics=5, num_words=7)

[(0,
  '0.002*job + 0.001*data + 0.001*experience + 0.001*information + 0.001*research + 0.001*roche + 0.001*share'),
 (1,
  '0.005*data + 0.002*analysis + 0.001*job + 0.001*memorial + 0.001*analytics + 0.001*business + 0.001*advanced'),
 (2,
  '0.002*password + 0.002*data + 0.001*email + 0.001*job + 0.001*apply + 0.001*ihs + 0.001*new'),
 (3,
  '0.002*data + 0.001*glassdoor + 0.001*job + 0.001*splunk + 0.001*work + 0.001*learning + 0.001*jobs'),
 (4,
  '0.003*language + 0.002*linguist + 0.002*sign + 0.001*sign language + 0.001*research + 0.001*jobs + 0.001*list')]

#### ------------------------------------------------------------

## TFIDF
TF-IDF replaces each raw term count with a weight that reflects how informative the term is.

The weight is directly proportional to the term's count within a document (term frequency).

The weight is inversely related to the number of documents in the corpus that contain the term (inverse document frequency).

In [83]:
# Fit a TF-IDF model on the raw count corpus (the "fit" step, in sklearn terms)
tfidf = models.TfidfModel(corpus=corpus)

In [84]:
# Apply the fitted TF-IDF model to every document of the original count corpus
# by indexing the model with the corpus (the "transform" step, in sklearn terms).
tfidf_corpus = tfidf[corpus]

In [94]:
from gensim.corpora.dictionary import Dictionary
# Build a gensim Dictionary from the TF-IDF corpus.
# `id2word` (index -> n-gram, derived earlier from count_vectorizer.vocabulary_)
# is reused here instead of re-deriving an identical mapping inline, which
# also avoids using the builtin name `id` as a loop variable.
dictionary = Dictionary.from_corpus(tfidf_corpus, id2word=id2word)

In [None]:
# Fit an LSI space on the TF-IDF corpus (analogous to "fit" in sklearn).
# `dictionary` supplies the mapping of row id to word, and num_topics is the
# number of dimensions to reduce to after the SVD.
lsi = models.LsiModel(
    corpus=tfidf_corpus,
    num_topics=300,
    id2word=dictionary,
)