In [43]:
import requests
from bs4 import BeautifulSoup
import re
import time
from fake_useragent import UserAgent
import random 
from collections import defaultdict
import numpy as np
import pickle
import pandas as pd
import math
import json,pymongo
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.utils.extmath import randomized_svd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.summarization import summarize
from sklearn.decomposition import LatentDirichletAllocation as LDA
import gensim

from nltk.util import ngrams
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
import PyPDF2
%matplotlib inline

Scraping job URLs from Glassdoor.com

In [23]:
def parse_job_url(url):
    """Fetch one search-result page and return the job links found on it.

    Parameters
    ----------
    url : str
        Address of the search-result page to download.

    Returns
    -------
    list
        The ``href`` of the first anchor inside the first ``div`` of every
        ``<li class="jl">`` element, in page order. An empty list is returned
        when the request fails or the page contains no such elements.
    """
    try:
        # A random User-agent header is sent with every request.
        ua = UserAgent()
        user_agent = {'User-agent': ua.random}
        response = requests.get(url, headers=user_agent, timeout=10)
    except Exception as e:
        # Python 3 exceptions have no `.message`; print the exception itself.
        print(e)
        # Without a response there is nothing to parse.
        return []

    soup = BeautifulSoup(response.text, "lxml")

    job_url_list = list()
    for listing in soup.find_all('li', class_='jl'):
        try:
            job_url_list.append(listing.div.a['href'])
        except (AttributeError, TypeError, KeyError):
            # Listing without the expected div/a/href structure: skip it
            # instead of aborting the whole page.
            continue

    return job_url_list

# Getting data scientist job urls for San Jose, Los Angeles and Boston

main_url_list = [
'https://www.glassdoor.com/Job/san-jose-data-scientist-jobs-SRCH_IL.0,8_IC1147436_KO9,23_IP',
'https://www.glassdoor.com/Job/los-angeles-data-scientist-jobs-SRCH_IL.0,11_IC1146821_KO12,26_IP',
'https://www.glassdoor.com/Job/boston-data-scientist-jobs-SRCH_IL.0,6_IC1154532_KO7,21_IP'
]

job_url_list = list()

for main_url in main_url_list:

    # Result pages are addressed as <main_url><page number>.htm; pages 1..31 are requested.
    for n in range(1,32):
        print(n,end='\r')
        url = main_url+str(n)+'.htm'
        page_job_urls = parse_job_url(url)
        job_url_list.extend(page_job_urls)
        # Random 2-8 second pause between requests.
        time.sleep(6*random.random()+2)
        # Test the current page, not the cumulative list: the cumulative list
        # stays non-empty after the first successful page, so a check on it
        # could never detect a later page that yielded nothing.
        if len(page_job_urls) == 0:
            print('Did not extract url')
            break

31

Saving webscraped information to file

In [25]:
# Persist the de-duplicated job urls (order is not preserved by set()).
# NOTE(review): the following cell reads 'data_scientist_urls' (no '2' suffix) - confirm which file is intended.
unique_job_urls = list(set(job_url_list))
with open('/Users/nealcheng/Desktop/data/data_scientist_urls2', 'wb') as url_file:
    pickle.dump(unique_job_urls, url_file)

In [44]:
# Load the pickled url fragments and prefix each one with the site address.
with open('/Users/nealcheng/Desktop/data/data_scientist_urls', 'rb') as url_file:
    relative_job_urls = pickle.load(url_file)

new_job_url_list = ['http://www.glassdoor.com' + path for path in relative_job_urls]

In [45]:
# One NaN placeholder per job url for every scraped field; the scraping loop
# overwrites entry n when posting n is processed.
n_jobs = len(new_job_url_list)

text_list = [np.nan] * n_jobs
company_rating_list = [np.nan] * n_jobs
job_title_list = [np.nan] * n_jobs
city_list = [np.nan] * n_jobs
company_list = [np.nan] * n_jobs
employer_id_list = [np.nan] * n_jobs

Scraping job information from Glassdoor.com

In [None]:
# Indices (into new_job_url_list) of postings whose page could not be
# downloaded or whose description could not be extracted.
failed_extraction_list = list()

for n in range(len(new_job_url_list)):

    url = new_job_url_list[n]
    print(n,end='\r')
    try:
        ua = UserAgent()
        user_agent = {'User-agent':ua.random}
        response = requests.get(url,headers = user_agent,timeout=10)
        # Random 5-8 second pause after each successful request.
        time.sleep(3*random.random()+5)

    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C can still stop the run.
        failed_extraction_list.append(n)
        time.sleep(20*random.random())
        # Skip parsing: `response` would be unbound on the first iteration or
        # still hold the previous posting's page, which would be stored under
        # the wrong index. The placeholders for entry n stay NaN.
        continue

    page = response.text
    soup = BeautifulSoup(page,"lxml")

    # Each field is extracted independently; a missing element yields NaN for
    # that field only.
    try:
        text_body = soup.find('div',id = 'JobDescContainer').div.text
    except Exception:
        text_body = np.nan
        failed_extraction_list.append(n)

    try:
        company_rating = float(soup.find('span',class_ = 'compactStars margRtSm').text.strip())
    except Exception:
        company_rating = np.nan

    try:
        job_title = soup.find('h2').text
    except Exception:
        job_title = np.nan

    try:
        city = soup.find('span',class_='subtle ib').text
    except Exception:
        city = np.nan

    try:
        company = soup.find('span',class_='ib padRtSm').text.strip()
    except Exception:
        company = np.nan

    try:
        employer_id = soup.find('div',id = 'EmpBasicInfo')['data-emp-id']
    except Exception:
        employer_id = np.nan

    text_list[n] = text_body
    company_rating_list[n] = company_rating
    job_title_list[n] = job_title
    city_list[n] = city
    company_list[n] = company
    employer_id_list[n] = employer_id

In [None]:
# Assemble the scraped fields into one frame (one row per job url).
job_records = {
    'Company': company_list,
    'EmployerID': employer_id_list,
    'JobTitle': job_title_list,
    'City': city_list,
    'CompanyRating': company_rating_list,
    'Url': new_job_url_list,
    'Text': text_list,
}
df = pd.DataFrame(data=job_records)

# Remove the dash character from the city text and trim surrounding whitespace.
df['City'] = df['City'].str.replace('–','').str.strip()

# Drop fully duplicated rows, then rows with no job description.
df = df.drop_duplicates()
df = df.dropna(subset=['Text'])
df.head()

Saving dataframe

In [10]:

# Pickle the cleaned job-postings frame. The `with` block closes the file on
# exit, so no explicit f.close() is needed afterwards.
with open('/Users/nealcheng/Desktop/data/data_scientist_jobs', 'wb') as f:
    pickle.dump(df, f)

Opening dataframe

In [11]:
# Reload the pickled job-postings frame.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
df2 = pd.read_pickle('/Users/nealcheng/Desktop/data/data_scientist_jobs')

# Keep one row per distinct description, then one row per distinct url.
df2 = (
    df2
    .drop_duplicates(subset=['Text'])
    .drop_duplicates(subset=['Url'])
)

Creating TF-IDF vectors for the job postings

In [12]:
# Fit the vectorizer on the job descriptions (English stop words removed,
# sublinear term-frequency scaling).
tfidf = TfidfVectorizer(stop_words='english',sublinear_tf=True)

# Densify as a plain ndarray: .todense() returns np.matrix, which recent
# scikit-learn estimators refuse as input.
tfidf_job_desc_data = tfidf.fit_transform(df2['Text']).toarray()

# Subtract the single overall mean of the matrix (a scalar, not a per-column mean).
tfidf_job_desc_data -= np.mean(tfidf_job_desc_data)

Creating a PCA object with 500 components

In [None]:
# Fixed random_state so the projection is reproducible across kernel restarts
# (it only has an effect when a randomized SVD solver is selected).
# NOTE(review): n_components=500 requires at least 500 postings and 500 features - confirm.
pca = PCA(n_components=500, random_state=0)
tfidf_job_desc_pca = pca.fit_transform(tfidf_job_desc_data)

# Cumulative share of variance explained by the first k components.
np.cumsum(pca.explained_variance_ratio_)

Writing a function to open a sample resume I had scraped from Indeed

In [15]:
# Using sample Resume
def open_resume(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Parameters
    ----------
    file : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text of all pages joined without a separator
        (an empty string for a PDF with no pages).
    """
    # The `with` block guarantees the file handle is closed even if parsing
    # fails; extraction happens inside it because the reader needs the file open.
    with open(file, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        number_of_pages = read_pdf.getNumPages()
        page_content = str()
        for n in range(number_of_pages):
            page = read_pdf.getPage(n)
            page_content += page.extractText()
    return page_content

page_content = open_resume('/Users/nealcheng/Desktop/data/Young-Lee.pdf')

Creating the TF-IDF vector for the resume and computing its cosine similarity to each job posting

In [59]:
# Project the resume with the vectorizer and PCA fitted on the job postings.
tfidf_resume_data = tfidf.transform([page_content])
# np.mean of the sparse row is a scalar; .toarray() gives a plain ndarray
# (np.matrix from .todense() is refused by recent scikit-learn).
tfidf_resume_data = tfidf_resume_data.toarray()-np.mean(tfidf_resume_data)
norm_dist_resume_pca = pca.transform(tfidf_resume_data)

# One similarity score per job posting, in df2 row order.
cosine_list = list()

for text in df2['Text']:

    # Reuse the fitted `tfidf`: creating a fresh TfidfVectorizer here would
    # be unfitted (transform fails) and would overwrite the fitted one.
    temp_tfidf_data = tfidf.transform([text])
    # NOTE(review): each document is centred on its own mean here, whereas the
    # matrix used to fit the PCA was centred on the corpus-wide mean - confirm
    # which is intended.
    temp_tfidf_data = temp_tfidf_data.toarray() - np.mean(temp_tfidf_data)

    temp_pca_data = pca.transform(temp_tfidf_data)
    # cosine_similarity returns a 1x1 array for one posting vs. one resume;
    # np.sum collapses it to a scalar.
    cosine_list.append(np.sum(cosine_similarity(temp_pca_data,norm_dist_resume_pca)))
    


In [60]:
# Pair every posting with its similarity score and rank best match first.
# reset_index() keeps the original df2 row label in an 'index' column.
match_scores = pd.DataFrame({
    'Cosine': cosine_list,
    'Url': df2['Url'],
    'Text': df2['Text'],
})
df3 = match_scores.sort_values('Cosine', ascending=False).reset_index()
df3.head()


Unnamed: 0,index,Cosine,Text,Url
0,227,0.222575,"At eHealth, we are passionate about solving ou...",http://www.glassdoor.com/partner/jobListing.ht...
1,818,0.184928,About Optimizely: Our mission is to turn data ...,http://www.glassdoor.com/partner/jobListing.ht...
2,114,0.18174,Data Scientist Tasks include Developing “cor...,http://www.glassdoor.com/partner/jobListing.ht...
3,655,0.181644,Why is This a Great Opportunity? Our client ...,http://www.glassdoor.com/partner/jobListing.ht...
4,827,0.171596,Company DescriptionClient of Roljobs Technolog...,http://www.glassdoor.com/partner/jobListing.ht...


Best matching job position

In [70]:
# Description of the top-ranked posting (row label 0 after the sort and reset_index).
df3.loc[0, 'Text']

" Business Intelligence Senior Analyst (TEMPORARY) ESSENTIAL JOB FUNCTIONS:Using Qlikview to respond to BI requests from throughout the business by delivering reports and visualizations.Create, modify, and enhance Qlikview dashboardsDevelop automated reports which combine data from disparate sources to facilitate day-to-day business operationsSetup data Extract-Transform-Load (ETL) processes within the Business Intelligence tool to combine various data sourcesPresent, educate, and champion the use of business intelligence tools to gain business insights across all levels of the organization.Assist with miscellaneous reportsMINIMUM QUALIFICATIONS:Expertise using Qlikview, Oracle BI, and SQL Plus.4+ years of experience in a Business Intelligence Analyst role or equivalent talent with dataExpertise with Excel and SQL, to include developing and maintaining advanced queries, pivot tables, and data modelsExpertise developing advanced data extracts and data transformationsMust possess strong 

Resume text

In [71]:
# Display the resume text returned by open_resume, for comparison with the posting above.
page_content

"Young Lee\nData Scientist - Allstate\nDublin, CA\n-\nEmail me on Indeed: \nindeed.com/r/Young-Lee/109dcde6f5b08add\nData scientist with experience in property & casualty personal lines insurance and academic background in\nstatistics and economics.\nModeling\n-- Regularized GLMs (LASSO, Ridge, Elastic net), Gradient Boosting Machines, Generalized Additive Models\n-- Insurance loss modeling using Tweedie regressions\n-- Longitudinal survey data analysis\nComputing/Programming\n-- R, SQL, Linux, Python, Stata\nWilling to relocate to: San Francisco, CA - San Jose, CA - Menlo Park, CA\nAuthorized to work in the US for any employer\nWORK EXPERIENCE\nData Scientist\nAllstate\n \n-\n \nChicago, IL\n-\nAugust 2015 to Present\nBuilt and deployed XGBoost model to predict likelihood of a policy having an undisclosed driver,\npotentially saving company in excess of $3,200,000/year\nŁ Performed predictive modeling of auto insurance losses and life insurance mortality using elastic net\nregularized