# Job search and application

In [101]:
# import libraries

import numpy as np
import pandas as pd
import regex as re # to clean text
import os #why?
import time # to save csv to today's date
import requests
from bs4 import BeautifulSoup #to get data out of html or xml
from collections import Counter # to count keyword appearance

In [102]:
# Read the list of job-posting URLs, one per line.
# The URLs were copied with the "Copy All Urls" Chrome extension: while going
# through the saved-search results, Cmd+Enter opens each interesting job in a new tab.

with open('./urls.txt', "r") as urls_file:
    # Strip the trailing newline (and any stray whitespace) so each entry is a
    # clean URL, and drop blank lines, which are not valid URLs.
    urls = [line.strip() for line in urls_file if line.strip()]

In [103]:
# HTTP headers passed along with each request: a generic browser-style User-Agent.

hdr = {
    'User-Agent': 'Mozilla/5.0',
}

In [104]:
# Download every URL, report the ones that did not succeed, and parse each page.
# `res` and `soup` stay index-aligned with `urls`, which later cells rely on.

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server blocks the run forever

res = []
for url in urls:
    try:
        # strip() removes the trailing newline left by readlines()
        res.append(requests.get(url.strip(), headers=hdr, timeout=REQUEST_TIMEOUT))
    except requests.RequestException as exc:
        # Keep a placeholder so positions still match `urls`, instead of aborting
        # the whole batch because one host is unreachable.
        print('request failed for {}: {}'.format(url.strip(), exc))
        res.append(None)

# check the request is successful: print position and status of every non-200 response
for i, response in enumerate(res):
    if response is not None and response.status_code != 200:
        print(i, response.status_code)

# get the content of each url
# soup is a list of BeautifulSoup objects; a failed request yields an empty document,
# and non-200 responses are still parsed so the list length matches `urls`.
soup = [
    BeautifulSoup(response.content if response is not None else b'', 'lxml')
    for response in res
]

128 403
162 410


In [105]:
# Collect one job-description element per parsed page.
# The site-specific <div> lookups are tried in priority order and the first hit
# is kept; when none of them matches, the dice.com lookup result is stored as-is
# (which is None when that lookup finds nothing either).

_jd_div_attrs = [
    {'class': 'views-field views-field-field-job-description'},               # VentureFizz
    {'class': 'description__text description__text--rich'},                   # Linkedin
    {'class': 'jobsearch-JobComponent icl-u-xs-mt--sm'},                      # indeed.com
    {'id':'JobDescriptionContainer','class': 'gdGrid tabSection p-std mt-0'}, # Glassdoor?
]
_dice_div_attrs = {'id':'jobdescSec','class': 'highlight-black'}              # dice.com

jd = []
for page in soup:
    for div_attrs in _jd_div_attrs:
        element = page.find('div', div_attrs)
        if element:
            break
    else:
        # no earlier selector matched: fall back to the dice.com layout
        element = page.find('div', _dice_div_attrs)
    jd.append(element)

## Improvement point: 
- skip problematic jobs/lines

In [106]:
# Split each job description into a list of words.
# If a job description could not be read (no matching element was found, so the
# entry in `jd` is None), report its position: the post has probably expired.
# Such a post gets an empty word list, so it scores 0 instead of borrowing the
# text (and score) of a neighbouring post, and `job_desc` stays aligned with `jd`.

job_desc = []  # list of word lists, one per job description
for i, posting in enumerate(jd):
    if posting is None:
        print('error on {}'.format(i))
        job_desc.append([])
        continue
    # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ and the backtick
    job_desc.append(re.findall(r'[A-Za-z]+', posting.text))

error on 12
error on 26
error on 35
error on 67
error on 69
error on 74
error on 78
error on 108
error on 113
error on 115
error on 117
error on 124
error on 126
error on 128
error on 153
error on 155
error on 162


In [107]:
# Lower-case every word so it can be matched against the (lower-case) keyword dictionary.
# Slice assignment keeps the same outer list object, as the original loop did.

job_desc[:] = [[token.lower() for token in tokens] for tokens in job_desc]

In [108]:
# Weight of each keyword; a job's score is the sum of the weights of every
# keyword occurrence found in its description.
#
# NOTE(review): descriptions are tokenized into letter-only words, so the
# hyphenated key 're-ignite' cannot match a token as written -- confirm the
# intended spelling. 'fp' presumably comes from "FP&A" -- confirm.

keywords = {
    # weight 10
    'cfa': 10, 'arabic': 10, 'french': 10, 'returnship': 10, 're-ignite': 10, 'relaunch': 10, 'boston': 10,
    # weight 7
    'mba': 7, 'python': 7,
    # weight 5
    'fp': 5, 'analytics': 5, 'analysis': 5, 'analyze': 5, 'analyzes': 5, 'analytical': 5, 'analyzing': 5,
    'tableau': 5, 'sql': 5, 'excel': 5,
    # weight 3
    'finance': 3, 'financial': 3, 'financials': 3,
    # weight 2
    'data': 2, 'dataset': 2, 'datasets': 2,
    # 'projections' was listed twice; the second entry is taken to be the singular form
    'predict': 2, 'projection': 2, 'projections': 2, 'predictive': 2,
    'forecast': 2, 'forecasting': 2, 'forecasts': 2,
    'valuation': 2, 'valuations': 2,
    'model': 2, 'models': 2, 'modeling': 2, 'decisions': 2,
    # weight 1
    'trend': 1, 'trends': 1, 'plan': 1, 'plans': 1, 'planning': 1, 'budget': 1,
    'metric': 1, 'insight': 1, 'insights': 1,
}

In [109]:
# Match each job description against the keyword dictionary.
# keywords_lol holds, per job, the words that are keys of `keywords`,
# in their original order and with repeats kept.

keywords_lol = [
    [token for token in tokens if token in keywords]
    for tokens in job_desc
]

In [110]:
# Turn every matched keyword into its weight and add the weights up.
# scores_lol holds one total score per job (0 when nothing matched).

scores_lol = [
    sum(keywords[match] for match in matches)
    for matches in keywords_lol
]

In [111]:
# Count how many times each keyword appears in each job post.
# d holds one Counter per job, in the same order as keywords_lol.
d = [Counter(matches) for matches in keywords_lol]

In [112]:
# Start from an empty frame that only defines the two result columns.
job_columns = ['link', 'appeal']
jobs = pd.DataFrame(columns=job_columns)

In [113]:
# Remove newline characters from every URL.
urls = [link.replace('\n',"") for link in urls]

In [114]:
# Populate the frame: one row per URL with its score; any missing value becomes 0.
jobs_populated = jobs.assign(appeal=scores_lol, link=urls)
jobs = jobs_populated.fillna(0)

In [115]:
# Drop rows that repeat an earlier row's link AND appeal (the first occurrence is kept).
# Links are compared as exact strings, so the same posting reached through
# different tracking parameters is not treated as a duplicate.
dedupe_columns = ['link','appeal']
jobs = jobs.drop_duplicates(subset=dedupe_columns, keep='first')

In [116]:
# Add the application-status column; every job starts out as not applied to.
initial_status = 'not yet applied'
jobs['application'] = initial_status

In [117]:
# Show full cell contents so long URLs are not truncated when the frame is displayed.
pd.options.display.max_colwidth = None

In [118]:
# Sanity check: display the first five rows of the frame.
jobs.head(n=5)

Unnamed: 0,link,appeal,application
0,"https://www.indeed.com/viewjob?jk=6c659eccabf4a45b&q=Python+Analytics+Data+Finance+MBA+excel+(analysis+or+Analyst+or+Analyses+or+Analytical+or+Analyzing+or+Dataset+or+Datasets+or+Financial+or+Financials+or+Decisions+or+Insight+or+Insights+or+Analytics+or+Analysis+or+Analyst+or+Analyses+or+Analytical+or+Anal&l=Brookline,+MA&tk=1e49ospml32vi802&from=ja&alid=5dcf23fc803d2451183c3f08&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails&rgtk=1e49ospml32vi802",197,not yet applied
1,https://www.indeed.com/viewjob?jk=011b15460b8771f7&q=Python+Analytics+Data+Finance+MBA+excel+(analysis+or+Analyst+or+Analyses+or+Analytical+or+Analyzing+or+Dataset+or+Datasets+or+Financial+or+Financials+or+Decisions+or+Insight+or+Insights+or+Analytics+or+Analysis+or+Analyst+or+Analyses+or+Analytical+or+Anal&l=United+States&tk=1e3npd4df36v0800&from=ja&alid=5dcf2719803d2451183c4152&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails&rgtk=1e3npd4df36v0800,101,not yet applied
2,https://www.indeed.com/viewjob?jk=f83badafea23ce6c&q=Python+Analytics+Data+Finance+MBA+excel+(analysis+or+Analyst+or+Analyses+or+Analytical+or+Analyzing+or+Dataset+or+Datasets+or+Financial+or+Financials+or+Decisions+or+Insight+or+Insights+or+Analytics+or+Analysis+or+Analyst+or+Analyses+or+Analytical+or+Anal&l=United+States&tk=1e4rrvhv02ta4800&from=ja&alid=5dcf2719803d2451183c4152&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails&rgtk=1e4rrvhv02ta4800,160,not yet applied
3,https://www.indeed.com/viewjob?jk=07f9fbc2899778d1&q=python+analytics+data+finance+SQL+(analysis+or+analyze+or+analyses+or+analytical+or+analyzing+or+dataset+or+datasets+or+financial+or+financials+or+decisions+or+insight+or+insights+or+analytics+or+analysis+or+analyze+or+analyses+or+analytical+or+analyzing+&l=Brookline&tk=1e4rs41pi381h801&from=ja&advn=1343250976351098&adid=315897298&sjdu=i6xVERweJM_pVUvgf-MzuVJvpP-MaR_hWGGPPJrzQ5gblBxFjctZ0k8ChKLFXuLpEeQtSDbhkItY20XhSzlNdtEibdyPxFKZ-0PGAEnx3WY&acatk=1e4sf7dpahdmg800&pub=e4ea14867e765046&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails,185,not yet applied
6,https://www.linkedin.com/jobs/view/1749285376/?eBP=NotAvailableFromVoyagerAPI&refId=49160274-dcd6-4cfe-bc95-303d1c4fee85&trk=d_flagship3_search_srp_jobs,76,not yet applied


In [119]:
# Save the frame to a CSV file named after today's date (YYYYMMDD).
today_stamp = time.strftime("%Y%m%d")
jobs.to_csv('./jobs_{}.csv'.format(today_stamp))

In [120]:
# Show the top 5 jobs: application status ascending, then highest appeal first.
sort_columns = ['application', 'appeal']
jobs.sort_values(by=sort_columns, ascending=[True, False]).head()

Unnamed: 0,link,appeal,application
36,https://www.linkedin.com/jobs/view/1810167392/?eBP=NotAvailableFromVoyagerAPI&recommendedFlavor=COMPANY_RECRUIT&refId=708128e9-d603-46a5-aaa9-6656059de09c&trk=d_flagship3_search_srp_jobs,314,not yet applied
95,https://www.indeed.com/viewjob?jk=f51398bd7dd0474c&q=Python+Analytics+Data+Finance+MBA+excel+(analysis+or+Analyst+or+Analyses+or+Analytical+or+Analyzing+or+Dataset+or+Datasets+or+Financial+or+Financials+or+Decisions+or+Insight+or+Insights+or+Analytics+or+Analysis+or+Analyst+or+Analyses+or+Analytical+or+Anal&l=United+States&tk=1e5dr7617283p806&from=ja&advn=782336216336969&adid=329327108&sjdu=7BeRdT5uW8zd9J94pO4lsGI0Ja_kBs24iFGcI7wf15q550Mg1xAZesiZ5AabnRzOHq-Q0lbAQs6iicjzkHTf5wfanGPXGgjv-Gh8FMmu3qcMJJq_WWRfP6XhTeSU8Q4g&acatk=1e5h2dgsb4sv7800&pub=e4ea14867e765046&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails,298,not yet applied
20,https://www.linkedin.com/jobs/view/1806883960/?eBP=NotAvailableFromVoyagerAPI&refId=8837b68a-ed40-4b8b-9ead-5c2b2596e8e0&trk=d_flagship3_search_srp_jobs,242,not yet applied
64,https://www.linkedin.com/jobs/view/1806883960/?eBP=CwEAAAFxRS9HLsz6PcFWwrY6Soxli4AEf9V3sdQPqrDT2jSdQfMmSGpUI3ulV6E3awuBwPtDhB-YJyS3Zqz7uKw76z3qVRudYgVcSYOUrb68_os8q_D7T6zl7rT3n3E5Ckw0ZOIA9a0JV52MuSXGulsdByssu7-D3i2NvyE69qJdjjV34d4uxRvLWxkAmULvRqwMmfL3xPBGKvSzUVTNWRuzsyffElxaM1AY-wWfLXhdf4zoPyHI0EShRs1HuB0TXConUNSzRFzIKy9RjxpM_uIrglKT0tDcrh0YvqhiTf-om2caux0wU6Tvw387vRg-TfiP9_Tlz-K1eEuZ2UB3frz9vHy3zQJoz4uXLtrP3b0xdahPbxWPTzMHMZAgp1v4uHL2LIyygyerfg16JaU&refId=e2667841-283d-4c9a-ba7b-afaa25c979d8&trk=d_flagship3_search_srp_jobs,242,not yet applied
73,https://www.linkedin.com/jobs/view/1806883960/?eBP=JOB_SEARCH_ORGANIC&refId=7f1de094-9b9d-47cf-a3c4-7257d67a64fb&trk=d_flagship3_search_srp_jobs,242,not yet applied


## Refresh point

In [172]:
# As you review the jobs, record the outcome by row label:
# 'yes' for jobs applied to, 'x' for jobs you do not qualify for.

applied_rows = [165,129,111,138,125,131,112,104,105,40,94,93,71,90,0,3,18,20,57,60,64,65,73,95]
not_qualified_rows = [99,168,153,152,134,140,173,35,141,176,171,167,166,169,108,174,162,175,147,126,155,154,113,158,149,96,137,135,103,106,133,122,118,121,77,70,62,14,6,88,84,72,74,89,87,91,82,78,83,1,2,11,12,13,15,19,21,24,26,27,28,31,36,48,61,66,67,68,69]

jobs.loc[applied_rows,'application'] = 'yes'
jobs.loc[not_qualified_rows,'application'] = 'x'



In [173]:
# Rank jobs by application status (ascending), then by score (descending),
# and keep the single best row.
ranked_jobs = jobs.sort_values(by=['application', 'appeal'], ascending=[True, False])
top_job = ranked_jobs.head(1)
top_job

Unnamed: 0,link,appeal,application
164,https://www.indeed.com/viewjob?jk=011b15460b8771f7&from=ja&exp=1&utm_campaign=job_alerts&utm_medium=email&utm_source=jobseeker_emails&rgtk=1e35ptlc33cr1805,101,not yet applied


In [174]:
# Before applying, read the top job's description and search it (Cmd+F) for
# required skills, experience, etc. Newlines are replaced with spaces.
# To inspect a different posting, index jd with that row's label, e.g. jd[128].text
top_label = top_job.index[0]
top_description = jd[top_label].text
top_description.replace('\n'," ")

"This job has expired on IndeedReasons could include: the employer is not accepting applications, is not actively hiring, or is reviewing applicationsBusiness Analyst IAmazon.com Services LLC-Seattle, WA                 try {                     window.mosaic.onMosaicApiReady(function() {                         var zoneId = 'aboveExtractedJobDescription';                         var providers = window.mosaic.zonedProviders[zoneId];                          if (providers) {                             providers.filter(function(p) { return window.mosaic.lazyFns[p]; }).forEach(function(p) {                                 return window.mosaic.api.loadProvider(p);                             });                         }                     });                  } catch (e) {};                                  try {                     window.mosaic.onMosaicApiReady(function() {                         var zoneId = 'aboveFullJobDescription';                         var providers = window.m

## Improvement points:
- show a more user friendly listing (maybe as it shows in the website)
- extract the parts of the text where words such as "experience", "require", etc. are mentioned
- extract the city and job title (all this in a DF, maybe)

In [79]:
# Keyword counts for the top job, most frequent first ... to customize R/CL
top_label = top_job.index[0]
top_keyword_counts = pd.DataFrame.from_dict(d[top_label], orient='index')
top_keyword_counts.sort_values(by=0, ascending=False)

Unnamed: 0,0
financial,8
data,6
analytics,5
models,4
forecast,4
modeling,3
tableau,2
sql,2
excel,2
analytical,2


### back to refresh point