In [301]:
import pandas as pd
import matplotlib.pyplot as plt
from markdownify import markdownify
import re
import math
import requests
import bs4
from tqdm import trange

In [304]:
 def scrape_indeed_job_ids(query):
    """
    scrape job_ids

    args:
        driver: selenium web driver

    returns:
        array of dictionary objects containing job details
    """
    
    query = query.replace(' ','+')
    
    pc_re = re.compile('\s+Page (?P<current_job_no>\d+) of (?P<job_count>\d+) jobs')

    job_ids = []
    
    page = 0
    page_count = 1

    while page < page_count:
        if page == 0:
            resp = requests.get(f'https://www.indeed.co.uk/jobs?q={query}&limit=50')
            bs = bs4.BeautifulSoup(resp.text, 'html.parser')
            result = pc_re.match(bs.select('#searchCountPages')[0].text)
            
            if result:
                job_count = result.group('job_count')
                page_count = math.ceil(int(job_count)/50)
        else:
            resp = requests.get(f'https://www.indeed.co.uk/jobs?q=data+scientist&limit=50&start={page*50}')
            bs = bs4.BeautifulSoup(resp.text, 'html.parser')
        
        
        job_ids += [div['id'][div['id'].find('_')+1:] for div in bs.select('div.row.result')]
        
        page+=1

    return job_ids

def scrape_indeed_job_details(job_id):
    resp = requests.get(f'https://www.indeed.co.uk/viewjob?jk={job_id}&from=vjs&vjs=1')
    
    job_json = resp.json()

    job = {}

    job['job_id'] = job_id
    job['title'] = job_json['jobTitle']
    job['hiring_organization'] = job_json['sicm']['cmN']
    job['location'] = job_json['jobLocation']
    try:
        job['salary_expectation_average'] = job_json['sEx']['sAvg']
    except:
        job['salary_expectation_average'] = None

    try:
        job['salary_expectation_range'] = job_json['sEx']['sRg']
    except:
        job['salary_expectation_range'] = None
        
    try:
        job['salary_expectation_per'] = job_json['sEx']['sT']
    except:
        job['salary_expectation_per'] = None        
        
    try:
        job['job_type'] = job_json['jts']
    except:
        job['job_type'] = None
    
    resp = requests.get(f'https://www.indeed.co.uk/viewjob?jk={job_id}')
    
    bs = bs4.BeautifulSoup(resp.text, 'html.parser')
    div = bs.find("div", {"id": "jobDescriptionText"})
    job['job_description_html'] = div.encode_contents()
    job['job_description'] = div.text
        
    return job

In [305]:
job_details = []

job_ids = scrape_indeed_job_ids('Data Scientist')

for i in trange(len(job_ids)):
    job_details.append(scrape_indeed_job_details(job_ids[i]))

100%|██████████| 1076/1076 [08:53<00:00,  2.02it/s]


In [309]:
df_job_details = pd.DataFrame(job_details)

In [318]:
df_job_details.groupby('job_id').count()

Unnamed: 0_level_0,title,hiring_organization,location,salary_expectation_average,salary_expectation_range,salary_expectation_per,job_type,job_description_html,job_description
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
009c658cb617c1b8,1,1,1,0,0,0,0,1,1
01079205bf3bbba3,1,1,1,0,0,0,0,1,1
01766661a9ea1372,1,1,1,1,1,1,1,1,1
01c9be92ec76c8d7,1,1,1,0,0,0,0,1,1
01e7cf0455c161d7,1,1,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
ff2c3d057c283fd2,1,1,1,1,1,1,1,1,1
ff2f2aabc2af35a3,1,1,1,0,0,0,0,1,1
ff903f80efaf4da7,1,1,1,1,1,1,1,1,1
ffdad69b595560bd,1,1,1,1,1,1,0,1,1


In [321]:
len(df_job_details.drop_duplicates())

799