# UK Data Science Job Analysis

## Objective

To analyse data scraped from Indeed.co.uk for jobs matching the query "Data Scientist", aiming to answer the following questions:

1. Which are the most requested skills/technologies for a Data Scientist position?
2. What kind of salary/day rate can be expected?
3. Do roles tend to be permanent or contract/temporary roles?

In [381]:
import pandas as pd
import matplotlib.pyplot as plt
from markdownify import markdownify
import re
import math
import requests
import bs4
from tqdm import trange

In [386]:
 def scrape_indeed_job_ids(query):
    """
    Run a job search query on the Indeed.co.uk website and return the ids of the matching jobs.

    The first results page is requested with limit=50 (50 at a time was found to be the
    maximum). Its "Page X of Y jobs" counter is used to work out how many pages exist, and
    the remaining pages are then requested with an increasing `start` offset.

    The job ids can be used in later calls to gather job details such as job title,
    expected salary, skills etc.

    args:
        query: the search query for jobs that you wish to run e.g. "Data Scientist"

    returns:
        list of str which are the unique job_ids, in the order they were first seen
    """
    results_per_page = 50

    query = query.replace(' ', '+')

    # Raw string so that \s and \d reach the regex engine untouched. Commas are
    # accepted in the job count in case it is rendered with thousands separators.
    pc_re = re.compile(r'\s+Page (?P<current_job_no>\d+) of (?P<job_count>\d[\d,]*) jobs')

    job_ids = []

    page = 0
    page_count = 1  # revised once the first page reports the total job count

    while page < page_count:
        # Every page must use the caller's query; only the offset changes.
        url = f'https://www.indeed.co.uk/jobs?q={query}&limit={results_per_page}'
        if page > 0:
            url += f'&start={page * results_per_page}'

        resp = requests.get(url)
        bs = bs4.BeautifulSoup(resp.text, 'html.parser')

        if page == 0:
            # The counter element may be absent (e.g. no results); in that case
            # only this first page is processed.
            counters = bs.select('#searchCountPages')
            result = pc_re.match(counters[0].text) if counters else None

            if result:
                job_count = int(result.group('job_count').replace(',', ''))
                page_count = math.ceil(job_count / results_per_page)

        # The job id is whatever follows the first underscore in the div's id attribute.
        job_ids += [div['id'].split('_', 1)[-1] for div in bs.select('div.row.result')]

        page += 1

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(job_ids))

def _nested_get(data, *keys):
    """Return data[k1][k2]... or None when a level is missing or cannot be indexed."""
    try:
        for key in keys:
            data = data[key]
    except (KeyError, IndexError, TypeError):
        return None
    return data


def scrape_indeed_job_details(job_id):
    """
    For the given job_id scrape the specific job details using the first json call.

    Next request the full page for the job_id to enable scraping of the job description
    plain text and html.

    args:
        job_id: the Indeed job id (the `jk` url parameter) to look up

    returns:
        dict with the keys job_id, json, title, hiring_organization, location,
        salary_expectation_average, salary_expectation_range, salary_expectation_per,
        job_type, skills_de, job_description_html and job_description.
        The salary, job type and skills values are None when the json response does not
        contain them; the two description values are None when the job page has no
        `jobDescriptionText` element.

    raises:
        KeyError if the json response lacks the title, hiring organisation or location.
    """
    resp = requests.get(f'https://www.indeed.co.uk/viewjob?jk={job_id}&from=vjs&vjs=1')

    job_json = resp.json()

    job = {
        'job_id': job_id,
        # Keep the raw response (stringified) alongside the extracted fields.
        'json': str(job_json),
        # These three are treated as mandatory: a missing key raises.
        'title': job_json['jobTitle'],
        'hiring_organization': job_json['sicm']['cmN'],
        'location': job_json['jobLocation'],
        # The remaining fields are optional and default to None when absent.
        'salary_expectation_average': _nested_get(job_json, 'sEx', 'sAvg'),
        'salary_expectation_range': _nested_get(job_json, 'sEx', 'sRg'),
        'salary_expectation_per': _nested_get(job_json, 'sEx', 'sT'),
        'job_type': _nested_get(job_json, 'jts'),
        'skills_de': _nested_get(job_json, 'dem', 'de'),
    }

    resp = requests.get(f'https://www.indeed.co.uk/viewjob?jk={job_id}')

    bs = bs4.BeautifulSoup(resp.text, 'html.parser')
    div = bs.find("div", {"id": "jobDescriptionText"})

    if div is None:
        # No description block on the page: record that rather than aborting the
        # whole scrape with an AttributeError.
        job['job_description_html'] = None
        job['job_description'] = None
    else:
        job['job_description_html'] = div.encode_contents()
        job['job_description'] = div.text

    return job

In [None]:
# Collect the ids of every "Data Scientist" listing, then fetch the details of
# each job in turn; trange draws a progress bar over the loop.
job_details = []

job_ids = scrape_indeed_job_ids('Data Scientist')

for idx in trange(len(job_ids)):
    details = scrape_indeed_job_details(job_ids[idx])
    job_details.append(details)

 53%|█████▎    | 432/819 [02:46<02:28,  2.60it/s]