In [339]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
import pandas as pd

## Scraping INDEED to get job listings.

In this notebook I scrape the Indeed website for Data Scientist, Data Analyst and Data Engineer jobs in key cities in the US, UK, Singapore (SG), Hong Kong (HK), Australia (AU) and Canada (CA). I have split the scraper into 3 functions (extractor, params, and scraper), which together return a pandas dataframe object for each country being queried.

---

The main function is the __scraper__ function. It visits the URLs built by the __params__ function and uses the __extractor__ function to pull the information out of each page and store it in a dataframe.

### This is the extractor function. 
This function is called on the job listings found in the soup that is built from each page's HTML. It returns a dictionary holding the scraped information together with the city that was searched.

In [489]:
def extractor(listings, scraped_data, city):
    """Append the fields of every job listing on one results page to scraped_data.

    Args:
        listings: iterable of listing tags taken from the soup of one results page;
            each one must support ``find(name, attrs)``.
        scraped_data: dict of lists with the keys 'title', 'company', 'location',
            'description', 'experience', 'salary', 'reviews', 'stars' and 'city'.
            It is modified in place.
        city: name of the searched city, stored once per listing.

    Returns:
        The same scraped_data dict, with one new entry per listing in every list.
        A field that is not present in a listing is recorded as the string 'NA'.
    """
    missing = 'NA'   # placeholder stored for every field a listing does not have

    def text_of(tag):
        """Stripped text of tag, or 'NA' when the tag was not found."""
        if tag is None:
            return missing
        return tag.text.strip()

    def attribute_of(tag, name):
        """Stripped attribute value, or None when the tag or the attribute is absent."""
        value = tag.get(name) if tag is not None else None
        return None if value is None else value.strip()

    def get_title(listing):
        """ gets job title (the 'title' attribute of the job link)"""
        title = attribute_of(listing.find('a', {'data-tn-element': 'jobTitle'}), 'title')
        return missing if title is None else title

    def get_company(listing):
        """ gets company"""
        return text_of(listing.find('span', {'class': 'company'}))

    def get_location(listing):
        """ gets location"""
        return text_of(listing.find('span', {'class': 'location'}))

    def get_description(listing):
        """ gets the summary text of the listing"""
        return text_of(listing.find('span', {'class': 'summary'}))

    def get_desired_exp(listing):
        """ gets desired experience"""
        return text_of(listing.find('span', {'class': 'experienceList'}))

    def get_salary(listing):
        """ gets salary if available"""
        return text_of(listing.find('span', {'class': 'no-wrap'}))

    def get_reviewcount(listing):
        """ gets number of reviews of the company"""
        review_link = listing.find('a', {'data-tn-element': 'reviewStars'})
        if review_link is None:
            return missing
        # The attribute filter must be a dict; the earlier {'class', 'slNoUnderline'}
        # was a set literal (comma instead of colon).
        return text_of(review_link.find('span', {'class': 'slNoUnderline'}))

    def get_stars(listing):
        """ gets stars of the company, expressed as the pixel width of the rating bar"""
        style = attribute_of(listing.find('span', {'class': 'rating'}), 'style')
        if style is None:
            return missing
        # e.g. 'width:52.2px' -> '52.2'
        return style.replace('width:', '').replace('px', '')

    for listing in listings:
        scraped_data['title'].append(get_title(listing))
        scraped_data['company'].append(get_company(listing))
        scraped_data['location'].append(get_location(listing))
        scraped_data['description'].append(get_description(listing))
        scraped_data['experience'].append(get_desired_exp(listing))
        scraped_data['salary'].append(get_salary(listing))
        scraped_data['reviews'].append(get_reviewcount(listing))
        scraped_data['stars'].append(get_stars(listing))
        scraped_data['city'].append(city)

    return scraped_data


        
    

### The params function. 
The params function builds the URLs for me to scrape, using the cities that are defined in the scraper function below. It returns a sequence of (URL, city) pairs, so that every URL to be scraped comes with the city it was built for.



In [490]:
def params(country, dictofcities):
    """Build the Indeed search URLs for every city of a country.

    Three URLs are generated per city, one for each job title (Data Scientist,
    Data Analyst, Data Engineer). Every query also matches any of the keywords
    machine, Learning, Analytics, Artificial or Intelligence, and asks for
    50 results per page.

    Args:
        country: one of 'SG', 'UK', 'US', 'HK', 'AU', 'CA'.
        dictofcities: dict mapping a country code to a list of city names.

    Returns:
        A list of (url, city) tuples, three consecutive entries per city, in the
        order the cities are listed.

    Raises:
        AssertionError: if country is not one of the supported codes.
        KeyError: if dictofcities has no entry for country.
    """
    base_urls = {
        'SG': "https://www.indeed.com.sg/jobs",
        'UK': "https://www.indeed.co.uk/jobs",
        'US': "https://www.indeed.com/jobs",
        'HK': "https://www.indeed.hk/jobs",
        'AU': "https://au.indeed.com/jobs",
        'CA': "https://ca.indeed.com/jobs",
    }
    # An exact lookup replaces the former `country in 'SG'` substring tests,
    # which would also have accepted 'S', 'G' or ''.
    assert country in base_urls, "unsupported country: {!r}".format(country)
    base_url = base_urls[country]

    job_titles = ('Data+Scientist', 'Data+Analyst', 'Data+Engineer')
    keywords = "(machine+or+Learning+or+Analytics+or+Artificial+or+Intelligence)"

    links = []
    for city in dictofcities[country]:   # see the scraper function for the dict
        # Whitespace inside a city name becomes '+': "New York" --> "New+York"
        location = '+'.join(city.split())
        for job_title in job_titles:
            url = "{}?q={}+{}&l={}&limit=50".format(base_url, job_title, keywords, location)
            links.append((url, city))

    return links
    

Let me test the params function here. It seems to work: each city is returned three times, once per job title.

In [492]:
# Sample city lists used only for this smoke test; the scraper function
# defines its own, longer dict of cities.
dictofcities = {'SG': ['Singapore'],
                'UK': ['London', 'Manchester', 'Birmingham', 'Liverpool', 'Milton Keynes', 'Bristol',
                       'Cambridge', 'Oxford'],
                'US': ['Houston', 'San Francisco', 'Mountain View', 'Palo Alto', 'Los Angeles', 'New York',
                       'San Jose', 'Boston', 'Chicago', 'Seattle', 'Austin', 'Dallas', 'San Diego', 'Denver',
                       'Portland', 'St Louis', 'Philadelphia', 'Cincinnati'],
                'CA': ['Toronto', 'Montreal', 'Vancouver'],
                'AU': ['Sydney', 'Melbourne', 'Brisbane', 'Perth'],
                'HK': ['Hong Kong']}


test = params('UK', dictofcities)
for t in test:
    # t is a (url, city) pair; print(...) with one argument prints the same
    # on Python 2 and also runs on Python 3.
    print(t[1])

London
London
London
Manchester
Manchester
Manchester
Birmingham
Birmingham
Birmingham
Liverpool
Liverpool
Liverpool
Milton Keynes
Milton Keynes
Milton Keynes
Bristol
Bristol
Bristol
Cambridge
Cambridge
Cambridge
Oxford
Oxford
Oxford


### Defining the scraper function.

This function calls the params and extractor functions to build a dataframe of job listings for the country passed in as its argument.

In [493]:
def scraper(country, max_results=1000, timeout=30):
    """Scrape Indeed job listings for every configured city of one country.

    The URLs come from the params function; every page of results is parsed
    with BeautifulSoup and handed to the extractor function. Progress and
    timing information is printed.

    Args:
        country: country code accepted by params ('SG', 'UK', 'US', 'HK', 'AU', 'CA').
        max_results: upper bound of the `start` offset requested per search URL;
            pages are fetched in steps of 50 results.
        timeout: seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A pandas DataFrame with the columns title, company, location, description,
        experience, salary, reviews, stars and city, with duplicate rows dropped.

    Raises:
        AssertionError: if a page does not come back with HTTP status 200
            (or if params rejects the country).
    """
    page_size = 50   # must match the `limit=50` that params puts into every URL

    print("Current system time: {}".format(time.ctime()))   # prints current time
    start_time = time.time()   # starts timer

    scraped_data = {'title': [],           # This dict will eventually be converted into a pandas dataframe.
                    'company': [],
                    'location': [],
                    'description': [],
                    'experience': [],
                    'salary': [],
                    'reviews': [],
                    'stars': [],
                    'city': []}

    # dict of cities to search. 'Birmingham' used to be listed twice for the UK,
    # which only repeated the same requests; the extra rows were removed again
    # by drop_duplicates below.
    dictofcities = {'SG': ['Singapore'],
                    'UK': ['London', 'Manchester', 'Birmingham', 'Liverpool', 'Milton Keynes', 'Bristol',
                           'Cambridge', 'Oxford', 'Leeds', 'Brighton', 'Southampton'],
                    # NOTE(review): the trailing '' produces a search without a location
                    # filter (l=) whose rows get the city '' -- confirm this is intended.
                    'US': ['Houston', 'San Francisco', 'Mountain View', 'Palo Alto', 'Los Angeles', 'New York',
                           'San Jose', 'Boston', 'Chicago', 'Seattle', 'Austin', 'Dallas', 'San Diego', 'Denver',
                           'Portland', 'St Louis', 'Philadelphia', 'Cincinnati', 'Atlanta', 'Berkeley',
                           'Detroit', 'Miami', ''],
                    'CA': ['Toronto', 'Montreal', 'Vancouver', 'Ottawa'],
                    'AU': ['Sydney', 'Melbourne', 'Brisbane', 'Perth'],
                    'HK': ['Hong Kong']}

    links = params(country, dictofcities)      # gets the (url, city) pairs to scrape
    print("Scraping Indeed for country: {}".format(country))
    for search_url, city in links:
        # iterates through the result pages up to max_results
        for offset in range(0, max_results, page_size):
            url = search_url + "&start={}".format(offset)

            response = requests.get(url, timeout=timeout)
            assert response.status_code == requests.codes.ok, \
                "GET {} returned HTTP {}".format(url, response.status_code)

            soup = BeautifulSoup(response.text, "lxml")
            # organicJob would not take the sponsored content
            listings = soup.find_all('div', {'data-tn-component': 'organicJob'})

            scraped_data = extractor(listings, scraped_data, city)  # look at extractor function.

    df = pd.DataFrame(scraped_data)      # makes a dataframe for the country
    dfdropped = df.drop_duplicates()     # drops duplicates
    run_time = time.time() - start_time  # stops timer
    print("Time elapsed in seconds: {}".format(run_time))
    print("Scraper finished at: {}".format(time.ctime()))
    return dfdropped
    


In [494]:
# Scrape the Singapore ('SG') listings into a de-duplicated DataFrame.
dfsg = scraper('SG')

Current system time: Thu Oct 26 14:57:13 2017
Scraping Indeed for country: SG
Time elapsed in seconds: 61.4437291622
Scraper finished at: Thu Oct 26 14:58:14 2017


In [495]:
# Preview the first rows of the Singapore results.
dfsg.head()

Unnamed: 0,city,company,description,experience,location,reviews,salary,stars,title
0,Singapore,Libbler,"Good understanding of investment and quant, ma...",,Singapore,,,,Research Data Manager
1,Singapore,Procter & Gamble,"Scope includes acquiring, cleaning, formatting...",,Singapore,"3,713 reviews",,52.2,Research & Development - Data Transformation S...
2,Singapore,EkkBaz.com,"Selecting features, building and optimizing cl...",,Ang Mo Kio,,"$1,000 - $2,000 a month",,Data Scientist Intern
3,Singapore,IBM,Familiar with at least one Machine Learning li...,,Singapore,"18,106 reviews",,51.0,Research Data Scientist (IBM Singapore Lab)
4,Singapore,GIC Investment,Data & Analytics Department. Experience buildi...,,Tanjong Pagar,,,,"AVP/VP, Machine Learning Engineer"


In [496]:
# Scrape the UK listings into a de-duplicated DataFrame.
dfuk = scraper('UK')

Current system time: Thu Oct 26 14:59:49 2017
Scraping Indeed for country: UK
Time elapsed in seconds: 2340.02391911
Scraper finished at: Thu Oct 26 15:38:49 2017


In [499]:
# (rows, columns) of the de-duplicated UK results.
dfuk.shape

(6687, 9)

In [500]:
# Save the Singapore DataFrame to dfsg.pkl in the working directory.
dfsg.to_pickle("dfsg.pkl")

In [501]:
# Save the UK DataFrame to dfuk.pkl in the working directory.
dfuk.to_pickle("dfuk.pkl")

In [502]:
# Scrape the US listings into a de-duplicated DataFrame.
dfus = scraper('US')

Current system time: Thu Oct 26 15:42:00 2017
Scraping Indeed for country: US
Time elapsed in seconds: 3492.91553903
Scraper finished at: Thu Oct 26 16:40:13 2017


In [503]:
# Save the US DataFrame to dfus.pkl in the working directory.
dfus.to_pickle("dfus.pkl")

In [504]:
# Scrape the Canada ('CA') listings into a de-duplicated DataFrame.
dfca = scraper('CA')

Current system time: Thu Oct 26 16:46:28 2017
Scraping Indeed for country: CA
Time elapsed in seconds: 501.877671957
Scraper finished at: Thu Oct 26 16:54:50 2017


In [505]:
# Save the Canada DataFrame to dfca.pkl in the working directory.
dfca.to_pickle("dfca.pkl")

In [506]:
# Scrape the Australia ('AU') listings into a de-duplicated DataFrame.
dfau = scraper('AU')

Current system time: Thu Oct 26 17:03:53 2017
Scraping Indeed for country: AU
Time elapsed in seconds: 770.558179855
Scraper finished at: Thu Oct 26 17:16:44 2017


In [507]:
# Save the Australia DataFrame to dfau.pkl in the working directory.
dfau.to_pickle("dfau.pkl")

In [508]:
# Scrape the Hong Kong ('HK') listings into a de-duplicated DataFrame.
dfhk = scraper('HK')

Current system time: Thu Oct 26 17:58:53 2017
Scraping Indeed for country: HK
Time elapsed in seconds: 58.5559232235
Scraper finished at: Thu Oct 26 17:59:51 2017


In [509]:
# Save the Hong Kong DataFrame to dfhk.pkl in the working directory.
dfhk.to_pickle("dfhk.pkl")

In [511]:
# Count how often each salary string occurs in the Hong Kong results;
# 'NA' is the placeholder the extractor stores when a listing shows no salary.
dfhk['salary'].value_counts()

NA                            782
$50,000 - $70,000 a month       4
$20,000 - $40,000 a month       4
$90,000 - $120,000 a month      2
$15,000 - $25,000 a month       2
$40,000 - $45,000 a month       2
$50,000 - $55,000 a month       1
$12,000 - $15,000 a month       1
$15,000 - $30,000 a month       1
Name: salary, dtype: int64