In [1]:
import json
import math
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Load the cleaned ASX listing and order it from largest to smallest market cap
data = (
    pd.read_csv('data/asx_listed_cleaned.csv', index_col=0)
    .sort_values(by='Market Cap', ascending=False)
    .reset_index(drop=True)
)

In [3]:
# Preview the first rows of the listing
data.head()

Unnamed: 0,Code,Company,Sector,Market Cap
0,csl,csl ltd,health care,140501000000
1,cba,commonwealth bank of australia,financials,110976000000
2,bhp,bhp group ltd,materials,95298300000
3,wbc,westpac banking corporation,financials,58798200000
4,nab,national australia bank ltd,financials,50611200000


In [4]:
# Ticker codes of the companies to keep
companies = ['csl', 'cba', 'bhp', 'wow', 'tls']

# Boolean mask: True where the row's Code is one of the tickers above
company_mask = data.loc[:, 'Code'].isin(companies)

In [5]:
# Keep only the rows selected by the company mask
data_work = data.loc[company_mask]
data_work

Unnamed: 0,Code,Company,Sector,Market Cap
0,csl,csl ltd,health care,140501000000
1,cba,commonwealth bank of australia,financials,110976000000
2,bhp,bhp group ltd,materials,95298300000
6,wow,woolworths group ltd,consumer staples,45155400000
11,tls,telstra corporation ltd,other,36274600000


In [6]:
def check_null_rows(df):
    """
    Return the rows of a DataFrame that contain at least one missing value.

    ------------
    Input(s):
    df (pd.DataFrame): the frame to inspect.

    Output(s):
    (pd.DataFrame): the subset of `df` whose rows have a null in any column;
    empty when no row has a missing value.

    """
    has_missing = df.isna().any(axis="columns")
    return df.loc[has_missing]

# Build custom scraper classes

In [7]:
class BaseScraper:
    """
    The base scraper class, which stores the data and the URL for child scraper classes.

    ------------
    Initialisation parameters:
    raw_url (str): the search URL, stored unchanged for child classes to build requests from.

    ------------
    Attributes:
    raw_url (str): the URL given at initialisation.
    data (list): the scraped records; starts empty and is turned into a DataFrame by save_data().

    """
    def __init__(self, raw_url):
        self.raw_url = raw_url
        self.data = []

    def save_data(self, path_name):
        """
        Save all data to a csv file, and returns the data as a Pandas DataFrame.

        ------------
        Input(s):
        path_name (str): the named path to store the csv file. E.g.: '/path1/path2/company.csv'.
            Missing parent folders are created.

        Output(s):
        df (pd.DataFrame): the scraped data in Pandas DataFrame format.
        Also saves data to the path specified.

        """
        df = pd.DataFrame(self.data)

        # Create the destination folder first, so that a finished scrape does not
        # fail at the very last step because the folder has not been made yet.
        # Anything that is not a path (e.g. an open buffer) is handed to pandas as-is.
        if isinstance(path_name, (str, Path)):
            Path(path_name).parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(path_name, index=False, encoding='utf-8')

        return df


class LinkedInJobScraper(BaseScraper):
    """
    A custom LinkedIn Job Scraper object.

    ------------
    Initialisation parameters:
    total_jobs (int): the total number of jobs seen in the search.
    raw_url (str): the raw url with style and the start={} parameter removed, for scraping. For example:
        https://au.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=&location=Australia&locationId=&geoId=101452733&f_TPR=&f_C=2848
        The URL must still contain its query string, because scrape() appends `&start=N` to it.

    ------------
    Example use of this object:

    - Without logging into LinkedIn, find jobs according to desired criteria on https://au.linkedin.com/jobs/search
    - On Google Chrome, open the developer tools, then navigate to the `Network` tab.
    - Scroll down on the jobs section, and as more jobs are loaded, look for an element appearing
      on the left-hand panel of the developer tools starting with `search?`. Click on that element.
    - Grab the Request URL on the right-hand panel of the developer tools. The URL ends with a
      `start` parameter, e.g. `...&start=25`.
    - Remove the `start=25` parameter at the end, and this will be the raw URL as input for the scraper.
    - The total number of jobs shown at the beginning of the search page is also to be specified,
      to accurately calculate the number of loops to run during scrape().

    """
    def __init__(self, total_jobs, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_jobs = total_jobs

    @staticmethod
    def _extract_or_none(extractor):
        """Call `extractor` and return its result, or None when the lookup fails."""
        try:
            return extractor()
        # `Exception` keeps the best-effort behaviour (a missing tag yields None)
        # while still letting KeyboardInterrupt / SystemExit stop the scrape.
        except Exception:
            return None

    def _parse_job(self, soup):
        """Build one record (company, job_title, level, description) from a parsed job page."""
        return {
            "company": self._extract_or_none(
                lambda: soup.find("div", {"class": "top-card-layout__card"}).find("a").find("img").get('alt')
            ),
            "job_title": self._extract_or_none(
                lambda: soup.find("div", {"class": "top-card-layout__entity-info"}).find("a").text.strip()
            ),
            "level": self._extract_or_none(
                lambda: soup.find("ul", {"class": "description__job-criteria-list"}).find("li").text.replace("Seniority level", "").strip()
            ),
            "description": self._extract_or_none(
                lambda: soup.find("div", {"class": "show-more-less-html__markup"}).text.strip()
            ),
        }

    def scrape(self, debug=True, verbose=True, sleep_duration=0):
        """
        Scrape the specified URL.

        ------------
        Input(s):
        debug (bool, default=True): for debugging only, and should be ignored.
        verbose (bool, default=True): whether to print progress for each individual job scraped.
        sleep_duration (int or float, default=0): seconds to pause after each GET request.
            The default of 0 sends requests back to back.

        Output(s):
        None. Writes data into the `data` attribute of the class.

        """
        # Since each page contains 25 results, this is the total number of pages in the search
        num_pages_in_search = math.ceil(self.total_jobs / 25)

        # Starting positions for the job search: 0, 25, 50, ... below total_jobs
        iter_list = list(range(0, math.ceil(self.total_jobs), 25))

        # This logic ensures the calculation of the number of pages is correct
        if debug:
            assert num_pages_in_search == len(iter_list), \
            f'Mismatch between total pages in the search and total pages in iteration: \nnum_pages_in_search = {num_pages_in_search}, len(iter_list) = {len(iter_list)}'

        # Get the scraping URL from the raw URL
        url_scrape = "{}&start={}"

        # Initiate an empty job id list
        job_id_list = []

        # For each page of the search, scrape all job IDs and append to the list
        for start in iter_list:
            # Request the website to scrape
            res = requests.get(url_scrape.format(self.raw_url, start))

            # Parse the html of that site
            soup = BeautifulSoup(res.text, 'html.parser')

            # Jobs on the page are in 'li' tags; the job ID is the fourth ':'-separated part of the URN
            for card in soup.find_all('li'):
                try:
                    urn = card.find("div", {"class": "base-card"}).get('data-entity-urn')
                    job_id_list.append(urn.split(":")[3])
                # No base-card div, no URN attribute, or a URN with fewer than four parts: skip this tag
                except (AttributeError, IndexError):
                    continue

            if sleep_duration:
                sleep(sleep_duration)

        # Declare a shortened URL for jobs, which only differs in the job ID
        url_job = 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}'

        # Report progress against the number of IDs actually collected, which can
        # differ from the total_jobs figure typed in by the user.
        num_jobs_found = len(job_id_list)

        # For each job ID, request its page, scrape job information, and store
        for position, job_id in enumerate(job_id_list, start=1):
            if verbose:
                print(f'Scraping job {position} of {num_jobs_found}...')

            # Request the URL
            resp = requests.get(url_job.format(job_id))

            # Scrape job contents and append the job to the stored data attribute
            soup = BeautifulSoup(resp.text, 'html.parser')
            self.data.append(self._parse_job(soup))

            if sleep_duration:
                sleep(sleep_duration)

        if verbose:
            print('Done scraping all data.')
            

class SeekJobScraper(BaseScraper):
    """
    A custom Seek Job Scraper object.

    ------------
    Initialisation parameters:
    total_pages (int): the number of result pages to request; pages 1..total_pages are scraped.
    raw_url (str): the Seek search API URL with the `page` parameter removed.
        scrape() appends `&page=N` to it, so the URL must still contain its query string.

    """
    def __init__(self, total_pages, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_pages = total_pages

    def scrape(self, sleep_duration=3, verbose=True):
        """
        Scrape the specified URL.

        ------------
        Input(s):
        sleep_duration (int, default=3): sleep duration for each GET request.
        verbose (bool, default=True): whether to print progress for each individual job scraped.

        Output(s):
        None. Writes data into the `data` attribute of the class.

        """
        # Get the scraping URL from the raw URL
        url_scrape = "{}&page={}"

        # Get the shortened job URL that only differs in the jobId at the end
        url_job = 'https://www.seek.com.au/job/{}'

        # For each page of the search (numbered from 1), scrape all job details
        for page in range(1, self.total_pages + 1):
            # Request the website to scrape
            res = requests.get(url_scrape.format(self.raw_url, page))

            # Parse the response text, and parse the json object as dict
            soup = BeautifulSoup(res.text, 'html.parser')
            json_data = json.loads(soup.text)['data']
            num_jobs_on_this_page = len(json_data)

            # For each job found, extract the job information
            for j, job in enumerate(json_data):
                try:
                    # Only get organic job listing
                    if job['solMetadata']['jobAdType'] != 'ORGANIC':
                        continue

                    if verbose:
                        print(f'Scraping job {j+1} of {num_jobs_on_this_page} on page {page}...')

                    # A fresh record per job, so a failure part-way through one
                    # job can never carry fields over into the next.
                    data_individual = {
                        'company': job['companyName'],
                        'job_title': job['title'],
                    }

                    # Get the job description by going into the job details page, using the shortened URL
                    job_id = job['solMetadata']['jobId']
                    job_resp = requests.get(url_job.format(job_id))

                    # Pause after each job request, except after the last job on the page
                    if j != num_jobs_on_this_page - 1:
                        sleep(sleep_duration)

                    job_soup = BeautifulSoup(job_resp.text, 'html.parser')
                    try:
                        data_individual['description'] = job_soup.find('div', {'data-automation': 'jobAdDetails'}).find('div').text.strip()
                    except Exception:
                        data_individual['description'] = None

                    # Append the job details to the stored data attribute
                    self.data.append(data_individual)

                # Best effort: skip a job whose fields are missing or whose request fails.
                # `Exception` (not a bare except) lets KeyboardInterrupt stop a long scrape.
                except Exception:
                    continue

        if verbose:
            print('Done scraping all data.')

# Scraping: BHP Ltd. on LinkedIn

In [82]:
# BHP on LinkedIn: total_jobs is the job count shown on the search page
bhp_scraper = LinkedInJobScraper(
    total_jobs=147,
    raw_url='https://au.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=&location=Australia&locationId=&geoId=101452733&f_TPR=&f_C=4509',
)

bhp_scraper.scrape()

Scraping job 1...
Scraping job 2...
Scraping job 3...
Scraping job 4...
Scraping job 5...
Scraping job 6...
Scraping job 7...
Scraping job 8...
Scraping job 9...
Scraping job 10...
Scraping job 11...
Scraping job 12...
Scraping job 13...
Scraping job 14...
Scraping job 15...
Scraping job 16...
Scraping job 17...
Scraping job 18...
Scraping job 19...
Scraping job 20...
Scraping job 21...
Scraping job 22...
Scraping job 23...
Scraping job 24...
Scraping job 25...
Scraping job 26...
Scraping job 27...
Scraping job 28...
Scraping job 29...
Scraping job 30...
Scraping job 31...
Scraping job 32...
Scraping job 33...
Scraping job 34...
Scraping job 35...
Scraping job 36...
Scraping job 37...
Scraping job 38...
Scraping job 39...
Scraping job 40...
Scraping job 41...
Scraping job 42...
Scraping job 43...
Scraping job 44...
Scraping job 45...
Scraping job 46...
Scraping job 47...
Scraping job 48...
Scraping job 49...
Scraping job 50...
Scraping job 51...
Scraping job 52...
Scraping job 53...
Sc

In [83]:
# Persist the BHP LinkedIn jobs and keep them as a DataFrame
bhp_data = bhp_scraper.save_data(path_name='data/jobs_linkedin/bhp_jobs_linkedin.csv')

In [86]:
# Preview the first scraped BHP jobs
bhp_data.head()

Unnamed: 0,company,job_title,level,description
0,BHP,Lead Project Delivery | Perth |,Mid-Senior level,"About BHPAt BHP we support our people to grow,..."
1,BHP,Specialist Data Science | Value Engineering | ...,Entry level,"About BHPAt BHP we support our people to grow,..."
2,BHP,Principal Environment | Perth | Permanent,Director,"About BHPAt BHP we support our people to grow,..."
3,BHP,Environmental Specialist | Nickel West | Kwina...,Entry level,"About BHPAt BHP we support our people to grow,..."
4,BHP,Process Technician Entry | Nickel West | Kwinana,Entry level,"About BHPAt BHP we support our people to grow,..."


# Scraping: Telstra Ltd. on LinkedIn

In [95]:
# Telstra on LinkedIn: the f_C parameter lists several company IDs (comma-encoded as %2C)
tls_scraper = LinkedInJobScraper(
    raw_url='https://au.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=&location=Australia&locationId=&geoId=101452733&f_TPR=&f_C=89885306%2C1635%2C122653%2C14383808%2C14455912%2C26547568%2C1636%2C3330130',
    total_jobs=80,
)

tls_scraper.scrape()

# Persist the Telstra LinkedIn jobs and keep them as a DataFrame
tls_data = tls_scraper.save_data(path_name='data/jobs_linkedin/tls_jobs_linkedin.csv')

Scraping job 1 of 80...
Scraping job 2 of 80...
Scraping job 3 of 80...
Scraping job 4 of 80...
Scraping job 5 of 80...
Scraping job 6 of 80...
Scraping job 7 of 80...
Scraping job 8 of 80...
Scraping job 9 of 80...
Scraping job 10 of 80...
Scraping job 11 of 80...
Scraping job 12 of 80...
Scraping job 13 of 80...
Scraping job 14 of 80...
Scraping job 15 of 80...
Scraping job 16 of 80...
Scraping job 17 of 80...
Scraping job 18 of 80...
Scraping job 19 of 80...
Scraping job 20 of 80...
Scraping job 21 of 80...
Scraping job 22 of 80...
Scraping job 23 of 80...
Scraping job 24 of 80...
Scraping job 25 of 80...
Scraping job 26 of 80...
Scraping job 27 of 80...
Scraping job 28 of 80...
Scraping job 29 of 80...
Scraping job 30 of 80...
Scraping job 31 of 80...
Scraping job 32 of 80...
Scraping job 33 of 80...
Scraping job 34 of 80...
Scraping job 35 of 80...
Scraping job 36 of 80...
Scraping job 37 of 80...
Scraping job 38 of 80...
Scraping job 39 of 80...
Scraping job 40 of 80...
Scraping 

In [96]:
# Preview the first scraped Telstra jobs
tls_data.head()

Unnamed: 0,company,job_title,level,description
0,Telstra Health,Product Manager,Entry level,Work options: HybridMedicalDirector is part of...
1,Telstra Health,UX Designer,Mid-Senior level,Work options: HybridThe UX Designer needs to u...
2,Telstra,Customer Service and Sales Consultant,Mid-Senior level,Customer Service & Sales Consultant At Telstra...
3,Telstra Health,Service Desk Specialist,Entry level,Work options: Work From AnywhereTelstra Health...
4,Telstra Health,Customer Success Manager,Mid-Senior level,The Customer success Managers are trusted advi...


# Scraping: Woolworths Group on LinkedIn

In [110]:
# Woolworths Group on LinkedIn: total_jobs is the job count shown on the search page
wow_scraper = LinkedInJobScraper(
    total_jobs=61,
    raw_url='https://au.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=&location=Australia&locationId=&geoId=101452733&f_TPR=&f_C=295257',
)

wow_scraper.scrape()

Scraping job 1 of 61...
Scraping job 2 of 61...
Scraping job 3 of 61...
Scraping job 4 of 61...
Scraping job 5 of 61...
Scraping job 6 of 61...
Scraping job 7 of 61...
Scraping job 8 of 61...
Scraping job 9 of 61...
Scraping job 10 of 61...
Scraping job 11 of 61...
Scraping job 12 of 61...
Scraping job 13 of 61...
Scraping job 14 of 61...
Scraping job 15 of 61...
Scraping job 16 of 61...
Scraping job 17 of 61...
Scraping job 18 of 61...
Scraping job 19 of 61...
Scraping job 20 of 61...
Scraping job 21 of 61...
Scraping job 22 of 61...
Scraping job 23 of 61...
Scraping job 24 of 61...
Scraping job 25 of 61...
Scraping job 26 of 61...
Scraping job 27 of 61...
Scraping job 28 of 61...
Scraping job 29 of 61...
Scraping job 30 of 61...
Scraping job 31 of 61...
Scraping job 32 of 61...
Scraping job 33 of 61...
Scraping job 34 of 61...
Scraping job 35 of 61...
Scraping job 36 of 61...
Scraping job 37 of 61...
Scraping job 38 of 61...
Scraping job 39 of 61...
Scraping job 40 of 61...
Scraping 

In [111]:
# Persist the Woolworths LinkedIn jobs and keep them as a DataFrame
wow_data = wow_scraper.save_data(path_name='data/jobs_linkedin/wow_jobs_linkedin.csv')

# Full LinkedIn dataset

In [112]:
# Reload every per-company LinkedIn file from disk
wow, tls, cba, bhp, csl = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/jobs_linkedin/wow_jobs_linkedin.csv',
        'data/jobs_linkedin/tls_jobs_linkedin.csv',
        'data/jobs_linkedin/cba_jobs_linkedin.csv',
        'data/jobs_linkedin/bhp_jobs_linkedin.csv',
        'data/jobs_linkedin/csl_jobs_linkedin.csv',
    )
)

full_data = [wow, tls, cba, bhp, csl]

In [115]:
# Stack the per-company frames into one frame with a fresh 0..n-1 index
full_df = pd.concat(objs=full_data, axis=0, ignore_index=True)
full_df.head()

Unnamed: 0,company,job_title,level,description
0,Woolworths Group,Data Engineering Lead,Mid-Senior level,About Woolworths GroupWoolworths Group is a fo...
1,Woolworths Group,Account Executive - AGW Wholesale,Mid-Senior level,About Woolworths GroupWoolworths Group is a fo...
2,Woolworths Group,HVAC & R Technician,Mid-Senior level,About Woolworths GroupWoolworths Group is a fo...
3,Woolworths Group,Analytics Engineering Manager,Mid-Senior level,"Lead teams in the design, development, impleme..."
4,Woolworths Group,IT Senior Business Analyst - Payments,Mid-Senior level,"About UsFounded within the Woolworths group, W..."


In [116]:
# Summarise column dtypes and non-null counts of the combined frame
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company      601 non-null    object
 1   job_title    601 non-null    object
 2   level        601 non-null    object
 3   description  601 non-null    object
dtypes: object(4)
memory usage: 19.2+ KB


In [121]:
# List the rows with any missing value, using the notebook's shared helper
check_null_rows(full_df)

Unnamed: 0,company,job_title,level,description
214,,,,
215,,,,
216,,,,
241,,,,
285,,,,
286,,,,
291,,,,
294,,,,
297,,,,
345,,,,


In [122]:
# Remove every row that has a missing value in any column (index labels are kept)
full_df.dropna(axis='index', how='any', inplace=True)

In [124]:
# Re-check non-null counts after dropping the rows with missing values
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601 entries, 0 to 611
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company      601 non-null    object
 1   job_title    601 non-null    object
 2   level        601 non-null    object
 3   description  601 non-null    object
dtypes: object(4)
memory usage: 23.5+ KB


In [125]:
# Write the combined LinkedIn dataset to disk without the index column
full_df.to_csv(
    'data/jobs_linkedin/all_jobs_linkedin.csv',
    encoding='utf-8',
    index=False,
)

# Seek Job Scraper

In [203]:
## Test code

# url3 = 'https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=13bc85bd693e885e2fae2bdbb068900b-7439281&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=Commonwealth+Bank+of+Australia&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb&page=1'
# response = requests.get(url3)
# soup = BeautifulSoup(response.text, 'html.parser')
# site_json = json.loads(soup.text)['data']

In [212]:
## Test code

# site_json[0]

## -------------------------------------------
## Important information:
## - data is in json['data']
## - 'jobAdType' in 'solMetadata' should be 'ORGANIC'
## - Get 'jobId' from 'solMetadata'

{'advertiser': {'id': '36143057',
  'description': 'Commonwealth Bank - Business & Private Banking'},
 'area': 'CBD, Inner West & Eastern Suburbs',
 'areaId': 5027,
 'areaWhereValue': 'Sydney CBD, Inner West & Eastern Suburbs Sydney NSW',
 'automaticInclusion': False,
 'branding': {'id': '8631e925-21b9-f44d-c462-d6f47f9a37dc.1',
  'assets': {'logo': {'strategies': {'jdpLogo': 'https://bx-branding-gateway.cloud.seek.com.au/8631e925-21b9-f44d-c462-d6f47f9a37dc.1/jdpLogo',
     'serpLogo': 'https://bx-branding-gateway.cloud.seek.com.au/8631e925-21b9-f44d-c462-d6f47f9a37dc.1/serpLogo'}}}},
 'bulletPoints': ['Flexible working arrangements',
  'Real career opportunities',
  "Work for Australia's largest bank"],
 'classification': {'id': '1204',
  'description': 'Call Centre & Customer Service'},
 'companyName': 'Commonwealth Bank of Australia',
 'companyProfileStructuredDataId': 2034,
 'displayStyle': {'search': 'A'},
 'displayType': 'standout',
 'listingDateDisplay': '2d ago',
 'location': 

In [215]:
## Test code

# test_data = {}
# test_url_job = 'https://www.seek.com.au/job/67176650'

# job_resp = requests.get(test_url_job)
# job_soup = BeautifulSoup(job_resp.text, 'html.parser')

In [216]:
## Test code

# try:
#     test_data['description'] = job_soup.find('div', {'data-automation': 'jobAdDetails'}).find('div').text.strip()
# except:
#     test_data['description'] = None

# test_data

{'description': "Work from home options availableStructured opportunity to developWork for Australia's largest bank Do work that matters The Business Banking Contact Centre supports a range of business customer's financial needs to move their business from today into tomorrow. Our Business Banking team provide invaluable support to our frontline Relationship Managers as the first point of contact to our business and private banking customers.  See yourself in our team  You'll be joining the Business Banking team - a hardworking bunch who support our online and phone business banking activities. Customer experience will be at the centre of everything you do. As a Business Banking Associate you will receive incoming calls and provide a premium level of technical support over the phone to assist our clients with their Business Banking and their merchant facilities.  What we are looking for  Previous customer service experience and exceptional customer service skills is essentialFinance ex

# Scraping: Commonwealth Bank jobs on Seek

In [221]:
# Commonwealth Bank on Seek: request 10 result pages, then persist the scraped jobs.
# NOTE(review): the URL embeds userid / usersessionid query parameters — consider stripping them before sharing.
cba_seek_scraper = SeekJobScraper(
    raw_url='https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=13bc85bd693e885e2fae2bdbb068900b-7439281&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=Commonwealth+Bank+of+Australia&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb',
    total_pages=10,
)

cba_seek_scraper.scrape()

cba_seek_jobs = cba_seek_scraper.save_data(path_name='data/jobs_seek/cba_jobs_seek.csv')

Scraping job 1 of 20 on page 1...
Scraping job 2 of 20 on page 1...
Scraping job 3 of 20 on page 1...
Scraping job 4 of 20 on page 1...
Scraping job 5 of 20 on page 1...
Scraping job 6 of 20 on page 1...
Scraping job 7 of 20 on page 1...
Scraping job 8 of 20 on page 1...
Scraping job 9 of 20 on page 1...
Scraping job 10 of 20 on page 1...
Scraping job 11 of 20 on page 1...
Scraping job 12 of 20 on page 1...
Scraping job 13 of 20 on page 1...
Scraping job 14 of 20 on page 1...
Scraping job 15 of 20 on page 1...
Scraping job 16 of 20 on page 1...
Scraping job 17 of 20 on page 1...
Scraping job 18 of 20 on page 1...
Scraping job 19 of 20 on page 1...
Scraping job 20 of 20 on page 1...
Scraping job 1 of 20 on page 2...
Scraping job 2 of 20 on page 2...
Scraping job 3 of 20 on page 2...
Scraping job 4 of 20 on page 2...
Scraping job 5 of 20 on page 2...
Scraping job 6 of 20 on page 2...
Scraping job 7 of 20 on page 2...
Scraping job 8 of 20 on page 2...
Scraping job 9 of 20 on page 2...
Scr

In [225]:
# Show any Commonwealth Bank rows with missing values
check_null_rows(cba_seek_jobs)

Unnamed: 0,company,job_title,description


# Scraping: CSL jobs on Seek

In [9]:
# CSL on Seek: request 2 result pages, then persist the scraped jobs.
# NOTE(review): the URL embeds userid / usersessionid query parameters — consider stripping them before sharing.
csl_seek_scraper = SeekJobScraper(
    raw_url='https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=1b2af659e84347999573e079fb6fbfbc-4136558&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=CSL+Limited&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb',
    total_pages=2,
)

csl_seek_scraper.scrape()

csl_seek_jobs = csl_seek_scraper.save_data(path_name='data/jobs_seek/csl_jobs_seek.csv')

Scraping job 1 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 2 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 3 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 4 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 5 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 6 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 7 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 8 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 9 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 10 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 11 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 12 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 13 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 14 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 15 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 16 of 20 on page 1...
Sleeping for 3 seconds...
Scraping job 17 of 20 on page 1..

In [11]:
# Show any CSL rows with missing values
check_null_rows(csl_seek_jobs)

Unnamed: 0,company,job_title,description


# Scraping: BHP jobs on Seek

In [12]:
# BHP on Seek: request 8 result pages, then persist the scraped jobs.
# NOTE(review): the URL embeds userid / usersessionid query parameters — consider stripping them before sharing.
bhp_seek_scraper = SeekJobScraper(
    raw_url='https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=f16954fddb5e149bc48c68471d640531-4750155&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=BHP&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb',
    total_pages=8,
)

bhp_seek_scraper.scrape()

bhp_seek_jobs = bhp_seek_scraper.save_data(path_name='data/jobs_seek/bhp_jobs_seek.csv')

Scraping job 1 of 20 on page 1...
Scraping job 2 of 20 on page 1...
Scraping job 3 of 20 on page 1...
Scraping job 4 of 20 on page 1...
Scraping job 5 of 20 on page 1...
Scraping job 6 of 20 on page 1...
Scraping job 7 of 20 on page 1...
Scraping job 8 of 20 on page 1...
Scraping job 9 of 20 on page 1...
Scraping job 10 of 20 on page 1...
Scraping job 11 of 20 on page 1...
Scraping job 12 of 20 on page 1...
Scraping job 13 of 20 on page 1...
Scraping job 14 of 20 on page 1...
Scraping job 15 of 20 on page 1...
Scraping job 16 of 20 on page 1...
Scraping job 17 of 20 on page 1...
Scraping job 18 of 20 on page 1...
Scraping job 19 of 20 on page 1...
Scraping job 20 of 20 on page 1...
Scraping job 1 of 20 on page 2...
Scraping job 2 of 20 on page 2...
Scraping job 3 of 20 on page 2...
Scraping job 4 of 20 on page 2...
Scraping job 5 of 20 on page 2...
Scraping job 6 of 20 on page 2...
Scraping job 7 of 20 on page 2...
Scraping job 8 of 20 on page 2...
Scraping job 9 of 20 on page 2...
Scr

In [13]:
# Show any BHP rows with missing values
check_null_rows(bhp_seek_jobs)

Unnamed: 0,company,job_title,description


# Scraping: Woolworths Group jobs on Seek

In [15]:
# Woolworths Group on Seek: request 18 result pages with a 2-second pause, then persist the scraped jobs.
# NOTE(review): the URL embeds userid / usersessionid query parameters — consider stripping them before sharing.
wow_seek_scraper = SeekJobScraper(
    raw_url='https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=374a27e499beb4e0f37790206a2bc966-5035864&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=Woolworths+Group&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb',
    total_pages=18,
)

wow_seek_scraper.scrape(sleep_duration=2)

wow_seek_jobs = wow_seek_scraper.save_data(path_name='data/jobs_seek/wow_jobs_seek.csv')

Scraping job 1 of 20 on page 1...
Scraping job 2 of 20 on page 1...
Scraping job 3 of 20 on page 1...
Scraping job 4 of 20 on page 1...
Scraping job 5 of 20 on page 1...
Scraping job 6 of 20 on page 1...
Scraping job 7 of 20 on page 1...
Scraping job 8 of 20 on page 1...
Scraping job 9 of 20 on page 1...
Scraping job 10 of 20 on page 1...
Scraping job 11 of 20 on page 1...
Scraping job 12 of 20 on page 1...
Scraping job 13 of 20 on page 1...
Scraping job 14 of 20 on page 1...
Scraping job 15 of 20 on page 1...
Scraping job 16 of 20 on page 1...
Scraping job 17 of 20 on page 1...
Scraping job 18 of 20 on page 1...
Scraping job 19 of 20 on page 1...
Scraping job 20 of 20 on page 1...
Scraping job 1 of 20 on page 2...
Scraping job 2 of 20 on page 2...
Scraping job 3 of 20 on page 2...
Scraping job 4 of 20 on page 2...
Scraping job 5 of 20 on page 2...
Scraping job 6 of 20 on page 2...
Scraping job 7 of 20 on page 2...
Scraping job 8 of 20 on page 2...
Scraping job 9 of 20 on page 2...
Scr

Scraping job 17 of 20 on page 12...
Scraping job 18 of 20 on page 12...
Scraping job 19 of 20 on page 12...
Scraping job 20 of 20 on page 12...
Scraping job 1 of 20 on page 13...
Scraping job 2 of 20 on page 13...
Scraping job 3 of 20 on page 13...
Scraping job 4 of 20 on page 13...
Scraping job 5 of 20 on page 13...
Scraping job 6 of 20 on page 13...
Scraping job 7 of 20 on page 13...
Scraping job 8 of 20 on page 13...
Scraping job 9 of 20 on page 13...
Scraping job 10 of 20 on page 13...
Scraping job 11 of 20 on page 13...
Scraping job 12 of 20 on page 13...
Scraping job 13 of 20 on page 13...
Scraping job 14 of 20 on page 13...
Scraping job 15 of 20 on page 13...
Scraping job 16 of 20 on page 13...
Scraping job 17 of 20 on page 13...
Scraping job 18 of 20 on page 13...
Scraping job 19 of 20 on page 13...
Scraping job 20 of 20 on page 13...
Scraping job 1 of 20 on page 14...
Scraping job 2 of 20 on page 14...
Scraping job 3 of 20 on page 14...
Scraping job 4 of 20 on page 14...
Scrap

In [16]:
# Show any Woolworths Group rows with missing values
check_null_rows(wow_seek_jobs)

Unnamed: 0,company,job_title,description


# Scraping: Telstra jobs on Seek

In [17]:
# Telstra on Seek: request 7 result pages with a 2-second pause, then persist the scraped jobs.
# NOTE(review): the URL embeds userid / usersessionid query parameters — consider stripping them before sharing.
tls_seek_scraper = SeekJobScraper(
    raw_url='https://www.seek.com.au/api/chalice-search/v4/search?siteKey=AU-Main&sourcesystem=houston&userqueryid=8a9f4c84f660d4709c8a71bcfa927813-5698512&userid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&usersessionid=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&eventCaptureSessionId=fb3867e8-2415-49bb-963a-a9cbdd9d7a23&where=All+Australia&seekSelectAllPages=true&companyname=Telstra&include=seodata&locale=en-AU&solId=8e8f007e-f45c-4c00-913d-7c862f50c2cb',
    total_pages=7,
)

tls_seek_scraper.scrape(sleep_duration=2)

tls_seek_jobs = tls_seek_scraper.save_data(path_name='data/jobs_seek/tls_jobs_seek.csv')

Scraping job 1 of 20 on page 1...
Scraping job 2 of 20 on page 1...
Scraping job 3 of 20 on page 1...
Scraping job 4 of 20 on page 1...
Scraping job 5 of 20 on page 1...
Scraping job 6 of 20 on page 1...
Scraping job 7 of 20 on page 1...
Scraping job 8 of 20 on page 1...
Scraping job 9 of 20 on page 1...
Scraping job 10 of 20 on page 1...
Scraping job 11 of 20 on page 1...
Scraping job 12 of 20 on page 1...
Scraping job 13 of 20 on page 1...
Scraping job 14 of 20 on page 1...
Scraping job 15 of 20 on page 1...
Scraping job 16 of 20 on page 1...
Scraping job 17 of 20 on page 1...
Scraping job 18 of 20 on page 1...
Scraping job 19 of 20 on page 1...
Scraping job 20 of 20 on page 1...
Scraping job 1 of 20 on page 2...
Scraping job 2 of 20 on page 2...
Scraping job 3 of 20 on page 2...
Scraping job 4 of 20 on page 2...
Scraping job 5 of 20 on page 2...
Scraping job 6 of 20 on page 2...
Scraping job 7 of 20 on page 2...
Scraping job 8 of 20 on page 2...
Scraping job 9 of 20 on page 2...
Scr

# Full Seek Dataset

In [19]:
# Reload every per-company Seek file from disk
cba_seek_jobs, csl_seek_jobs, bhp_seek_jobs, wow_seek_jobs, tls_seek_jobs = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/jobs_seek/cba_jobs_seek.csv',
        'data/jobs_seek/csl_jobs_seek.csv',
        'data/jobs_seek/bhp_jobs_seek.csv',
        'data/jobs_seek/wow_jobs_seek.csv',
        'data/jobs_seek/tls_jobs_seek.csv',
    )
)

full_seek_data = [cba_seek_jobs, csl_seek_jobs, bhp_seek_jobs, wow_seek_jobs, tls_seek_jobs]

# Stack the per-company frames into one frame with a fresh 0..n-1 index
full_seek_data_df = pd.concat(objs=full_seek_data, axis=0, ignore_index=True)
full_seek_data_df.head()

Unnamed: 0,company,job_title,description
0,Commonwealth Bank of Australia,Business Banking Customer Service Representative,Work from home options availableStructured opp...
1,Commonwealth Bank of Australia,HR Advisor,See yourself in our team: People are a key pa...
2,Commonwealth Bank of Australia,Customer Service Representative - CommSec,You are available to work Monday to Friday bet...
3,Commonwealth Bank of Australia,Customer Service Representative - CommSec,You are available to work Monday to Friday bet...
4,Commonwealth Bank of Australia,Customer Service Specialist,About us: The Customer Service Remediation (...


In [20]:
# Write the combined Seek dataset to disk without the index column
full_seek_data_df.to_csv(
    'data/jobs_seek/all_jobs_seek.csv',
    encoding='utf-8',
    index=False,
)

In [21]:
# (rows, columns) of the combined Seek dataset
full_seek_data_df.shape

(857, 3)

In [22]:
# Show any rows of the combined Seek dataset with missing values
check_null_rows(full_seek_data_df)

Unnamed: 0,company,job_title,description
