In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

Setup the query and url

In [4]:
def get_url(position, location):
    """Generate url from position and location"""
    template = 'https://ca.indeed.com/jobs?q={}&l={}'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url

In [5]:
url = get_url('data analyst', 'canada')
print(url)

https://ca.indeed.com/jobs?q=data+analyst&l=canada


Extract the html data

In [6]:
response = requests.get(url)

In [7]:
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
jobcards = soup.find('div', id="mosaic-provider-jobcards")

In [9]:
cards = jobcards.find_all('div', "job_seen_beacon")

In [10]:
len(cards)

15

Prototype the model with a single record

In [11]:
card = cards[1]

In [12]:
print(card.prettify)

<bound method Tag.prettify of <div class="job_seen_beacon"><div class="fe_logo"><img alt="PC Financial logo" class="feLogoImg desktop" src="https://d2q79iu7y748jz.cloudfront.net/s/_squarelogo/256x256/1e8a56c1d51f5b3b4d3ba8671085b3e9"/></div><table cellpadding="0" cellspacing="0" class="jobCard_mainContent big6_visualChanges" role="presentation"><tbody><tr><td class="resultContent"><div class="heading4 color-text-primary singleLineTitle tapItem-gutter"><h2 class="jobTitle jobTitle-color-purple jobTitle-newJob"><div class="new topLeft holisticNewBlue desktop"><span class="label">new</span></div><span title="Data Analyst, Data Enablement">Data Analyst, Data Enablement</span></h2></div><div class="heading6 company_location tapItem-gutter companyInfo"><span class="companyName"><a class="turnstileLink companyOverviewLink" data-tn-element="companyName" href="/cmp/PC-Financial" rel="noopener" target="_blank">PC Financial</a></span><span class="ratingsDisplay withRatingLink"><a class="ratingLin

In [13]:
job_title = card.h2.getText()
print(job_title)

newData Analyst, Data Enablement


In [14]:
companyName = card.find('span', 'companyName').text.strip()
print(companyName)

PC Financial


In [15]:
companyLocation = card.find('div', 'companyLocation').text.strip()
print(companyLocation)

Toronto, ON


In [16]:
post_date = card.find('span', 'date').text
today = datetime.today().strftime('%Y-%m-%d')
print(post_date)
print(today)

Posted8 days ago
2022-04-06


In [17]:
# this does not exists for all jobs, so handle the exceptions
salary_tag = card.find('span', 'salaryText')
if salary_tag:
    salary = salary_tag.text.strip()
else:
    salary = ''
print(salary_tag)

None


In [19]:
#job_url = 'https://www.indeed.ca' + card.h2.a.get('href')
#print(job_url)

In [21]:
#summary = card.find('div', 'summary').text.strip().replace('\n', ' ')

In [23]:
record = (job_title, job_location, post_date, today, summary, salary, job_url)

NameError: name 'job_location' is not defined

In [None]:
record

Generalize the model with a function

In [None]:
def get_record(card):
    """Extract job data from a single record"""
    
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.ca' + card.h2.a.get('href')

    # this does not exists for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''  
        
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

In [None]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

Get the next page

In [None]:
while True:
    try:
        url = 'https://www.indeed.ca' + soup.find('a', {'aria-label': 'Next'}).get('href')
    except AttributeError:
        break

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')

    for card in cards:
        record = get_record(card)
        records.append(record)

In [None]:
Putting it all together

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Generate url from position and location"""
    template = 'https://www.indeed.ca/jobs?q={}&l={}'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url


def get_record(card):
    """Extract job data from a single record"""
    
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')

    # this does not exists for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''  
        
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record


def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.ca' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
        
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

In [None]:
# run the main program
main('data analyst', 'canada')

Source
https://github.com/israel-dryer/Indeed-Job-Scraper