In [4]:
import csv
from bs4 import BeautifulSoup
import pandas as pd
import requests
import datetime as dt

## Navigate To Website

In [1]:
# get to indeed.com

def get_soup(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'DNT': '1'
    }

    page = requests.get(url, headers=headers)

    # create soup record
    soup = BeautifulSoup(page.content, 'html.parser')
    # records = soup.find_all('td', class_='resultContent')
    return soup


## Create Function to Extract Records

In [11]:
def get_record(record):

    title = record.h2.text
    company = record.find('span', class_='companyName').text
    location = record.find('div', class_='companyLocation').text

    # extract pay information if it exists
    try:
        pay = record.find('div', class_='metadata salary-snippet-container').text
    except AttributeError:
        pay = ''

    # extract rating informaiton if it extists
    try:
        rating = record.find('span', {'aria-hidden': 'true'}).text
    except AttributeError:
        rating = ''

    result = (title, company, location, pay, rating)

    return result


## Collect Records From All Pages

In [8]:
def scrape(url):
    
    soup = get_soup(url)
    data = []
    while True:
        try:
            url = 'https://ca.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        records = soup.find_all('td', class_='resultContent')

        for card in records:
            record = get_record(card)
            data.append(record)

    return pd.DataFrame(data)



In [20]:
df1 = scrape('https://ca.indeed.com/jobs?q=geologist&l&vjk=7dc5bc08dd5403df')
df1['job'] = 'Geologist'
df2 = scrape('https://ca.indeed.com/jobs?q=data%20analyst&l&vjk=d5c29d10430b1346')
df2['job'] = 'Data Analyst'

data = df1.append(df2)

In [21]:
data

Unnamed: 0,0,1,2,3,4,job
0,newData Analyst - Geophysics,Sander Geophysics Limited,"Ottawa, ON",,3.8,Geologist
1,newCore Logging Geologist (Remote Camp),Workforce Inc.,"Timmins, ON",$34–$40 an hour,4.6,Geologist
2,mine geologist,SLR Consulting (Canada) Ltd.,"Toronto, ON","$120,000–$145,000 a year",3.7,Geologist
3,newProject Geologist,Fladgate Exploration Consulting Corporation,"Thunder Bay, ON",,,Geologist
4,Geological Technician/Jr Geologist,Healthcare Systems R & A Inc.,"Montréal, QC",$24–$35 an hour,,Geologist
...,...,...,...,...,...,...
955,"Intermediate Business Analyst, HR SW & Reporting",Procom,"Toronto, ON",,3.6,Data Analyst
956,"newBusiness Analyst (Trading, Capital Markets,...",Teamrecruiter.com,"Toronto, ON",,,Data Analyst
957,Infrastructure Business Analyst,TES - The Employment Solution,"Regina, SK",,3.6,Data Analyst
958,newSenior Business Analyst,Procom,"Toronto, ON",,3.6,Data Analyst


## Create Pandas Data.Frame

In [24]:
# covert to DataFrame
df = data

# set columns
df.columns = ["title", "company", "location", "pay", "rating", "job"]

# file name
fname = "job-data" + dt.datetime.today().strftime('%Y-%m-%d') + '.csv'

# save to .csv
df.to_csv(fname)

# print first 5 rows of data
df.head()

Unnamed: 0,title,company,location,pay,rating,job
0,newData Analyst - Geophysics,Sander Geophysics Limited,"Ottawa, ON",,3.8,Geologist
1,newCore Logging Geologist (Remote Camp),Workforce Inc.,"Timmins, ON",$34–$40 an hour,4.6,Geologist
2,mine geologist,SLR Consulting (Canada) Ltd.,"Toronto, ON","$120,000–$145,000 a year",3.7,Geologist
3,newProject Geologist,Fladgate Exploration Consulting Corporation,"Thunder Bay, ON",,,Geologist
4,Geological Technician/Jr Geologist,Healthcare Systems R & A Inc.,"Montréal, QC",$24–$35 an hour,,Geologist
