### Extracting Indeed Job Postings With Selenium

In [1]:
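# install dependencies (run once, e.g. in a terminal); selenium and pandas
#    are the packages imported by this notebook
# pip install selenium pandas webdriver-manager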

In [2]:
# import modules
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [3]:
def transform():
    '''Collect the job postings shown on the page currently loaded in `driver`.

    Reads the module-level `driver` (the Selenium WebDriver created by the
    scraping loop) and looks up every element with class `job_seen_beacon`.
    For each such job card it gathers the text of the first link (title),
    the company name, the location, and the salary snippet.

    Each posting is appended to the module-level `job_posting_list` as it is
    parsed, so postings gathered before a failure are kept.

    Returns:
        list of dict: one dictionary per job card found on the current page,
        with keys 'title', 'company', 'location', and 'salary'. 'salary' is
        '' when the card has no `attribute_snippet` element.

    Raises:
        NoSuchElementException: if a card has no link, no `companyName`
            element, or no `companyLocation` element.
    '''
    page_postings = []

    # extract job card elements
    job_cards_elements = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')

    # gather title, company, location, and salary
    for job in job_cards_elements:
        title = job.find_element(By.TAG_NAME, "a").text
        company = job.find_element(By.CLASS_NAME, "companyName").text
        location = job.find_element(By.CLASS_NAME, "companyLocation").text

        # some postings do not have a salary; only the "element not found"
        #    case is treated as a missing salary, so that other failures
        #    (lost browser session, keyboard interrupt, ...) are not hidden.
        # NOTE: the first `attribute_snippet` is not always a salary; the
        #    sample output in this notebook shows values such as 'Full-time'.
        try:
            salary = job.find_element(By.CLASS_NAME, "attribute_snippet").text
        except NoSuchElementException:
            salary = ''

        # create dictionary for job attributes
        job_posting = {
            'title': title,
            'company': company,
            'location': location,
            'salary': salary
        }

        # record the posting both in the shared list and in this page's result
        job_posting_list.append(job_posting)
        page_postings.append(job_posting)

    return page_postings


# shared accumulator filled by transform(); re-running this cell resets it
job_posting_list = []

In [4]:
# number of result pages to fetch; 65 was chosen to get a sample of roughly
#    10% of the ~10000 available postings (about 1000 postings).
total_pages = 65

# NOTE(review): Indeed's `start` query parameter appears to be a result
#    offset that advances by 10 per results page, not a page index. Passing
#    the page index directly requests overlapping results and yields
#    duplicate postings. Confirm the step against the live site.
RESULTS_OFFSET_STEP = 10

# start from an empty list so that re-running this cell does not append the
#    same postings a second time
job_posting_list.clear()

# one Chrome session is reused for all pages; transform() reads the
#    module-level `driver`
driver = webdriver.Chrome()
try:
    # loop through pages
    for page in range(total_pages):
        offset = page * RESULTS_OFFSET_STEP
        url = f"https://www.indeed.com/jobs?q=Data+Scientist&start={offset}"
        driver.get(url)
        transform()
finally:
    # always close the browser, even when a page fails to load or parse
    driver.quit()

In [6]:
# report how many postings were extracted
posting_count = len(job_posting_list)
print(posting_count)

# build the dataframe from the collected dictionaries and preview the
#    first five rows
df_postings = pd.DataFrame(data=job_posting_list)
preview_rows = df_postings.head(5)
print(preview_rows)

900
                                     title  \
0                      Lead Data Scientist   
1                Senior Predictive Modeler   
2                            Data Engineer   
3  Risk Intelligence - Data Analytics Lead   
4                           Data Scientist   

                                     company  \
0  Disney Media & Entertainment Distribution   
1                         Sitewise Analytics   
2                        Booz Allen Hamilton   
3                                   Citizens   
4                          Brightside Health   

                               location                      salary  
0                Santa Monica, CA 90401  $149,240 - $200,200 a year  
1                            Dallas, TX         From $90,000 a year  
2  Hybrid remote in Arlington, VA 22202   $73,100 - $166,000 a year  
3     Boston, MA 02112 \n(Central area)                   Full-time  
4                                Remote                   Full-time  


In [9]:
# write the postings dataframe to a CSV file in the working directory
csv_path = 'job_postings.csv'
df_postings.to_csv(csv_path)