# Web Scraping LinkedIn Job Listings with Python

Reference : https://maoviola.medium.com/a-complete-guide-to-web-scraping-linkedin-job-postings-ad290fcaa97f

Here I reproduce Mao Viola's solution for web scraping LinkedIn job postings, with some adjustments and fixes.

Pre-requisite:
* Have Python 3.6 or newer installed (the code uses f-strings) : https://www.python.org/downloads/windows/
* Ensure pip or anaconda is installed 
* Have jupyter notebook installed : https://jupyter.org/install (if using pip) or https://anaconda.org/anaconda/jupyter (if using anaconda)
* Have Selenium WebDriver installed : https://pypi.org/project/selenium/ (if using pip) or https://anaconda.org/conda-forge/selenium (if using anaconda)
* Have pandas installed, together with openpyxl (pandas needs it to write the `.xlsx` output file)
* Download chrome webdriver : https://chromedriver.chromium.org/downloads (make sure it supports your Chrome version!)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
import time
import pandas as pd

## 1. Open the browser & scroll through the job listings

In [None]:
# Open the LinkedIn job search in Chrome, scroll until enough listings are
# loaded, then collect the listing cards into `jobs`.
#
# Selenium 4 takes the chromedriver location through a Service object; passing
# the path positionally to webdriver.Chrome() was deprecated in 4.0 and later removed.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

# Define browser and action setup
# Raw string: the backslashes of the Windows path must not be read as escape sequences.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(service=Service(executable_path=PATH))

# Define URL
linkedin_url = "https://www.linkedin.com/jobs/search/?keywords=data%20scientist&location=Indonesia"

# Action Steps
driver.maximize_window()
driver.get(linkedin_url)  # Open web page

# Determine how many jobs we want to scrape, and calculate how many times we need to scroll down.
# Alternative kept from the original notebook (reads the count from the page header instead):
# no_of_jobs = int(driver.find_element(By.CSS_SELECTOR, "h1>span").get_attribute("innerText"))
no_of_jobs = 300
# NOTE(review): 25 is presumably the number of listings loaded per scroll -- confirm against the page.
n_scroll = no_of_jobs // 25 + 1
print(n_scroll)

for _ in range(n_scroll):
    time.sleep(2)  # wait for 2 seconds
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom of page
    time.sleep(3)
    try:
        # Click the button at this position when it is present and clickable.
        button = driver.find_element(By.XPATH, "/html/body/div[1]/div/main/section[2]/button")
        time.sleep(2)
        button.click()
        time.sleep(1)
        print("load more click")
    except WebDriverException:
        # No clickable button this round: just pause before the next scroll.
        time.sleep(3)

print("total jobs :")
# Every <li> inside the results list is one job card.
jobs = driver.find_element(By.CLASS_NAME, "jobs-search__results-list").find_elements(By.TAG_NAME, 'li')  # return a list
print(len(jobs))


## 2. Get Main Attributes of each Job Listing

Important notes:
* The HTML and CSS element paths need to be checked regularly, because LinkedIn may change its page structure at any time.
* Consider grouping all of the web element paths that are likely to change in one place, so they are easy to update.
* This is not the most efficient code. It works, but it definitely has room for improvement, so feel free to improve it on your own.

In [None]:
# One list per scraped attribute; entry k of every list describes the k-th card in `jobs`.
job_id, job_title, company_name = [], [], []
location, date, job_link = [], [], []

# Kept as an ordinary for-loop so that `job` stays bound after the loop ends,
# in case later cells reference it.
for job in jobs:
    # Identifier stored on the card element itself.
    job_id.append(job.get_attribute('data-id'))

    # Title and company are the text of the card's <h3> and <h4>.
    job_title.append(job.find_element(By.CSS_SELECTOR, 'h3').get_attribute('innerText'))
    company_name.append(job.find_element(By.CSS_SELECTOR, 'h4').get_attribute('innerText'))

    location.append(
        job.find_element(By.CLASS_NAME, 'job-search-card__location').get_attribute('innerText')
    )

    # Value of the <time> element's `datetime` attribute.
    date.append(job.find_element(By.CSS_SELECTOR, 'div>div>time').get_attribute('datetime'))

    # href of the first <a> inside the card.
    job_link.append(job.find_element(By.CSS_SELECTOR, 'a').get_attribute('href'))

In [None]:
# Open every job card in turn and read its description and the four criteria
# fields (seniority, employment type, job function, industries) from the detail pane.
# Each list receives exactly one entry per card, so all lists stay aligned with `jobs`.
from selenium.common.exceptions import WebDriverException

jd = []
seniority = []
emp_type = []
job_func = []
industries = []

# Absolute XPaths into the page; they must be re-checked whenever the page markup changes.
card_list_path = '/html/body/div[1]/div/main/section[2]/ul'
detail_root = '/html/body/div[1]/div/section/div[2]/div'
# Criteria are read from <li> 1..4 of the criteria list, in this order.
criteria_lists = (seniority, emp_type, job_func, industries)


def _text_at(xpath, default=''):
    """Return the innerText of the element at absolute *xpath*, or *default* if it cannot be read."""
    try:
        text = driver.find_element(By.XPATH, xpath).get_attribute('innerText')
    except WebDriverException:
        return default
    return default if text is None else text


def _open_job(position):
    """Click the job card at 1-based *position*; return the XPath clicked, or None if no target worked."""
    # Try the card image first, then fall back to the card's link.
    for suffix in ('/a/div[1]/img', '/div/a'):
        job_click_path = f'{card_list_path}/li[{position}]{suffix}'
        try:
            driver.find_element(By.XPATH, job_click_path).click()
        except WebDriverException:
            continue
        return job_click_path
    return None


for position in range(1, len(jobs) + 1):
    # clicking job to view job details (exactly once per card)
    job_click_path = _open_job(position)
    if job_click_path is None:
        # The detail pane would still show the previous job, so record blanks
        # rather than attributing another job's details to this card.
        print(f"could not open job {position}; recording blank details")
        jd.append('')
        for values in criteria_lists:
            values.append('')
        continue
    print(job_click_path)
    time.sleep(3)  # give the detail pane time to load

    # The description is looked up under section[1] first, then under section[2].
    jd_path = detail_root + '/section[1]/div/div/section'
    jd0 = _text_at(jd_path, default=None)
    if jd0 is None:
        jd_path = detail_root + '/section[2]/div/div/section/div'
        jd0 = _text_at(jd_path)

    if 'Base pay range' in jd0:
        # NOTE(review): presumably section[1] is then a pay box and the real
        # description and criteria live in section[2] -- confirm against the page.
        jd_path = detail_root + '/section[2]/div/div/section/div'
        jd0 = _text_at(jd_path)
        jd_path2 = detail_root + '/section[2]/div'
    else:
        jd_path2 = detail_root + '/section[1]/div'
    print(jd_path)
    jd.append(jd0)
    print("appended")

    # A criterion that is not available is stored as an empty string.
    for li_index, values in enumerate(criteria_lists, start=1):
        values.append(_text_at(f'{jd_path2}/ul/li[{li_index}]/span'))
        
    





In [None]:
# Assemble the scraped lists into one table: one row per job, one column per attribute.
# All lists must have the same length for the DataFrame constructor to accept them.
job_columns = {
    'ID': job_id,
    'Date': date,
    'Company': company_name,
    'Title': job_title,
    'Location': location,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': industries,
}
job_data = pd.DataFrame(data=job_columns)


In [None]:
# Preview the first ten scraped rows (shown as the cell's output).
job_data.head(n=10)

In [None]:
# Change the output file path to your desired location; it must include the file name and extension.
output_file_path = 'C:/Users/Miranti/Documents/_WORK/Portofolio/linkedinwebscraping/DataScience.xlsx'
# Write the table to Excel without the DataFrame's index column.
job_data.to_excel(excel_writer=output_file_path, index=False)