In [5]:
#########################
## Mark Bjerregaard     #
## MARKBJ@UMICH.EDU     #
#########################
'''
Linkedin job scrape script
Scrapes jobs using selenium and chromedriver available here: https://chromedriver.chromium.org/

Steps:
1. Enter Linkedin log-in page
2. Spot the cookies pop-up and accept cookies
3. Fill in the e-mail address and password fields and click log in
4. Click on the jobs from the section above
5. Search for job positions Data Analyst 
6. Scroll till end of page, collecting links on the way
7. At the end of each page, go to the next page and keep collecting links
8. After all links are collected, go to each link
9. Click the see more button to expand the job description text
10. Scrape the desired data


Items to be scraped 
1. Job title 
2. Company name
3. Company location
4. job description
5. work method (hybrid, remote, on-site)
6. Post date
'''
''

''

In [37]:
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import json

# Upper bound of the pagination loop below: page 1 is scraped first, then
# `page_iterations - 1` further pages are opened and scraped.
page_iterations = 50
# Text typed into the area filter of the search form.
search_area = 'Copenhagen Metropolitan Area'
# Keywords typed into the position search box.
search_position = 'Data Analyst'

def open_cache(cache_name):
    '''
    opens the cache file if it exists and loads the JSON into a dictionary, which it then returns
    If the cache file doesn't exist, can't be read or doesn't hold valid JSON,
    a new empty cache dictionary is returned instead
    Parameters
    ---
    cache_name: str
        path of the JSON cache file to read
    Returns
    ---
    The opened cache
    '''
    try:
        # the context manager closes the file even when reading or parsing fails
        with open(cache_name, 'r') as cache_file:
            cache_dict = json.load(cache_file)
    except (OSError, ValueError):
        # OSError: file missing or unreadable
        # ValueError: invalid JSON (json.JSONDecodeError) or undecodable bytes
        cache_dict = {}
    return cache_dict

def save_cache(cache_dict, cache_name):
    '''
    saves the current state of the cache to disk as indented JSON
    Parameters
    ---
    cache_dict: dict
        the dictionary to save
    cache_name: str
        path of the file to write; an existing file is overwritten
    Returns
    ---
    None
    '''
    # serialise before opening the file, so a dictionary that can't be
    # serialised doesn't truncate an existing cache file
    dumped_json_cache = json.dumps(cache_dict, indent=4)
    # the context manager closes the file even when the write fails
    with open(cache_name, 'w') as cache_file:
        cache_file.write(dumped_json_cache)

def job_scrape(driver):
    '''
    Function for scraping the job links shown on the current page.
    Every new link is added to the cache file 'link_cache.json' together with
    a running number that continues from the highest number already cached.
    Input: chromedriver (current webpage)
    output: dictionary mapping each job link to its running number
            (previously cached links included)
    '''
    links_cache = 'link_cache.json'
    link_dict = open_cache(links_cache)

    # default=0: with an empty cache (e.g. the very first run) numbering
    # starts at 1 instead of max() raising ValueError
    link_count = max(link_dict.values(), default=0)
    for job in driver.find_elements(By.CLASS_NAME, 'job-box'):
        link = job.get_attribute('href')
        # skip elements without an href (would end up as a None key)
        # and links that were collected before
        if not link or link in link_dict:
            continue
        link_count += 1
        link_dict[link] = link_count

    save_cache(link_dict, links_cache)

    return link_dict

# Start Chrome through the chromedriver executable 'chromedriver.exe'
# (relative path), with the browser window maximized
options = Options()
options.add_argument('start-maximized')
driver = webdriver.Chrome(service =Service('chromedriver.exe'), options=options)
# element lookups wait up to 10 seconds before failing
driver.implicitly_wait(10)

# Open the Graduateland job search page
driver.get('https://graduateland.com/jobs')
# fixed pause before interacting with the page
time.sleep(2)

# Filtering on area: type the area into the filter input, click the input,
# then click the second label of the list next to it
# NOTE(review): presumably that label is the suggestion matching search_area - confirm on the site
driver.find_element(By.XPATH, '//*[@id="search-filters"]/div[46]/div/div[1]/input').send_keys(search_area)
driver.find_element(By.XPATH, '//*[@id="search-filters"]/div[46]/div/div[1]/input').click()
driver.find_element(By.XPATH, '//*[@id="search-filters"]/div[46]/div/div[2]/label[2]').click()

# Entering position filter keywords and submitting the search with ENTER
driver.find_element(By.XPATH, '//*[@id="job-search-form"]/div[1]/div[1]/div[1]/input').send_keys(search_position, Keys.ENTER)
# fixed pause before the first page is scraped
time.sleep(3)

# Scrape the first result page before starting to paginate
page = 1
print('scraping job links, page:', page)
links = job_scrape(driver)

# Open and scrape the remaining pages (page_iterations pages in total)
for i in range(1, page_iterations):
    # Position of the anchor to click inside the pagination bar:
    # 2 and 4 for the first two clicks, 5-7 for the next three, then always 7.
    # NOTE(review): presumably this follows the changing layout of the site's
    # pagination bar - verify against the live page
    if i < 3:
        index = i * 2
    elif i < 6:
        index = i + 2
    else:
        # this used to assign `i = 7`: `index` only kept working through its
        # stale value, and the clobbered loop counter disabled the pause below
        index = 7
    driver.find_element(By.XPATH, '//*[@id="timeline"]/div[3]/div/a[{}]'.format(index)).click()
    page += 1
    print('scraping job links, page:', page)
    # fixed pause after the click, before the page is scraped
    time.sleep(3)
    links = job_scrape(driver)
    # take a one minute break after every 20th page change
    if i % 20 == 0:
        time.sleep(60)



scraping job links, page: 1
scraping job links, page: 2
scraping job links, page: 3
scraping job links, page: 4
scraping job links, page: 5
scraping job links, page: 6
scraping job links, page: 7
scraping job links, page: 8
scraping job links, page: 9
scraping job links, page: 10
scraping job links, page: 11
scraping job links, page: 12
scraping job links, page: 13
scraping job links, page: 14
scraping job links, page: 15
scraping job links, page: 16
scraping job links, page: 17
scraping job links, page: 18
scraping job links, page: 19
scraping job links, page: 20
scraping job links, page: 21
scraping job links, page: 22
scraping job links, page: 23
scraping job links, page: 24
scraping job links, page: 25
scraping job links, page: 26
scraping job links, page: 27
scraping job links, page: 28
scraping job links, page: 29
scraping job links, page: 30
scraping job links, page: 31
scraping job links, page: 32
scraping job links, page: 33
scraping job links, page: 34
scraping job links, pag

99
