In [1]:
# Data Wrangling & Other General Use
import pandas as pd
import time
import random
from datetime import datetime

# For scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib
from urllib import parse

# For handling files
import os
from os.path import isfile, join, splitext

# For debugging
from icecream import ic
# Prefix used by icecream for every debug line it prints
ic.configureOutput(prefix = 'Debug | ')

# Browser-like User-Agent; gather_full_info passes this dict to requests.get
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
# Wall-clock start of the run; the last cell uses it to report the total run time
start = time.time()

## 1. Gathering the page's full HTML code (w/ Selenium)

In [2]:
def build_url(keywords_in, location_in):
    """Build the LinkedIn job-search URL for the given keywords and location.

    Both values are URL-encoded; the search always starts at position 1, page 0.
    """
    query_params = {
        'keywords': keywords_in,
        'location': location_in,
        'position': 1,
        'pageNum': 0,
    }
    return f"https://www.linkedin.com/jobs/search?{parse.urlencode(query_params)}"

In [3]:
def gather_full_html(url):
    """Open a LinkedIn job-search page in Chrome and load every job card.

    The page is expanded by clicking the "Show More Jobs" button when it can
    be clicked and by scrolling to the bottom otherwise, until the "You've
    viewed all jobs" notice appears or about 100 seconds have passed in the
    current attempt.

    If the expected page elements cannot be read, this is treated as LinkedIn
    blocking the crawl: the browser is shut down, the function waits 30
    seconds and retries with a brand-new browser. It retries indefinitely.

    Parameters
    ----------
    url : str
        Search URL, e.g. the value returned by ``build_url``.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed HTML of the page as last seen in the browser. After the
        100-second timeout it may contain only part of the advertised jobs.
    """

    print('STAGE 1: GATHERING THE PAGE FULL HTML CODE -----------------------------------------\n')

    # Path to the ChromeDriver executable
    driver_path = 'chromedriver.exe'
    sleep_if_blocked = 30    # seconds to wait before retrying after a block
    max_load_seconds = 100   # give up expanding the page after this long

    # Both attribute tests must live inside ONE predicate; closing the bracket
    # before "and" makes the XPath invalid, so the button is never found.
    cookie_button_xpath = ("//button[@class='artdeco-global-alert-action artdeco-button artdeco-button--inverse "
                           "artdeco-button--2 artdeco-button--primary' "
                           "and @data-tracking-control-name='ga-cookie.consent.accept.v3']")
    show_more_xpath = "//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']"

    while True:
        # A new browser per attempt: the previous one is shut down when an
        # attempt fails, so it cannot be reused for the retry.
        driver = webdriver.Chrome(driver_path)
        blocked = False

        try:
            driver.get(url)
            start_while = time.time()

            # Get the number of jobs the page shows on top of the cards
            soup = BeautifulSoup(driver.page_source, "lxml")

            try:
                # Click the "Accept Cookies" button, if it displays
                try:
                    driver.find_element_by_xpath(cookie_button_xpath).click()
                    print('Cookies Accepted.\n')
                except Exception:
                    # Best effort: the banner is not always shown
                    pass

                nr_jobs = soup.find('span', class_ = 'results-context-header__job-count').text.strip()
                print(f'Total Number of Jobs Advertised in the Top: {nr_jobs}\n')

                nr_jobs_initial = get_jobs_loaded(driver)
                print('Number of Jobs Loaded in the Browser:')
                print(f'  @ Opening Page: {nr_jobs_initial}')

                scrolls = 0
                buttons = 0

                # Stop when a "You've viewed all jobs" card appears
                while soup.find('div', class_ = 'inline-notification see-more-jobs__viewed-all') is None:

                    nr_jobs_loaded_init = get_jobs_loaded(driver)

                    try:
                        # Click the "Show More Jobs" button
                        driver.find_element_by_xpath(show_more_xpath).click()
                        clicked = True
                    except Exception:
                        # Scroll through the infinite scroll until the "Show More Jobs" button appears
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        clicked = False

                    # Give the browser some time to fetch the results
                    if clicked:
                        buttons += 1
                        step = f"{buttons} {'Button' if buttons == 1 else 'Buttons'}"
                        time.sleep(1.2)
                    else:
                        scrolls += 1
                        step = f"{scrolls} {'Scroll' if scrolls == 1 else 'Scrolls'}"
                        time.sleep(1.8)

                    # Printing the number of jobs already loaded
                    nr_jobs_loaded = get_jobs_loaded(driver)
                    if nr_jobs_loaded != nr_jobs_loaded_init:
                        print(f'  After {step}: {nr_jobs_loaded}')

                    # Refreshing the soup for assessment in the while loop condition
                    soup = BeautifulSoup(driver.page_source, "lxml")

                    # If the page is taking too long to load, stop expanding
                    # and return what has been loaded so far
                    if time.time() - start_while > max_load_seconds:
                        print('Taking a while. Maybe it is better to restart.')
                        break

            except Exception:
                # The expected elements were not readable: assume we were blocked
                blocked = True

        finally:
            # quit() ends the whole WebDriver session, not just the window
            driver.quit()

        if blocked:
            print(f'Linkedin is blocking the crawling. Waiting {sleep_if_blocked} seconds to try again.')
            time.sleep(sleep_if_blocked)
            continue

        print("\nBrowser is now closed.")
        print('\n------------------------------------------------------------------------------------\n')

        return soup

def get_jobs_loaded(driver):
    """Count the job cards currently present in the browser's page.

    Parses ``driver.page_source`` and counts the ``<li>`` items inside the
    ``jobs-search__results-list`` list.

    Raises
    ------
    AttributeError
        When the results list is not in the page. ``gather_full_html`` catches
        this and treats it as LinkedIn blocking the crawl.
    """
    # Name the parser explicitly, like every other BeautifulSoup call in this notebook
    soup_jobs = BeautifulSoup(driver.page_source, "lxml")
    results_list = soup_jobs.find('ul', class_ = 'jobs-search__results-list')
    return len(results_list.find_all('li'))

## 2. Gathering all information from the job cards (w/ BeautifulSoup)

In [4]:
def gather_job_card_info(soup):
    """Gathering all information from the job cards (w/ BeautifulSoup)

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed search-results page containing the ``jobs-search__results-list`` list.

    Returns
    -------
    pandas.DataFrame
        One row per unique job with the columns Company, Location, Position,
        PostingDate and FullDetailsURL. Cards identical to an earlier one on
        all five fields are reported on stdout and left out. The columns are
        present even when no job was found.
    """

    print('STAGE 2: GATHERING ALL INFORMATION FROM THE JOB CARDS ------------------------------\n')

    jobs_card = soup.find('ul', class_ = 'jobs-search__results-list')

    columns = ['Company', 'Location', 'Position', 'PostingDate', 'FullDetailsURL']
    jobs = []
    seen_jobs = set()    # tuples of the five fields, for constant-time duplicate checks
    repeated_jobs = 0

    for li in jobs_card.find_all('li'):
        full_details_url = li.find('a').get('href').replace('https://pt.linkedin', 'https://linkedin')
        position = li.find('h3', class_ = 'base-search-card__title').text.strip()
        company = li.find('h4', class_ = 'base-search-card__subtitle').text.strip()
        metadata = li.find('div', class_ = 'base-search-card__metadata')
        location = metadata.find('span', class_ = 'job-search-card__location').text.strip()
        posting_date = metadata.find('time').get('datetime')

        job_info = {'Company': company,
                    'Location': location,
                    'Position': position,
                    'PostingDate': posting_date,
                    # Drop the tracking suffix; split() leaves the URL intact when
                    # there is none (slicing at find() == -1 would cut the last character)
                    'FullDetailsURL': full_details_url.split('?refId=', 1)[0]}

        job_key = tuple(job_info[column] for column in columns)

        if job_key in seen_jobs:
            if repeated_jobs == 0:
                print('Repeated Jobs:')
            repeated_jobs += 1
            print(job_info['Company'], '|', job_info['Position'])
            continue

        seen_jobs.add(job_key)
        jobs.append(job_info)

    df_extr = pd.DataFrame(jobs, columns = columns)

    print(f"\n{len(jobs)} unique jobs found. Full info now loaded to a dataframe.")
    print('\n------------------------------------------------------------------------------------\n')

    return df_extr

## 3. Gathering Full Job Info through the URLs

In [5]:
def _parse_job_details(soup, job_info):
    """Add qualifications, job criteria and applicants from a job page to ``job_info``.

    The three sections are parsed in that order and parsing stops at the first
    one that fails, after printing which section could not be read. Fields
    already stored in ``job_info`` are kept either way.
    """
    # The source notebook notes that a missing description block means
    # LinkedIn blocked the request; such a job keeps only its card-level fields.
    full_description = soup.find('div', class_ = 'show-more-less-html__markup')

    try:
        # Store required qualifications in a list
        job_info['AllQualifications'] = [item.text for item in full_description.find_all('li')]
    except Exception:
        print('     Errors occurred when parsing job "Qualifications"')
        return

    try:
        # Job Criteria List (Employment Type, Industries, Job Function, Seniority Level)
        criteria = soup.find('ul', class_ = 'description__job-criteria-list')
        for box in criteria.find_all('li', class_ = 'description__job-criteria-item'):
            criteria_header = box.find('h3').text.strip()
            criteria_text = box.find('span').text.strip()
            job_info[criteria_header] = criteria_text
    except Exception:
        print('     Errors occurred when parsing job "Criteria"')
        return

    # Get the info regarding current applicants
    # If we were logged into Linkedin, we would have the exact number for those jobs under 25 applicants
    applicants = soup.find('span', class_ = 'num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet')
    if applicants is None:
        applicants = soup.find('figcaption', class_ = 'num-applicants__caption')
    if applicants is None:
        print('     Errors occurred when parsing job "Applicants"')
        return
    job_info['Applicants'] = applicants.text.strip()


def gather_full_info(df_extr):
    """Fetch every job's detail page and return the enriched results.

    For each row of ``df_extr`` the page at ``FullDetailsURL`` is downloaded
    and parsed, with a random pause of 1 to 4 seconds between requests. The
    result is also written to the CSV file named by ``next_file_to_write()``.

    Parameters
    ----------
    df_extr : pandas.DataFrame
        Job cards with at least the columns Company, Position and FullDetailsURL.

    Returns
    -------
    pandas.DataFrame
        The input rows plus ResultsVersion, ResultsDate, AllQualifications,
        Applicants and one column per job-criteria header found on the pages.
        Fields that could not be parsed are left empty (NaN).
    """

    print("STAGE 3: GATHERING FULL JOB INFO THROUGH THE URL'S ---------------------------------\n")

    # Fixed columns come first; job-criteria columns found on the pages follow
    base_columns = ['ResultsVersion', 'ResultsDate', 'Company', 'Location',
                    'Position', 'PostingDate', 'FullDetailsURL', 'AllQualifications', 'Applicants']

    print('Fetching results:\n')
    print('JobID | JobTitle | Company')

    # No earlier results are loaded in this function, so each run is version 1
    results_version = 1

    records = []

    # Iterating over records (not df_extr[col][i]) works for any dataframe index
    for i, job_info in enumerate(df_extr.to_dict('records')):

        # Save the process datetime (day & hour) and a version ID
        job_info['ResultsVersion'] = results_version
        job_info['ResultsDate'] = datetime.now().strftime("%d/%m/%Y %Hh")

        print(i, '|', job_info['Position'], '|', job_info['Company'])

        # headers must be passed by keyword: the second positional argument of
        # requests.get is `params`, which would put the User-Agent in the query string
        job_page = requests.get(job_info['FullDetailsURL'], headers = headers)
        soup = BeautifulSoup(job_page.content, "lxml")

        _parse_job_details(soup, job_info)
        records.append(job_info)

        time.sleep(random.random() * 3 + 1) # Waiting a randomized amount of time (higher than 1 and lower than 4 secs)

    # Build the dataframe once instead of appending row by row
    extra_columns = sorted({key for record in records for key in record} - set(base_columns))
    df_full = pd.DataFrame(records, columns = base_columns + extra_columns)

    df_full.to_csv(next_file_to_write(),
                   index = False,
                   encoding = 'utf-8-sig')

    print('\n------------------------------------------------------------------------------------\n')

    return df_full


def next_file_to_write():
    '''Return the name of the next temporary results file to write.

    Looks in the current working directory for files named ``Run<number>.csv``
    and returns ``Run<max + 1>.csv``, or ``Run1.csv`` when there is none.
    Using the highest existing number rather than the file count keeps the
    name unused even when the numbering has gaps.
    '''

    working_path = os.getcwd()
    prefix = 'Run'
    highest = 0

    for file in os.listdir(working_path):
        if not isfile(join(working_path, file)):
            continue

        stem, extension = splitext(file)
        if extension != '.csv' or not stem.startswith(prefix):
            continue

        # Only names of the exact form Run<number>.csv take part in the numbering
        run_number = stem[len(prefix):]
        if run_number.isdecimal():
            highest = max(highest, int(run_number))

    # Next File to write
    return prefix + str(highest + 1) + '.csv'

### Running the whole process

In [6]:
# INPUTS -------------------------------------------------------
# Search keywords: the company or the job you want to find results for.
# The value carries its own double quotes, so they are sent as part of the
# query (presumably to ask for an exact-phrase match; confirm on LinkedIn).
keywords_in = '"Data Scientist"'
# Location to search in
location_in = 'Lisbon'
# --------------------------------------------------------------

In [7]:
# STAGE 1: build the search URL from the inputs and load the expanded results page
soup = gather_full_html(build_url(keywords_in, location_in))

STAGE 1: GATHERING THE PAGE FULL HTML CODE -----------------------------------------

Total Number of Jobs Advertised in the Top: 82

Number of Jobs Loaded in the Browser:
  @ Opening Page: 25
  After 1 Scroll: 50
  After 2 Scrolls: 75
  After 3 Scrolls: 82

Browser is now closed.

------------------------------------------------------------------------------------



In [8]:
# STAGE 2: parse the job cards into a dataframe and preview the first rows
df_extr = gather_job_card_info(soup)
df_extr.head()

STAGE 2: GATHERING ALL INFORMATION FROM THE JOB CARDS ------------------------------


82 unique jobs found. Full info now loaded to a dataframe.

------------------------------------------------------------------------------------



Unnamed: 0,Company,Location,Position,PostingDate,FullDetailsURL
0,Siemens,"Lisbon, Lisbon, Portugal",Data Scientist (m/f/d),2021-08-08,https://linkedin.com/jobs/view/data-scientist-...
1,McKinsey & Company,"Lisbon, Lisbon, Portugal",Data Scientist,2021-09-04,https://linkedin.com/jobs/view/data-scientist-...
2,Nokia,"Amadora, Lisbon, Portugal",Data Scientist (Traineeship),2021-07-13,https://linkedin.com/jobs/view/data-scientist-...
3,Siemens,"Lisbon, Lisbon, Portugal",Junior Data Scientist (m/f/d),2021-08-01,https://linkedin.com/jobs/view/junior-data-sci...
4,SGS,"Lisboa, Lisbon, Portugal",Data Scientist,2021-07-25,https://linkedin.com/jobs/view/data-scientist-...


In [9]:
# STAGE 3: fetch each job's detail page (this also writes a Run<N>.csv file) and preview 3 rows
df_full = gather_full_info(df_extr)
df_full.head(3)

STAGE 3: GATHERING FULL JOB INFO THROUGH THE URL'S ---------------------------------

Fetching results:

JobID | JobTitle | Company
0 | Data Scientist (m/f/d) | Siemens
1 | Data Scientist | McKinsey & Company
2 | Data Scientist (Traineeship) | Nokia
3 | Junior Data Scientist (m/f/d) | Siemens
4 | Data Scientist | SGS
5 | Data Scientist | Winning
6 | Data Scientist, Portugal | CI&T
7 | Data Scientist | Feedzai
8 | Data Scientist (M/F) - Lisbon | Capgemini Engineering
9 | Consultant, Analytics, Data and Services | Mastercard
10 | Data Scientist (PhD) | CGI
11 | Data Scientist | BOLD by Devoteam
12 | Senior Data Scientist | YData
13 | Quantitative Research - Data Scientist | BNP Paribas CIB
14 | Data Scientist | BOLD by Devoteam
15 | Senior Data Scientist | Nokia
16 | Product Data Scientist | CASAFARI
17 | Data Scientist - m/f | Michael Page
18 | Data Scientist | Worldpanel by Kantar
19 | Senior Data Scientist | Tripadvisor
20 | Data Scientist | BOLD by Devoteam
21 | Data Scientist | BOLD

Unnamed: 0,ResultsVersion,ResultsDate,Company,Location,Position,PostingDate,FullDetailsURL,AllQualifications,Applicants,Employment type,Industries,Job function,Seniority level
0,1,07/09/2021 15h,Siemens,"Lisbon, Lisbon, Portugal",Data Scientist (m/f/d),2021-08-08,https://linkedin.com/jobs/view/data-scientist-...,"[Work with large, complex data sets and applyi...",Be among the first 25 applicants,Full-time,Electrical/Electronic Manufacturing,Information Technology,Mid-Senior level
1,1,07/09/2021 15h,McKinsey & Company,"Lisbon, Lisbon, Portugal",Data Scientist,2021-09-04,https://linkedin.com/jobs/view/data-scientist-...,[Master’s degree in a quantitative field like ...,Be among the first 25 applicants,Full-time,"Automotive, Aviation & Aerospace, and Manageme...","Consulting, Information Technology, and Marketing",Associate
2,1,07/09/2021 15h,Nokia,"Amadora, Lisbon, Portugal",Data Scientist (Traineeship),2021-07-13,https://linkedin.com/jobs/view/data-scientist-...,"[Empowered, open environment where everyone is...",49 applicants,Full-time,Information Technology and Services and Teleco...,Engineering and Information Technology,Not Applicable


In [10]:
# Total wall-clock time since the first cell ran, in minutes with one decimal
end = time.time()
print(f"Run Time: {(end - start) / 60:.1f} min")

Run Time: 5.1 min


Logic to update files:
* "01. DataExtraction":
  1. checks how many temporary files already exist
  1. writes a new file with index max + 1
* "02. DataValidation" deletes all temporary files from the folder at the end of the day