In [1]:
# Data Wrangling & Other General Use
import pandas as pd
import numpy as np
import time
import random

# For scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib
from urllib import parse


# For debugging
from icecream import ic
# Prefix icecream's debug output with 'Debug | '
ic.configureOutput(prefix = 'Debug | ')

# Browser-like User-Agent, used by the `requests` call that fetches the job detail pages
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}

## 1. Gathering the page's full HTML code (w/ Selenium)

In [2]:
def build_url(keywords_in, location_in):
    """Build the LinkedIn job-search URL for the given keywords and location.

    The search always starts at the first result ('position' 1, 'pageNum' 0);
    the values are URL-encoded by `urllib.parse.urlencode`.
    """
    search_params = {
        'keywords': keywords_in,
        'location': location_in,
        'position': 1,
        'pageNum': 0,
    }
    encoded_params = parse.urlencode(search_params)
    return f'https://www.linkedin.com/jobs/search?{encoded_params}'

In [3]:
def gather_full_html(url, driver_path = r'C:\Program Files (x86)\chromedriver.exe'):
    """Load a job-search page in Chrome and expand it until all job cards are shown.

    The page is scrolled to the bottom, and the "Show More Jobs" button is clicked
    whenever it can be found, until the "You've viewed all jobs" notification
    appears in the page. The browser is always shut down before returning,
    including when the crawl fails.

    Parameters
    ----------
    url : str
        Job-search URL to open (see `build_url`).
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path this
        notebook has always used.

    Returns
    -------
    BeautifulSoup
        The most recently parsed HTML of the page. When an expected element is
        missing (treated as LinkedIn blocking the crawl) a message is printed
        and the soup parsed so far is returned.
    """

    driver = webdriver.Chrome(driver_path)

    try:
        driver.get(url)

        # Get the number of jobs the page shows on top of the cards
        soup = BeautifulSoup(driver.page_source)

        try:
            nr_jobs = soup.find('span', class_ = 'results-context-header__job-count').text.strip()
            print(f'\nTotal Number of Jobs Advertised in the Top: {nr_jobs}\n')

            nr_jobs_initial = get_jobs_loaded(driver)
            print('Number of Jobs Loaded in the Browser:')
            print(f'  @ Opening Page: {nr_jobs_initial}')

            scrolls = 0
            buttons = 0

            # NOTE(review): this loop only ends when the notification shows up or an
            # exception is raised; there is no upper bound on the number of rounds.
            while soup.find('div', class_ = 'inline-notification see-more-jobs__viewed-all') is None:
                # Stop when a "You've viewed all jobs" card appears

                nr_jobs_loaded_init = get_jobs_loaded(driver)

                try:
                    # Click the "Show More Jobs" button
                    driver.find_element_by_xpath("//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']").click()
                    buttons += 1
                    buttons_print = 'Button' if buttons == 1 else 'Buttons'

                    # Give the browser some time to fetch the results
                    time.sleep(1)

                    # Printing the number of jobs already loaded
                    nr_jobs_loaded = get_jobs_loaded(driver)
                    if nr_jobs_loaded != nr_jobs_loaded_init:
                        print(f'  After {buttons} {buttons_print}: {nr_jobs_loaded}')

                except Exception:
                    # The button is not available (yet): scroll through the infinite
                    # scroll until the "Show More Jobs" button appears.
                    # `Exception` (not a bare except) so Ctrl-C still interrupts the crawl.
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    scrolls += 1
                    scrolls_print = 'Scroll' if scrolls == 1 else 'Scrolls'

                    time.sleep(1.2)
                    nr_jobs_loaded = get_jobs_loaded(driver)
                    if nr_jobs_loaded != nr_jobs_loaded_init:
                        print(f'  After {scrolls} {scrolls_print}: {nr_jobs_loaded}')

                # Refreshing the soup for assessment in the while loop condition
                soup = BeautifulSoup(driver.page_source)

        except Exception:
            # A missing element (job count, results list) ends up here
            print('Linkedin is blocking the crawling. Wait some more and try again.')

    finally:
        # Always end the browser session, so a failed crawl does not leave a
        # Chrome window and a chromedriver process behind
        driver.quit()
        print("\nBrowser is now closed.")

    return soup


def get_jobs_loaded(driver):
    """Count the <li> job cards currently present in the browser's results list.

    Raises AttributeError when the page has no 'jobs-search__results-list' element.
    """
    page = BeautifulSoup(driver.page_source)
    results_list = page.find('ul', class_ = 'jobs-search__results-list')
    job_cards = results_list.find_all('li')
    return len(job_cards)

In [4]:
# Search terms: a company name or a job title
# (alternative: 'Data Scientist McKinsey & Company')
keywords_in = 'McKinsey & Company'
# Where to look for those jobs
location_in = 'Lisbon'

# Build the search URL, then let the browser load every job card of the results page
search_url = build_url(keywords_in, location_in)
soup = gather_full_html(search_url)


Total Number of Jobs Advertised in the Top: 38

Number of Jobs Loaded in the Browser:
  @ Opening Page: 25
  After 1 Scroll: 38

Browser is now closed.


## 2. Gathering all information from the job cards (w/ BeautifulSoup)

In [5]:
def gather_job_card_info(soup):
    """Extract the summary shown on every job card of a search-results page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed search-results page (see `gather_full_html`).

    Returns
    -------
    pandas.DataFrame
        One row per distinct job card with the columns 'Company', 'Location',
        'Position', 'PostingDate' and 'FullDetailsURL'. The frame is empty
        (but has those columns) when the page holds no results list.
    """

    columns = ['Company', 'Location', 'Position', 'PostingDate', 'FullDetailsURL']

    jobs_card = soup.find('ul', class_ = 'jobs-search__results-list')
    if jobs_card is None:
        # E.g. the soup of a blocked crawl: nothing to parse
        print('\nNo job results list was found in the page; returning an empty dataframe.\n')
        return pd.DataFrame(columns = columns)

    jobs = []
    seen = set()   # cards already stored, for constant-time duplicate detection

    for li in jobs_card.find_all('li'):
        full_details_url = li.find('a').get('href').replace('https://pt.linkedin', 'https://linkedin')
        position = li.find('h3', class_ = 'base-search-card__title').text.strip()
        company = li.find('h4', class_ = 'base-search-card__subtitle').text.strip()
        metadata = li.find('div', class_ = 'base-search-card__metadata')
        location = metadata.find('span', class_ = 'job-search-card__location').text.strip()
        posting_date = metadata.find('time').get('datetime')

        # Drop the tracking query string. split() leaves the URL untouched when
        # '?refId=' is absent, whereas slicing with find()'s -1 would cut off
        # the last character.
        clean_url = full_details_url.split('?refId=')[0]

        job_info = {'Company': company,
                    'Location': location,
                    'Position': position,
                    'PostingDate': posting_date,
                    'FullDetailsURL': clean_url}

        job_key = tuple(job_info.values())
        if job_key not in seen:
            seen.add(job_key)
            jobs.append(job_info)

    df_extr = pd.DataFrame(jobs, columns = columns)

    print(f"\nAll {len(jobs)} jobs' information is now loaded to a dataframe.\n")

    return df_extr

In [6]:
# Parse the job cards out of the search page gathered above
df_extr = gather_job_card_info(soup)


All 38 jobs' information is now loaded to a dataframe.



In [7]:
# Preview the first rows of the extracted job cards
df_extr.head()

Unnamed: 0,Company,Location,Position,PostingDate,FullDetailsURL
0,McKinsey & Company,"Lisbon, Lisbon, Portugal",Junior Associate,2021-07-08,https://linkedin.com/jobs/view/junior-associat...
1,McKinsey & Company,"Lisbon, Lisbon, Portugal",Associate,2021-07-07,https://linkedin.com/jobs/view/associate-at-mc...
2,McKinsey & Company,"Lisbon, Lisbon, Portugal",Business Analyst,2021-07-07,https://linkedin.com/jobs/view/business-analys...
3,McKinsey & Company,"Lisbon, Lisbon, Portugal",Product Manager,2021-07-23,https://linkedin.com/jobs/view/product-manager...
4,McKinsey & Company,"Lisbon, Lisbon, Portugal",Junior Capabilities & Insights Analyst - Strat...,2021-07-20,https://linkedin.com/jobs/view/junior-capabili...


## 3. Gathering Full Job Info through the URLs

In [19]:
def _parse_office(description_text):
    """Return the office named after 'You will be based in our' ('' when the phrase is absent)."""
    pointer = 'You will be based in our'
    pointer_pos = description_text.find(pointer)
    if pointer_pos == -1:
        return ''

    office_st = description_text[pointer_pos + len(pointer):]
    # partition() keeps the whole text when the marker is missing, whereas slicing
    # with find()'s -1 would silently drop the last character
    office = office_st.partition('office')[0].strip()
    # Sometimes the description is customized and 'office' won't appear
    if len(office) > 100:
        office = office_st.partition('.')[0].strip()
    return office


def _parse_applicants(soup):
    """Return the applicants text of a job page, or None when neither known element exists."""
    candidates = (('span', 'num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet'),
                  ('figure', 'num-applicants__figure topcard__flavor--metadata topcard__flavor--bullet'))
    for tag_name, css_class in candidates:
        tag = soup.find(tag_name, class_ = css_class)
        if tag is not None:
            return tag.text.strip()
    return None


def _add_job_details(job_info, soup, full_description):
    """Fill `job_info` in place with the fields parsed from a job detail page.

    Each field is parsed independently, so a failure in one of them does not
    prevent the others from being stored.
    """
    success = True

    # Store required qualifications in a list
    job_info['AllQualifications'] = [item.text for item in full_description.find_all('li')]

    # Store the office indications (some jobs actually have different offices in the description)
    job_info['Office'] = _parse_office(full_description.text)

    # Job Criteria List (Employment Type, Industries, Job Function, Seniority Level)
    try:
        for box in soup.find_all('li', class_ = 'description__job-criteria-item'):
            criteria_header = box.find('h3').text.strip()
            criteria_text = box.find('span').text.strip()
            job_info[criteria_header] = criteria_text
    except AttributeError:
        # A criteria box without the expected <h3>/<span>
        success = False
        print('  Errors occurred when parsing job "Criteria"')

    # Get a rough estimate of current applicants (if we were logged into linkedin, we would have the exact number)
    applicants = _parse_applicants(soup)
    if applicants is None:
        success = False
        print('  Errors occurred when parsing job "Applicants"')
    else:
        job_info['Applicants'] = applicants

    if success:
        print('  Success.')


def gather_full_info(df_extr, csv_path = 'FullInfoDataframe.csv'):
    """Fetch the detail page of every job not stored yet and add it to the CSV history.

    Jobs whose 'FullDetailsURL' is already in the CSV are skipped without any
    request. For the others the detail page is downloaded and the qualifications,
    office, job criteria and applicants fields are added to the card information.
    If a page comes back without the job description, LinkedIn is assumed to be
    blocking the requests: fetching stops, that job is not stored (so it is
    retried on the next run) and the progress made so far is saved.

    Parameters
    ----------
    df_extr : pandas.DataFrame
        Job cards as returned by `gather_job_card_info`; the 'FullDetailsURL',
        'Position' and 'Location' columns are required.
    csv_path : str, optional
        CSV file holding the jobs gathered in previous runs; it is created when
        missing and rewritten at the end of the run.

    Returns
    -------
    pandas.DataFrame
        The previously stored jobs followed by the jobs fetched in this run.
    """

    try:
        # Reading previous days info from csv file
        df_full = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First instance of the dataframe. Any other read error is left to
        # propagate, so a damaged history file is never silently overwritten.
        df_full = pd.DataFrame(columns = ['Company', 'Location', 'Position', 'PostingDate', 'FullDetailsURL', 'AllQualifications', 'Office', 'Applicants'])

    known_urls = set(df_full['FullDetailsURL'])
    new_rows = []

    print('Fetching results:\n')
    print('JobID | JobTitle | Location')

    for i in range(len(df_extr)):
        # Positional access, so any index on df_extr works
        job_info = df_extr.iloc[i].to_dict()
        job_url = job_info['FullDetailsURL']

        if job_url in known_urls:
            continue

        print(i, '|', job_info['Position'], '|', job_info['Location'])

        # `headers` must be passed by keyword: the second positional argument of
        # requests.get is `params` (the query string), not the HTTP headers
        job_page = requests.get(job_url, headers = headers)
        soup = BeautifulSoup(job_page.content)

        # If the description is missing, we know Linkedin blocked the request
        full_description = soup.find('div', class_ = 'show-more-less-html__markup')
        if full_description is None:
            print('  LINKEDIN BLOCKED THE REQUEST. Stopping here; the remaining jobs will be retried on the next run.')
            break

        _add_job_details(job_info, soup, full_description)
        new_rows.append(job_info)
        known_urls.add(job_url)

        # Waiting a randomized amount of time (higher than 1 and lower than 4 secs)
        # between requests; skipped jobs cost no request, hence no wait
        time.sleep(random.random() * 3 + 1)

    if new_rows:
        # Add all the new jobs at once (DataFrame.append no longer exists in pandas 2)
        df_new = pd.DataFrame(new_rows)
        if df_full.empty:
            # Keep the stored column order, followed by any new criteria columns
            all_columns = list(df_full.columns) + [col for col in df_new.columns if col not in df_full.columns]
            df_full = df_new.reindex(columns = all_columns)
        else:
            df_full = pd.concat([df_full, df_new], ignore_index = True)

    df_full.to_csv(csv_path,
                   index = False)

    print('\nNumber of Jobs:', len(df_full['FullDetailsURL'].unique()))
    print()

    return df_full

In [None]:
# Fetch the full details of the extracted jobs; without this call `df_full`
# is undefined when the notebook is run top to bottom on a fresh kernel
df_full = gather_full_info(df_extr)
df_full.head(3)

* Criar job que corra este código diariamente, para tirar os dados de:
  + McKinsey & Company Lisboa - perceber o perfil de recrutamento neste office
  + Data Scientist McKinsey & Company - Mundialmente - perceber a stack tecnológica usada