# IMPORTING THE REQUIRED LIBRARIES

First, we begin by importing the required libraries.

In [1]:
# Importing the standard libraries
import getpass
import os
import time

import numpy as np
import pandas as pd

# Importing Selenium library and relevant classes
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# USING SELENIUM, SETTING UP LINKEDIN, AND LOGGING IN

Next, we will install the ```Selenium``` webdriver, load the LinkedIn webpage, and sign in.

In [2]:
# Installing the Selenium Chrome web driver
# No external files need to be downloaded with this method of utilizing Selenium
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Fetching the LinkedIn login page
driver.get('https://www.linkedin.com/login')
time.sleep(1)

# Maximize Window
driver.maximize_window()
driver.switch_to.window(driver.current_window_handle)
driver.implicitly_wait(10)

# Accepting page cookies when the consent banner is present.
# find_elements returns an empty list instead of raising, so a missing banner
# does not abort the login.
cookie_buttons = driver.find_elements("xpath", "/html/body/div/main/div[1]/div/section/div/div[2]/button[2]")
if cookie_buttons:
    cookie_buttons[0].click()

# User credentials to be used for the sign in.
# Credentials must never be stored in the notebook: they are read from the
# LINKEDIN_USERNAME / LINKEDIN_PASSWORD environment variables, and the user is
# prompted for any value that is not set (the password prompt does not echo).
user_name = os.environ.get("LINKEDIN_USERNAME") or input("LinkedIn username: ")
password = os.environ.get("LINKEDIN_PASSWORD") or getpass.getpass("LinkedIn password: ")

# Filling in the LinkedIn account username
driver.find_element("xpath", '//*[@id="username"]').send_keys(user_name)
time.sleep(1)

# Filling in the LinkedIn account password
driver.find_element("xpath", '//*[@id="password"]').send_keys(password)
time.sleep(1)

# Clicking on the login button
driver.find_element("xpath",  '//*[@id="organic-div"]/form/div[3]/button').click()

# Raising the implicit wait to 30 seconds for all subsequent element lookups
driver.implicitly_wait(30)

# LOADING THE LINKEDIN JOB SEARCH PAGE

The specific LinkedIn job search page is loaded. Again, we are scraping LinkedIn for all Data Scientist jobs in the Barcelona area.

In [3]:
# Address of the LinkedIn job search results page, split by query parameter
# so each part of the search is easy to read
search_url = (
    "https://www.linkedin.com/jobs/search/"
    "?currentJobId=3270287326"
    "&geoId=107025191"
    "&keywords=data%20scientist%2C%20barcelona"
    "&location=Barcelona%2C%20Catalonia%2C%20Spain"
    "&refresh=true"
)

# Loading the results page and giving it a moment to render
driver.get(search_url)
time.sleep(1)

print('You are ready and set to begin scraping!')

You are ready and set to begin scraping!


# SCRAPING LINKEDIN FOR THE DIFFERENT JOB DATA POSTED

Next, we scrape the result pages and collect the data for each relevant job posting.

In [5]:
# Result containers. Exactly one value is appended to each list per job card,
# so all lists keep the same length and can later be combined into a DataFrame.
job_title = []
company_name = []
company_location = []
state = []
posting_date = []
links = []
number_of_applicants = []
promoted = []
workspace = []
seniority = []
employment_type = []
industry = []
python_required = []
apply_through_linkedin = []
number_of_employees = []
number_of_followers = [] # BONUS QUESTION

# Number of result pages to walk through. 1 reproduces the single-page run;
# raise it to scrape more pages (the loop stops early when no next page exists).
PAGES_TO_SCRAPE = 1

JOB_URL_PREFIX = "https://www.linkedin.com/jobs/view"
INSIGHTS_SELECTOR = '.mt5.mb2 ul li span'
DESCRIPTION_SELECTOR = ".jobs-box__html-content.jobs-description-content__text.t-14.t-normal span"
COMPANY_CARD_SELECTOR = '.jobs-company.jobs-box--fadein.mb4.jobs-company--two-pane'


def _first_text(root, by, selector):
    """Return the text of the first element under ``root`` matching ``selector``.

    Returns NaN when the lookup fails, so the caller can still record a value
    for the current job and keep all result lists aligned.
    """
    try:
        return root.find_element(by, selector).text
    except Exception:  # unlike a bare except, this lets KeyboardInterrupt through
        return np.nan


def _text_at(elements, index):
    """Return ``elements[index].text``, or NaN when that position cannot be read."""
    try:
        return elements[index].text
    except Exception:
        return np.nan


# Navigating the pages of job postings on LinkedIn
print('LinkedIn job Data is being scraped, please wait...')

# `page` is the number of the NEXT page to open, hence the range starting at 2
for page in range(2, PAGES_TO_SCRAPE + 2):
    time.sleep(1)
    jobs_block = driver.find_element(By.CSS_SELECTOR, ".jobs-search-results-list")
    jobs_list = jobs_block.find_elements(By.CSS_SELECTOR, '.jobs-search-results__list-item')

    for job in jobs_list:
        # Recording exactly one job URL per card (NaN when the card has none)
        hrefs = [anchor.get_attribute('href') for anchor in job.find_elements(By.TAG_NAME, 'a')]
        links.append(next((href for href in hrefs if str(href).startswith(JOB_URL_PREFIX)), np.nan))

        # Check whether the job is promoted or not
        promoted.append('Promoted' in job.text)

        # Selecting the title of the job card
        job.find_element(By.CSS_SELECTOR, '.job-card-container__link.job-card-list__title').click()
        time.sleep(1)

        # Moving to the right pane in order to obtain the other job data
        details = driver.find_element(By.CSS_SELECTOR, '.job-view-layout.jobs-details')

        # Fields read directly from the top card of the right pane
        job_title.append(_first_text(details, By.CSS_SELECTOR, ".t-24"))
        company_name.append(_first_text(details, By.CLASS_NAME, "jobs-unified-top-card__company-name"))
        company_location.append(_first_text(details, By.CLASS_NAME, "jobs-unified-top-card__bullet"))
        posting_date.append(_first_text(details, By.CSS_SELECTOR, ".jobs-unified-top-card__posted-date"))
        number_of_applicants.append(_first_text(details, By.CSS_SELECTOR, ".jobs-unified-top-card__applicant-count"))
        workspace.append(_first_text(details, By.CLASS_NAME, "jobs-unified-top-card__workplace-type"))

        # The insight spans are looked up once and indexed for several fields
        try:
            insights = details.find_elements(By.CSS_SELECTOR, INSIGHTS_SELECTOR)
        except Exception:
            insights = []

        # Seniority and employment type share the first span; the raw text is
        # split into the two values during the data-handling step
        seniority.append(_text_at(insights, 0))
        employment_type.append(_text_at(insights, 0))
        # The second span feeds both the industry and the number of employees
        industry.append(_text_at(insights, 1))
        # The last span carries the status of the application lifecycle
        state.append(_text_at(insights, -1))

        # Raw label of the first button; compared with "Easy Apply" later on
        apply_through_linkedin.append(_first_text(details, By.CSS_SELECTOR, ".artdeco-button__text"))

        # Find if Python is mentioned in the job description (case-insensitive).
        # One value per job: NaN when no description text could be read.
        try:
            description_spans = driver.find_elements(By.CSS_SELECTOR, DESCRIPTION_SELECTOR)
            description = " ".join(span.text for span in description_spans)
            python_required.append('python' in description.lower() if description else np.nan)
        except Exception:
            python_required.append(np.nan)

        # Scrolling to the company card at the bottom of the right pane
        company_box = None
        try:
            company_card = driver.find_element(By.CSS_SELECTOR, COMPANY_CARD_SELECTOR)
            driver.execute_script("arguments[0].scrollIntoView();", company_card)
            company_box = driver.find_element(By.CSS_SELECTOR, ".jobs-company__box")
        except Exception:
            pass  # no company card for this job; followers are recorded as NaN

        # Getting the company number of followers -> BONUS QUESTION
        if company_box is None:
            number_of_followers.append(np.nan)
        else:
            number_of_followers.append(_first_text(company_box, By.CSS_SELECTOR, ".artdeco-entity-lockup__subtitle"))

        # Scrolling the left list back to the current card before moving on
        driver.execute_script("arguments[0].scrollIntoView();", job)

    print(f'Collecting the links in the page: {page-1}')

    # go to next page; stop when the pagination button does not exist
    next_page_buttons = driver.find_elements("xpath", f"//button[@aria-label='Page {page}']")
    if not next_page_buttons:
        break
    next_page_buttons[0].click()

    time.sleep(1)

# Only real URLs are counted; NaN placeholders are not links
links_found = sum(isinstance(link, str) for link in links)
print('Done! In total, ' + str(links_found) + ' links for job offers were found!')

LinkedIn job Data is being scraped, please wait...
Collecting the links in the page: 1
Done! In total, 25 links for job offers were found!


# HANDLING THE DATA OBTAINED

Now we have all the data we need. However, before creating the ```DataFrame```, each field must be converted to the data type outlined at the beginning of the documentation.
To do this, some fields stay as ```object``` but their strings need to be cleaned of any unnecessary text, while other fields need to be converted to ```int``` or ```bool```.

In [6]:
# Mapping the scraped status label of each job onto the dataset's categories;
# any label other than the two known ones falls into 'Others'
state_new = [
    'On-going' if label == 'Actively recruiting'
    else 'Early Applications' if label == 'Early Applicant'
    else 'Others'
    for label in state
]

# Dealing with the number of applicants
number_of_applicants_new = []
for applicant in number_of_applicants:
    # Missing values were recorded as NaN (not a string); pass them through
    if not isinstance(applicant, str):
        number_of_applicants_new.append(np.nan)
        continue

    # Use the first integer token of the label, so a label that does not start
    # with a number is parsed instead of raising ValueError
    tokens = (token.replace(',', '') for token in applicant.split())
    count = next((int(token) for token in tokens if token.isdecimal()), None)

    if count is None:
        # No number anywhere in the label
        number_of_applicants_new.append(np.nan)
    else:
        # Counts below 25 are reported as 25
        number_of_applicants_new.append(max(count, 25))

# Dealing with seniority and employment type
# Each raw entry has the form "<employment type> · <seniority>"; the seniority
# part may be absent, and the whole entry is NaN when nothing was scraped.
seniority_new = []
employment_type = []
for element in seniority:
    # Non-string entries (NaN) yield no parts instead of raising AttributeError
    parts = element.split(' · ') if isinstance(element, str) else []
    employment_type.append(parts[0] if parts else np.nan)
    seniority_new.append(parts[1] if len(parts) > 1 else np.nan)

# Dealing with the industry the job is in
# The industry is the second ' · '-separated part of the raw entry; entries
# that are missing (NaN) or have no second part are recorded as NaN.
industry_new = []
for element in industry:
    parts = element.split(' · ') if isinstance(element, str) else []
    industry_new.append(parts[1] if len(parts) > 1 else np.nan)

# Dealing with the number of employees of the company
# The first whitespace-separated token of the part before ' · ' is taken as the
# company-size token; missing (NaN) entries yield an empty token.
employees_range_data = []
number_of_employees = []
for element in industry:
    if isinstance(element, str):
        employees_range_data.append(element.split(' · ')[0].split(' ')[0])
    else:
        employees_range_data.append('')

for element_x in employees_range_data:
    # Keep the upper bound of an "a-b" range, drop a trailing '+' and the
    # thousands separators
    value_to_append = element_x.split('-')[-1].split('+')[0].replace(',', '')
    # Tokens that are not a plain number become NaN instead of raising ValueError
    number_of_employees.append(int(value_to_append) if value_to_append.isdecimal() else np.nan)

# Flagging each job with True when its button label is exactly "Easy Apply"
# (i.e. the application goes through LinkedIn) and False otherwise
apply_through_linkedin_new = [label == "Easy Apply" for label in apply_through_linkedin]

# Dealing with the company number of followers --> BONUS
# The first whitespace-separated token is converted to an integer after
# removing thousands separators.
number_of_followers_new = []
for element in number_of_followers:
    # Handle failures per element so one bad value cannot cut the list short;
    # missing or unparsable values are kept as NaN to preserve row alignment
    try:
        number_of_followers_new.append(int(element.split()[0].replace(',', '')))
    except (AttributeError, IndexError, ValueError):
        number_of_followers_new.append(np.nan)

# MERGING THE DATA INTO A DATAFRAME

Now that we have the data ready, we can go ahead and create the ```DataFrame```!

In [12]:
# Collecting the cleaned lists under their column names, one column per line
data = {
    'Job Title': job_title,
    'Company Name': company_name,
    'Company Location': company_location,
    'State': state_new,
    'Posting Date': posting_date,
    'Offer URL': links,
    'Number of Applicants': number_of_applicants_new,
    'Promoted': promoted,
    'Workspace': workspace,
    'Seniority': seniority_new,
    'Employment Type': employment_type,
    'Industry': industry_new,
    'Python Required': python_required,
    'Application through LinkedIn': apply_through_linkedin_new,
    'Number of employees': number_of_employees,
}

# Building the DataFrame from the column mapping
df = pd.DataFrame(data)

# Reporting how many rows and columns were produced
n_rows, n_cols = df.shape
print("The size of our resulting DataFrame is", n_rows, "rows &", n_cols, "columns!")

# Showing the first 5 rows as the cell output
print("Below we can see the first instances of our DataFrame:")
df.head(5)

The size of our resulting DataFrame is 25 rows & 15 columns!
Below we can see the first instances of our DataFrame:


Unnamed: 0,Job Title,Company Name,Company Location,State,Posting Date,Offer URL,Number of Applicants,Promoted,Workspace,Seniority,Employment Type,Industry,Python Required,Application through LinkedIn,Number of employees
0,Tech Hub - Data Scientist - Catman,Media Markt Iberia,"El Prat de Llobregat, Catalonia, Spain",Others,1 month ago,https://www.linkedin.com/jobs/view/3370446510/...,25.0,False,Hybrid,Entry level,Contract,Computers and Electronics Manufacturing,True,False,10000
1,B2B Sales Specialist (Spanish-speaking),Venuu,"Barcelona, Catalonia, Spain",Others,8 minutes ago,https://www.linkedin.com/jobs/view/3374503275/...,29.0,True,Hybrid,,Full-time,,True,False,50
2,Kubernetes Platform Engineer Focusing On Adoption,Roche,"Barcelona, Catalonia, Spain",On-going,1 week ago,https://www.linkedin.com/jobs/view/3373767266/...,25.0,True,On-site,Associate,Full-time,Biotechnology Research,True,False,10001
3,Data Scientist (m/f/d),ZF Group,"Barcelona, Catalonia, Spain",On-going,1 day ago,https://www.linkedin.com/jobs/view/3387401304/...,41.0,False,Hybrid,Entry level,Full-time,Motor Vehicle Manufacturing,True,False,10001
4,Data Scientist,Product Madness 📱🎮,"Barcelona, Catalonia, Spain",On-going,1 day ago,https://www.linkedin.com/jobs/view/3361150448/...,128.0,False,Hybrid,Entry level,Full-time,Computer Games,True,False,1000


In [11]:
# Distinct employment types that were collected
{*employment_type}

{'Contract', 'Full-time'}

Finally, we can perform a sanity check on the data types of the columns in our ```DataFrame```.

In [None]:
# Displaying the data type of every column in the DataFrame
df.dtypes

Job Title                        object
Company Name                     object
Company Location                 object
State                            object
Posting Date                     object
Offer URL                        object
Number of Applicants            float64
Promoted                           bool
Workspace                        object
Seniority                        object
Employment Type                  object
Industry                         object
Python Required                    bool
Application through LinkedIn       bool
Number of employees               int64
Number of followers               int64
dtype: object

In [9]:
# Counting the entries held in the number_of_employees list
len(number_of_employees)

25