## Imports

In [1]:
import getpass
import os
import random
import time
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

## Initialising the driver + logging into LinkedIn

In [2]:
class _BrowserSession:
    """Callable handle for the notebook's single Selenium browser session.

    The rest of the notebook uses the name ``driver`` in two ways: it calls
    ``driver()`` to open a browser and then calls WebDriver methods on the same
    name (``driver.get(...)``, ``driver.find_element(...)``, ``driver.quit()``).
    A plain function cannot satisfy the second use: the WebDriver it creates is
    only a local variable and is lost on return. This handle supports both:
    calling it starts a fresh session, and any other attribute access is
    forwarded to the session opened most recently.
    """

    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    # Chromium-based browser driven through chromedriver (machine-specific path).
    _BINARY_LOCATION = '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'

    def __init__(self):
        self._browser = None

    def __call__(self):
        """Start a new headless, incognito browser session and return it.

        Returns
        -------
        selenium.webdriver.Chrome
            The freshly created WebDriver; it is also remembered so that
            ``driver.<attr>`` resolves against it afterwards.
        """
        service = Service(ChromeDriverManager().install())
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument("headless")
        options.add_argument(f'user-agent={self._USER_AGENT}')
        options.add_argument("--window-size=1920,1080")
        options.binary_location = self._BINARY_LOCATION
        self._browser = webdriver.Chrome(service=service, options=options)
        return self._browser

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. for WebDriver attributes.
        # Read through __dict__ so a half-initialised instance cannot recurse.
        browser = self.__dict__.get('_browser')
        if browser is None:
            raise AttributeError(
                f"no browser session is open; call driver() before using driver.{name}"
            )
        return getattr(browser, name)


driver = _BrowserSession()

def linkedin_login():
    """Prompt for LinkedIn credentials and submit the home-page sign-in form.

    The e-mail and password are both read with ``getpass`` so they are not
    echoed. Uses the module-level ``driver`` browser session, which must
    already be open.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the sign-in form is not available within 10 seconds.
    """
    mail = getpass.getpass('Login email? ')
    pw = getpass.getpass('Password? ')
    driver.get('https://www.linkedin.com')

    # The form may not be in the DOM the instant get() returns, so wait for it
    # instead of looking it up immediately.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'session_key'))).send_keys(mail)
    driver.find_element(By.ID, 'session_password').send_keys(pw)
    wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'sign-in-form__submit-button'))
    ).click()

## Data analyst positions in Barcelona published last month

In [37]:
def data_analyst(keywords='data analyst'):
    """Scrape LinkedIn job postings in Barcelona for a search term.

    Opens a browser with ``driver()``, signs in with ``linkedin_login()``, then
    walks the search results at offsets 0, 25, ..., 975. On each results page
    every job card is scrolled into view and clicked, and the posting is kept
    when its title contains "data" or "datos" (case-insensitive).

    The query uses ``f_TPR=r2592000`` (presumably "posted in the last
    2,592,000 s", i.e. 30 days) and a fixed Barcelona ``geoId``/``location``.

    Parameters
    ----------
    keywords : str, default 'data analyst'
        Search term; it is URL-encoded before being put in the query string.

    Returns
    -------
    pandas.DataFrame
        One row per kept posting with columns ``title``, ``company``,
        ``location``, ``modality`` and ``description``. A field that cannot be
        found on the page holds the string ``'Unknown'``.
    """
    columns = ['title', 'company', 'location', 'modality', 'description']
    search_url = (
        'https://www.linkedin.com/jobs/search/?f_TPR=r2592000&geoId=107025191'
        f'&keywords={quote(keywords)}'
        '&location=Barcelona%2C%20Catalonia%2C%20Spain'
    )
    location_selector = 'body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div.jobs-search-two-pane__wrapper > div > section.jobs-search__right-rail > div > div > div:nth-child(1) > div > div:nth-child(1) > div > div.jobs-unified-top-card__content--two-pane > div.mt2 > span.jobs-unified-top-card__subtitle-primary-grouping.mr2.t-black > span.jobs-unified-top-card__bullet'
    modality_selector = 'span.jobs-unified-top-card__subtitle-primary-grouping.mr2.t-black > span.jobs-unified-top-card__workplace-type'
    description_selector = '#job-details > span'

    def first_text(soup, selector, default='Unknown'):
        """Return the text of the first node matching ``selector``, else ``default``."""
        matches = soup.select(selector)
        return matches[0].get_text() if matches else default

    records = []
    driver()
    try:
        linkedin_login()
        for offset in tqdm(range(0, 976, 25)):
            driver.get(f'{search_url}&start={offset}')
            time.sleep(random.randint(3, 5))
            seen = set()

            # Cards are loaded lazily while scrolling, so keep re-querying until
            # a pass finds nothing new. Stopping on "no new cards" rather than
            # on set equality prevents an endless loop when a previously seen
            # card has been dropped from the DOM.
            while True:
                cards = driver.find_elements(By.CSS_SELECTOR, ".job-card-list__title")
                # Keep DOM order so the page is scrolled steadily downwards.
                new_cards = [card for card in cards if card not in seen]
                if not new_cards:
                    break
                for card in new_cards:
                    seen.add(card)
                    try:
                        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth'});", card)
                        time.sleep(random.randint(1, 3))
                        card.click()
                        time.sleep(random.randint(2, 5))
                        soup = BeautifulSoup(driver.page_source, "html.parser")

                        title = driver.find_element(By.XPATH, "//*[starts-with(@id, 'ember')]/h2").text
                        lowered = title.lower()
                        if 'data' not in lowered and 'datos' not in lowered:
                            continue

                        try:
                            company = driver.find_element(By.XPATH, "//*[contains(@class, 'ember-view t-black t-normal')]").text
                        except NoSuchElementException:
                            company = 'Unknown'

                        # Append the whole row at once: a failure part-way
                        # through a posting can then never leave the columns
                        # with different lengths.
                        records.append({
                            'title': title,
                            'company': company,
                            'location': first_text(soup, location_selector).strip(),
                            'modality': first_text(soup, modality_selector),
                            'description': first_text(soup, description_selector),
                        })
                    except (NoSuchElementException, StaleElementReferenceException):
                        # Best effort: keep a screenshot for debugging and move on.
                        driver.save_screenshot(f"{random.randint(0, 100)}.png")
                time.sleep(random.randint(1, 2))
    finally:
        # Always release the browser, even when the scrape aborts.
        driver.quit()

    return pd.DataFrame(records, columns=columns)



In [None]:
# Runs the whole "data analyst" scrape: prompts for LinkedIn credentials and then
# walks every results page with randomised pauses, so this cell takes a long time.
analyst = data_analyst()

## Data scientist positions in Barcelona published last month

In [38]:
def data_scientist(keywords='data scientist'):
    """Scrape LinkedIn job postings in Barcelona for a search term.

    Opens a browser with ``driver()``, signs in with ``linkedin_login()``, then
    walks the search results at offsets 0, 25, ..., 975. On each results page
    every job card is scrolled into view and clicked, and the posting is kept
    when its title contains "data" or "datos" (case-insensitive).

    The query uses ``f_TPR=r2592000`` (presumably "posted in the last
    2,592,000 s", i.e. 30 days) and a fixed Barcelona ``geoId``/``location``.

    Parameters
    ----------
    keywords : str, default 'data scientist'
        Search term; it is URL-encoded before being put in the query string.

    Returns
    -------
    pandas.DataFrame
        One row per kept posting with columns ``title``, ``company``,
        ``location``, ``modality`` and ``description``. A field that cannot be
        found on the page holds the string ``'Unknown'``.
    """
    columns = ['title', 'company', 'location', 'modality', 'description']
    search_url = (
        'https://www.linkedin.com/jobs/search/?f_TPR=r2592000&geoId=107025191'
        f'&keywords={quote(keywords)}'
        '&location=Barcelona%2C%20Catalonia%2C%20Spain'
    )
    location_selector = 'body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div.jobs-search-two-pane__wrapper > div > section.jobs-search__right-rail > div > div > div:nth-child(1) > div > div:nth-child(1) > div > div.jobs-unified-top-card__content--two-pane > div.mt2 > span.jobs-unified-top-card__subtitle-primary-grouping.mr2.t-black > span.jobs-unified-top-card__bullet'
    modality_selector = 'span.jobs-unified-top-card__subtitle-primary-grouping.mr2.t-black > span.jobs-unified-top-card__workplace-type'
    description_selector = '#job-details > span'

    def first_text(soup, selector, default='Unknown'):
        """Return the text of the first node matching ``selector``, else ``default``."""
        matches = soup.select(selector)
        return matches[0].get_text() if matches else default

    records = []
    driver()
    try:
        linkedin_login()
        for offset in tqdm(range(0, 976, 25)):
            driver.get(f'{search_url}&start={offset}')
            time.sleep(random.randint(3, 5))
            seen = set()

            # Cards are loaded lazily while scrolling, so keep re-querying until
            # a pass finds nothing new. Stopping on "no new cards" rather than
            # on set equality prevents an endless loop when a previously seen
            # card has been dropped from the DOM.
            while True:
                cards = driver.find_elements(By.CSS_SELECTOR, ".job-card-list__title")
                # Keep DOM order so the page is scrolled steadily downwards.
                new_cards = [card for card in cards if card not in seen]
                if not new_cards:
                    break
                for card in new_cards:
                    seen.add(card)
                    try:
                        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth'});", card)
                        time.sleep(random.randint(1, 3))
                        card.click()
                        time.sleep(random.randint(2, 5))
                        soup = BeautifulSoup(driver.page_source, "html.parser")

                        title = driver.find_element(By.XPATH, "//*[starts-with(@id, 'ember')]/h2").text
                        lowered = title.lower()
                        if 'data' not in lowered and 'datos' not in lowered:
                            continue

                        try:
                            company = driver.find_element(By.XPATH, "//*[contains(@class, 'ember-view t-black t-normal')]").text
                        except NoSuchElementException:
                            company = 'Unknown'

                        # Append the whole row at once: a failure part-way
                        # through a posting can then never leave the columns
                        # with different lengths.
                        records.append({
                            'title': title,
                            'company': company,
                            'location': first_text(soup, location_selector).strip(),
                            'modality': first_text(soup, modality_selector),
                            'description': first_text(soup, description_selector),
                        })
                    except (NoSuchElementException, StaleElementReferenceException):
                        # Best effort: keep a screenshot for debugging and move on.
                        driver.save_screenshot(f"{random.randint(0, 100)}.png")
                time.sleep(random.randint(1, 2))
    finally:
        # Always release the browser, even when the scrape aborts.
        driver.quit()

    return pd.DataFrame(records, columns=columns)


In [None]:
# Runs the whole "data scientist" scrape: prompts for LinkedIn credentials again and
# walks every results page with randomised pauses, so this cell takes a long time.
scientist = data_scientist()

## Putting everything together

In [None]:
# Stack both scrapes into one frame with a fresh 0..n-1 index.
df = pd.concat([analyst, scientist], ignore_index=True)

## Dropping duplicates & saving the dataframe

In [34]:
# Drop rows that are identical in every column, renumber the index, then print
# a summary of the result.
df = df.drop_duplicates().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412 entries, 0 to 411
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        412 non-null    object
 1   company      412 non-null    object
 2   location     412 non-null    object
 3   modality     412 non-null    object
 4   description  412 non-null    object
dtypes: object(5)
memory usage: 16.2+ KB


In [35]:
# Persist the de-duplicated postings; the row index is not written, the header row is.
df.to_csv('analyst_scientist_bcn_last_month_clean.csv', index=False)