In [16]:
import json
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from utils.api import get_pole_emploi_access_token
from utils.entities import Annonce

In [2]:
# Numeric ids used for the `source_id` of an Annonce, keyed by job-board name.
sources = {
    'apec': 1,
    'pole-emploi': 2,
    'linkedin': 3,
}

# Numeric ids used for the `contrat_id` of an Annonce, keyed by the lowercase
# contract label. insert_jobs falls back to id 5 for labels not listed here.
contracts = {
    'cdi': 1,
    'cdd': 2,
    'stage': 3,
    'alternance': 4,
}

## Scrape on APEC

In [6]:
# Scrape job offers from APEC with Selenium.
#
# For every result page, the listing fields (url, title, company, date) are
# read first, then each offer page is opened to collect the detailed fields.
# The browser is always closed, even when the scrape fails half-way, and
# whatever was collected so far stays available in `jobs_apec`.
CSS = By.CSS_SELECTOR
APEC_PAGE_COUNT = 10  # number of result pages to walk through
job_page_url = 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data&typesConvention=143684&typesConvention=143685&typesConvention=143686&typesConvention=143687'
jobs_apec = []
driver = webdriver.Chrome()
try:
    for i in range(APEC_PAGE_COUNT):
        driver.get(f'{job_page_url}&page={i}')
        if i == 0:
            # The cookie banner is only handled on the first page load.
            # If it never shows up, keep going instead of killing the driver.
            try:
                accept_cookies = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
                )
                accept_cookies.click()
            except TimeoutException:
                print("Cookie banner not found, continuing without accepting cookies")

        try:
            job_links = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((CSS, ".container-result div > a"))
            )
        except TimeoutException:
            # No result cards on this page: stop paging, keep what we have.
            print(f"No job links found on page {i}, stopping")
            break

        # Get url, title, company name and publication date.
        # This is done before navigating away, because the loop below reuses
        # the same driver to open each offer page.
        offers = [
            (
                job_link.get_attribute('href'),
                job_link.find_element(CSS, 'h2.card-title').text,
                job_link.find_element(CSS, 'p.card-offer__company').text,
                job_link.find_element(CSS, 'li[title="Date de publication"]').text
            )
            for job_link in job_links
        ]

        # Loop through job links
        for url, title, company, date in offers:

            # Open the job page
            driver.get(url)

            # Wait before looking for elements
            try:
                details = WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located((CSS, 'ul.details-offer-list.mb-20 > li'))
                )
            except TimeoutException:
                # Skip this offer only; the driver stays usable for the next one.
                print(f"Timed out loading {url}, skipping")
                continue

            # company (overrides the value read from the listing card)
            company = details[0].text
            # contract type
            contrat = details[1].find_element(CSS, 'span').text
            # work location
            location = details[2].text
            # description
            description = driver.find_element(CSS, '.details-post > p').text
            # wanted profile
            profile = driver.find_element(CSS, '.details-post > p:nth-child(4)').text
            detail_posts = driver.find_elements(CSS, 'apec-poste-informations > .row.border-T > .col-lg-4 > .details-post')
            # activity sector of the position
            activity = detail_posts[6].find_element(CSS, 'span').text
            # name of the position
            poste = detail_posts[3].find_element(CSS, 'span').text

            # skills: best-effort, an offer without the expected skills block
            # simply gets an empty list
            competences = driver.find_elements(CSS, '.details-post .added-skills-container')
            try:
                see_more = competences[2].find_element(CSS, '.added-skills-language + p.m-0')
                see_more.click()
                hard_skills = competences[2].find_elements(CSS, '.added-skills-language')
                hard_skills = [skill.text for skill in hard_skills]
            except (IndexError, WebDriverException):
                hard_skills = []

            # Store the job details
            jobs_apec.append({
                'url': url,
                'title': title,
                'company': company,
                'contrat': contrat,
                'location': location,
                'date': date,
                'description': description,
                'profile': profile,
                'skills': hard_skills,
                'poste': poste,
                'activity': activity,
                'source': 'apec',
            })
finally:
    # Close the webdriver exactly once, whatever happened above
    driver.quit()
len(jobs_apec)

200

In [7]:
# Persist the scraped APEC offers as JSON.
with open('./data/apec200.json', mode='w+') as apec_file:
    json.dump(jobs_apec, fp=apec_file)

## Scrape on Pôle emploi

In [23]:
# Fetch job offers from the Pôle emploi search API, 100 offers per request.
#
# The bearer token is requested at run time instead of being hardcoded in the
# notebook (a committed token leaks a credential and stops working once it
# expires).
# NOTE(review): assumes get_pole_emploi_access_token() returns the raw token
# string, as the previously commented-out call suggests — confirm.
access_token = get_pole_emploi_access_token()
header = {'Authorization': f'Bearer {access_token}'}
POLE_EMPLOI_SEARCH_URL = 'https://api.pole-emploi.io/partenaire/offresdemploi/v2/offres/search'
jobs_pole_emploi = []
pole_emploi_response = []  # last page of raw results, dumped to disk by a later cell
for i in range(2):
    response = requests.get(
        POLE_EMPLOI_SEARCH_URL,
        params={'range': f'{i * 100}-{((i + 1) * 100) - 1}', 'motsCles': 'data'},
        headers=header,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx (e.g. an invalid token) instead of a confusing
    # KeyError / JSON error further down.
    response.raise_for_status()
    if not response.content:
        # Empty body: nothing (more) to read, stop paging.
        break
    pole_emploi_response = response.json()['resultats']

    for job in pole_emploi_response:
        jobs_pole_emploi.append({
            'url': job['origineOffre']['urlOrigine'],
            'title': job['intitule'],
            # 'entreprise' itself may be absent, not only its 'nom'
            'company': job.get('entreprise', {}).get('nom', 'Unknown'),
            'contrat': job['typeContrat'],
            'location': job['lieuTravail']['libelle'],
            'date': job['dateCreation'],
            'description': job['description'],
            'profile': '',
            'skills': [c['libelle'] for c in job.get('competences', [])],
            'poste': job['appellationlibelle'],
            'activity': job.get('secteurActiviteLibelle', 'Unknown'),
            'source': 'pole-emploi',
        })
    # Pause between requests
    time.sleep(1)
len(jobs_pole_emploi)

200

## Load LinkedIn jobs from file

In [34]:
# Load the LinkedIn offers from disk and add the fields the other sources
# provide, so every job dict carries 'profile', 'skills' and 'source'.
with open('./data/linkedin.json') as linkedin_file:
    jobs_linkedin = json.load(linkedin_file)
for linkedin_job in jobs_linkedin:
    # The dict literal is rebuilt on every iteration so each job gets its
    # own empty skills list.
    linkedin_job.update({'profile': '', 'skills': [], 'source': 'linkedin'})

In [29]:
def insert_jobs(jobs, source='apec'):
    """Create one Annonce per scraped job.

    Args:
        jobs: iterable of job dicts carrying the keys 'url', 'title',
            'company', 'location', 'date', 'description', 'poste',
            'activity', 'profile', 'skills' (an iterable of strings) and
            'contrat' (a string).
        source: key of the module-level `sources` mapping; its value is
            stored as `source_id` on every created Annonce.

    Raises:
        KeyError: if `source` is not a key of `sources`, or a job dict is
            missing one of the keys listed above.
    """
    for job in jobs:
        # The contract id is looked up from the first word of the label,
        # lowercased. A blank label has no first word, so it gets the same
        # fallback id (5) as an unknown label instead of raising IndexError.
        contract_words = job['contrat'].strip().lower().split()
        contract_key = contract_words[0] if contract_words else ''
        annonce = Annonce(
            url = job['url'],
            title = job['title'],
            company_name = job['company'],
            location = job['location'],
            date = job['date'],
            # NOTE(review): keyword spelling kept as-is; presumably it matches
            # the field name declared on Annonce — confirm before renaming.
            descripiton = job['description'],
            poste = job['poste'],
            activity = job['activity'],
            profile = job['profile'],
            # skills are stored as a single '|'-separated string
            skills = '|'.join(job['skills']),
            contrat_id = contracts.get(contract_key, 5),
            source_id = sources[source],
        )
        annonce.create()

In [22]:
# Delete the existing Annonce records (at most the 1000 returned by find_all),
# then insert the freshly scraped APEC and Pôle emploi offers.
for annonce in Annonce.find_all(limit=1000):
    Annonce.delete(annonce.id)
insert_jobs(jobs_apec)
# The source must be a key of `sources`: the previous spelling 'pole-emploie'
# is not one and made the lookup in insert_jobs raise KeyError.
insert_jobs(jobs_pole_emploi, source='pole-emploi')

200

In [35]:
# Insert the LinkedIn offers, tagged with the 'linkedin' source id.
insert_jobs(jobs=jobs_linkedin, source='linkedin')

In [None]:
# Leftover scratch cell: displays the current value of `description`, which
# in the visible part of this notebook is only assigned inside the APEC
# scraping loop. NOTE(review): looks like debugging residue — consider removing.
description

In [8]:
# Save the raw API results. `pole_emploi_response` is rebound on every page
# of the fetch loop, so this holds the last page fetched only.
with open('./data/pole-emploi-response.json', mode='w+') as raw_file:
    json.dump(pole_emploi_response, fp=raw_file)

In [9]:
# Save the normalised Pôle emploi offers as JSON.
with open('./data/pole-emploi.json', mode='w+') as pole_emploi_file:
    json.dump(jobs_pole_emploi, fp=pole_emploi_file)

In [10]:
# Save the scraped APEC offers as JSON.
with open('./data/apec.json', mode='w+') as apec_out:
    json.dump(jobs_apec, fp=apec_out)

In [38]:
# Save the normalised Pôle emploi offers under the "200" file name.
with open('./data/pole-emploi200.json', mode='w+') as pole_emploi_out:
    json.dump(jobs_pole_emploi, fp=pole_emploi_out)