In [1]:
import requests
from utils.api import get_pole_emploi_access_token
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from utils.entities import Annonce
import json

In [2]:
sources = { 'apec': 1, 'pole-emploie': 2, 'linkedin': 3 }
contracts = {'cdi': 1, 'cdd': 2, 'stage': 3, 'alternance': 4}

## Scrape on APEC

In [12]:
# Scrape on apec
driver = webdriver.Chrome()
CSS = By.CSS_SELECTOR
job_page_url = 'https://www.apec.fr/candidat/recherche-emploi.html/emploi?motsCles=data&typesConvention=143684&typesConvention=143685&typesConvention=143686&typesConvention=143687'
jobs_apec = []
for i in range(7):
    driver.get(f'{job_page_url}&page={i}')
    if(i == 0):
        try:
            accept_cookies = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
            )
        except:
            driver.quit()

        accept_cookies.click()
    job_links = driver.find_elements(CSS, '.container-result div > a')

    # Get url, title and company name
    job_links = [
        (
            job_link.get_attribute('href'),
            job_link.find_element(CSS, 'h2.card-title').text,
            job_link.find_element(CSS, 'p.card-offer__company').text,
            job_link.find_element(CSS, 'li[title="Date de publication"]').text
        )
        for job_link in job_links
    ]

    # Loop through job links
    for url, title, company, date in job_links:
        
        # Open the job page
        driver.get(url)
        
        # Wait before looking for elements
        try:
            details = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((CSS, 'ul.details-offer-list.mb-20 > li'))
            )
        except:
            print("quiting")
            driver.quit()
        
        # entreprise
        company = details[0].text
        # type de contrat
        contrat = details[1].find_element(CSS, 'span').text
        # lieu de travail
        location = details[2].text
        # descripiton
        descripiton = driver.find_element(CSS, '.details-post > p').text
        # profil recherché
        profile = driver.find_element(CSS, '.details-post > p:nth-child(4)').text
        detail_posts = driver.find_elements(CSS, 'apec-poste-informations > .row.border-T > .col-lg-4 > .details-post')
        # Secteur d’activité du poste
        activity = detail_posts[6].find_element(CSS, 'span').text
        # Le nom du poste
        poste = detail_posts[3].find_element(CSS, 'span').text
        
        # competences
        competences = driver.find_elements(CSS, '.details-post .added-skills-container')
        try:
            see_more = competences[2].find_element(CSS, '.added-skills-language + p.m-0')
            see_more.click()
            hard_skills = competences[2].find_elements(CSS, '.added-skills-language')
            hard_skills = [skill.text for skill in hard_skills]
        except:
            hard_skills = []

        # Print or store the job details
        jobs_apec.append({
            'url': url,
            'title': title,
            'company': company,
            'contrat': contrat,
            'location': location,
            'date': date,
            'descripiton': descripiton,
            'profile': profile,
            'skills': hard_skills,
            'poste': poste,
            'activity': activity
        })
# Close the webdriver
driver.quit()
len(jobs_apec)

60

## Scrape on Pôle emploi

In [13]:
# access_token = get_pole_emploi_access_token()
access_token = 'BjcdFWw9reTWPcohE2moZQckezU'
header = {'Authorization': f'Bearer {access_token}'}
response = requests.get('https://api.pole-emploi.io/partenaire/offresdemploi/v2/offres/search?range=0-100&motsCles=data', headers=header)
print(response.json())
jobs_pole_emploi = []
pole_emploi_response = response.json()['resultats']
for job in pole_emploi_response:

    jobs_pole_emploi.append({
        'url': job['origineOffre']['urlOrigine'],
        'title': job['intitule'],
        'company': job['entreprise'].get('nom', 'Unknown'),
        'contrat': job['typeContrat'],
        'location': job['lieuTravail']['libelle'],
        'date': job['dateCreation'],
        'descripiton': job['description'],
        'profile': '',
        'skills': [c['libelle'] for c in job.get('competences', [])],
        'poste': job['appellationlibelle'],
        'activity': job['secteurActiviteLibelle']
    })
len(jobs_pole_emploi)

{'resultats': [{'id': '166SVWH', 'intitule': 'Data Scientist junior (H/F)', 'description': "Lunalogic est un cabinet de conseil créé en 2000, présent à Paris, Londres, Hong-Kong et Casablanca spécialisé sur l'ensemble de la chaîne de valeur financière. Les expertises du cabinet sont reconnues par les plus grands établissements financiers et reposent sur une politique de recrutement sélective (grandes écoles d'ingénieurs et universités reconnues). Ces expertises recouvrent :\n- Les Risques et l'Analyse quantitative \n- La Data science et les nouvelles technologies au service de la finance\nEngagés dans une démarche de développement, de qualité et de performance, nous renforçons notre pôle Finance et recrutons :\n\nUn(e) Consultant(e) Data Scientist \n\nNous recherchons un(e) Data Scientist junior talentueux et passionné(e) pour rejoindre une institution financière.\nLe profil recherché pour la mission a validé un diplôme d'ingénieur Science des données et de décision.\nLes compétences r

101

In [6]:
def insert_jobs(jobs, source='apec'):
    for job in jobs:
        annonce = Annonce(
            url = job['url'],
            title = job['title'],
            company_name = job['company'],
            location = job['location'],
            date = job['date'],
            descripiton = job['descripiton'],
            poste = job['poste'],
            activity = job['activity'],
            profile = job['profile'],
            skills = '|'.join(job['skills']),
            contrat_id =  contracts.get(job['contrat'].strip().lower().split()[0], 5),
            source_id = sources[source],
        )
        annonce.create()

In [7]:
for i in Annonce.find_all(limit=300):
    Annonce.delete(i.id)
insert_jobs(jobs_apec)
insert_jobs(jobs_pole_emploi, source='pole-emploie')

In [8]:
with open('./data/pole-emploi-response.json', 'w+') as f:
    json.dump(pole_emploi_response, f)

In [9]:
with open('./data/pole-emploi.json', 'w+') as f:
    json.dump(jobs_pole_emploi, f)

In [10]:
with open('./data/apec.json', 'w+') as f:
    json.dump(jobs_apec, f)