In [12]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import randint

In [13]:
# HTTP headers used for the scraping requests: a browser-like User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0;Win64) '
        'AppleWebkit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.82 Safari/537.36'
    ),
}

In [14]:
# Load the job links stored in the CSV file
df_links = pd.read_csv(filepath_or_buffer='output/jobs_links.csv')

# Keep the 'URL' column as a plain Python list, one entry per page to scrape
links = df_links.loc[:, 'URL'].tolist()

In [15]:
# Retry / timeout policy for page downloads
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 10
REQUEST_TIMEOUT_SECONDS = 30

# Column order of every output row (also the column order of the final table)
ROW_COLUMNS = (
    'JOB_URL', 'MISSION', 'TITLE', 'COMPANY', 'SKILLS', 'DATE',
    'DURATION', 'TJM', 'EXPERIENCE', 'REMOTE', 'LOCATION',
)

# Each detail is read from the <span> that follows a <span> holding an SVG icon.
# The icon is recognised by a fragment of its <path d="..."> data; this table
# maps each fragment to the output column it feeds.
ICON_FIELDS = {
    'M384 336a32 32': 'TJM',
    'M152 24c0-13.3': 'DATE',
    'M464 256A208': 'DURATION',
    'M176 56V96H336V56c0-4.4-3.6-8-8-8H184c-4.4': 'EXPERIENCE',
    'M224.8 5.4c8.8-7.2 21.5-7.2': 'REMOTE',
    'M320.7 249.2c-10.5': 'LOCATION',
}


def _empty_row(link):
    """Return a row for ``link`` with every scraped field set to None."""
    row = dict.fromkeys(ROW_COLUMNS)
    row['JOB_URL'] = link
    return row


def _fetch_soup(link):
    """Download ``link`` and return its parsed HTML, or None if all attempts fail.

    Connection errors, timeouts and HTTP error statuses are retried up to
    MAX_ATTEMPTS times, waiting RETRY_DELAY_SECONDS between attempts.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = rq.get(link, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            # Treat 4xx/5xx answers as failures instead of parsing an error page
            response.raise_for_status()
            return bs(response.text, 'html.parser')
        except rq.exceptions.RequestException as error:
            print('Request failed (attempt', attempt, 'of', MAX_ATTEMPTS, ') :', error)
            if attempt < MAX_ATTEMPTS:
                print('Sleeping for', RETRY_DELAY_SECONDS, 'seconds before retrying')
                sleep(RETRY_DELAY_SECONDS)
    return None


def _parse_job(soup, link):
    """Build the output row for one job page.

    Every field is reset for each page, so a tag missing from the page yields
    None (or an empty skills list) rather than an exception or a value left
    over from the previous page.
    """
    row = _empty_row(link)

    # The <h1> holds the mission type inside an <em>, followed by the job title
    h1 = soup.find('h1')
    em = h1.find('em') if h1 is not None else None
    if em is not None:
        row['MISSION'] = em.get_text(strip=True)
    else:
        print('mission not found')

    if h1 is not None:
        # Remove the <em> tags (e.g. "Mission freelance") so only the title remains
        for em_tag in h1.find_all('em'):
            em_tag.decompose()
        row['TITLE'] = h1.get_text(strip=True)
    else:
        print('title not found')

    company_tag = soup.find('p', class_='font-semibold text-sm')
    if company_tag is not None:
        row['COMPANY'] = company_tag.get_text(strip=True)
    else:
        print('company not found')

    # Skills are the non-empty link texts of the tag container (empty list if none)
    row['SKILLS'] = []
    tag_container = soup.find('div', class_='flex items-center flex-wrap mt-2')
    if tag_container is not None:
        for anchor in tag_container.find_all('a'):
            text = anchor.get_text(strip=True)
            if text:
                row['SKILLS'].append(text)

    # Icon-labelled details: identify the icon, then read the next sibling <span>
    for span in soup.find_all('span'):
        svg = span.find('svg')
        path = svg.find('path') if svg is not None else None
        if path is None:
            continue
        d_attr = path.get('d', '')
        matched_fields = [field for marker, field in ICON_FIELDS.items() if marker in d_attr]
        if not matched_fields:
            continue
        value_span = span.find_next_sibling('span')
        if value_span is None:
            continue
        value = value_span.get_text(strip=True)
        for field in matched_fields:
            row[field] = value

    return row


list_of_row_dicts = []
for link in links:
    soup = _fetch_soup(link)
    if soup is None:
        # Keep a placeholder row so the output stays aligned with the input links
        print('Giving up on', link, ': row saved with empty fields')
        row_dic = _empty_row(link)
    else:
        row_dic = _parse_job(soup, link)
    list_of_row_dicts.append(row_dic)

    # Progress line: every column except the URL, separated by ' , '
    print(*[row_dic[column] for column in ROW_COLUMNS[1:]], sep=' , ')

    # Random pause between pages to avoid hammering the site
    sleep(randint(1, 3))

Mission freelance , Consultant SAP BASIS SOLMAN CHARM S/4 HANA , SAP-HIRE , ['SAP'] , 13/04/2025 , 4 mois , 550-750 €⁄j , > 10 ans d’expérience , Télétravail 100% , France
Mission freelance , UX/UI Designer , Amaris Consulting , ['Design system', 'Figma'] , Dès que possible , 8 mois , 450-550 €⁄j , 5 à 10 ans d’expérience , Télétravail 100% , Paris, France
Mission freelance , ELK Senior Consultant: Logboard, Stack Management, OpenTelemetry, Linux, Remote Working (h/f) , emagine Consulting SARL , [] , 20/04/2025 , 12 mois , 650-750 €⁄j , 5 à 10 ans d’expérience , Télétravail 100% , Paris, Île-de-France
Mission freelance , Consultant SAP ABAP S/4 PI/PO SCPI 100% Remote , WorldWide People , ['SAP'] , Dès que possible , 6 mois , 350-400 €⁄j , 5 à 10 ans d’expérience , Télétravail 100% , Montpellier, Occitanie
Mission freelance , Lead Architect: Initiative and Team Leadership, Microsoft, Azure, Office 365 Remote Working (h/f) , emagine Consulting SARL , [] , 05/05/2025 , 9 mois , 750-850 €⁄

In [16]:
# Assemble the scraped rows into a table and save it as CSV,
# writing missing values as the literal text 'None'
df = pd.DataFrame(data=list_of_row_dicts)
df.to_csv(path_or_buf='output/jobs_infos.csv', na_rep='None')