# In [3]:
import concurrent.futures
import pandas as pd
import numpy as np
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import dateparser
import unidecode
import random
import time

# In [4]:
def find_element_text_or_nan(driver, selector):
    """Return the text of the first element matching a CSS selector.

    Args:
        driver: Any object exposing ``find_element`` (a WebDriver or a
            WebElement); the lookup is scoped to that object.
        selector: CSS selector passed to ``find_element``.

    Returns:
        The matched element's ``.text``, or ``np.nan`` when the lookup
        raises ``NoSuchElementException``.
    """
    try:
        text = driver.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        text = np.nan
    return text

def scrape_page(page_num, data_folder):
    """Scrape one listing page of vie-publique.fr speeches.

    Opens a dedicated Chrome driver, loads the listing page, stores its raw
    HTML under ``<data_folder>/html_page/html_page_<page_num>.html`` and then
    delegates to ``process_page_content`` for the per-speech extraction.

    Args:
        page_num: Page index used in the ``?page=`` query parameter and in
            the output file names.
        data_folder: Root directory under which all outputs are written.

    Returns:
        None. Any exception (including a failure to start the browser) is
        reported on stdout and swallowed, so one bad page does not stop the
        others.
    """
    driver = None
    try:
        # Driver start-up lives inside the ``try`` so that a failure here is
        # reported with its page number instead of disappearing into an
        # unretrieved Future exception.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)

        url = f"https://www.vie-publique.fr/discours?page={page_num}"
        driver.get(url)
        # NOTE: no politeness delay is applied between requests.

        # Keep a raw copy of the listing page.
        html_folder = os.path.join(data_folder, "html_page")
        os.makedirs(html_folder, exist_ok=True)
        html_filename = os.path.join(html_folder, f"html_page_{page_num}.html")
        with open(html_filename, "w", encoding="utf-8") as file:
            file.write(driver.page_source)

        # Follow every speech listed on the page and write the CSV.
        process_page_content(driver, page_num, data_folder)

    except Exception as e:
        print(f"Error processing page {page_num}: {e}")
    finally:
        # The driver may never have been created if start-up failed.
        if driver is not None:
            driver.quit()

def _parse_card(card):
    """Build the base record (nature, title, url, date) for one listing card."""
    nature = find_element_text_or_nan(card, ".fr-card__start .field__item")
    title = find_element_text_or_nan(card, ".fr-card__title a")

    # A card without a title link cannot be followed; record a missing URL
    # instead of letting the exception abort the whole page.
    try:
        link = card.find_element(By.CSS_SELECTOR, ".fr-card__title a").get_attribute("href")
    except NoSuchElementException:
        link = None

    try:
        date_text = card.find_element(By.CSS_SELECTOR, ".fr-card__end time").text
    except NoSuchElementException:
        date = np.nan
    else:
        parsed = dateparser.parse(date_text)  # parse once, reuse the result
        date = parsed.strftime("%d/%m/%Y") if parsed else np.nan

    return {"nature": nature, "title": title, "url": link if link else np.nan, "date": date}


def _text_by_class_or_na(driver, class_name):
    """Return the text of the first element with ``class_name``, or "NA"."""
    try:
        return driver.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return "NA"


def _extract_speakers(driver):
    """Return the de-duplicated speakers joined by " ; ", or "NA" if none."""
    speakers = []
    for entry in driver.find_elements(By.CSS_SELECTOR, ".vp-intervenant .line-intervenant li"):
        # find_elements (plural) returns [] instead of raising, so a single
        # entry without a link no longer discards every other speaker.
        links = entry.find_elements(By.CSS_SELECTOR, "a")
        name = links[0].text.strip() if links else ""
        full_text = entry.text
        # Whatever remains once the name is removed is the speaker's title.
        title = (full_text.replace(name, "") if name else full_text).strip(" -;")
        info = f"{name} - {title}" if name and title else (name or title)
        if info and info not in speakers:
            speakers.append(info)
    return " ; ".join(speakers) or "NA"


def _extract_trailing_field(driver):
    """Parse the last paragraph located below the speaker block.

    Returns:
        A ``(column_name, value)`` tuple, or ``None`` when there is no
        speaker block, no paragraph below it, or the paragraph has no
        leading word to use as a column name. The column name is the first
        word before the first ":" (lower-cased, accents removed); the value
        is everything after that first ":" or "NA" when there is no colon.
    """
    try:
        anchor_y = driver.find_element(By.CLASS_NAME, "vp-intervenant").location["y"]
    except NoSuchElementException:
        return None

    paragraphs = driver.find_elements(By.CSS_SELECTOR, ".vp-discours-details p")
    for paragraph in reversed(paragraphs):
        # Explicit comparison: a y-coordinate of 0 is a valid position.
        if paragraph.location["y"] > anchor_y:
            last_text = paragraph.text
            break
    else:
        return None

    # Split on the FIRST colon only so values such as URLs stay intact.
    label, separator, remainder = last_text.partition(":")
    words = label.split()
    if not words:
        return None
    column_name = unidecode.unidecode(words[0].lower())
    column_value = remainder.strip() if separator else "NA"
    return column_name, column_value


def process_page_content(driver, page_num, data_folder):
    """Extract every speech listed on the current page and save them as CSV.

    The driver is expected to be showing a listing page. Each card yields a
    base record (``nature``, ``title``, ``url``, ``date``); the function then
    visits each record's URL, stores the raw HTML under
    ``<data_folder>/url_html_page/html_page_<page_num>_url_<index>.html`` and
    adds ``tag``, ``speaker``, ``text``, ``keywords`` plus one dynamically
    named column taken from the last paragraph below the speaker block.
    The enriched records are handed to ``save_data_to_csv``.

    Args:
        driver: Selenium WebDriver positioned on the listing page. It is
            navigated away from that page while following the links.
        page_num: Listing page number, used in output file names.
        data_folder: Root directory for all outputs.

    Returns:
        None.
    """
    cards = driver.find_elements(By.CLASS_NAME, "fr-card__content")
    records = [_parse_card(card) for card in cards]

    # exist_ok avoids the check-then-create race between worker threads.
    link_html_folder = os.path.join(data_folder, "url_html_page")
    os.makedirs(link_html_folder, exist_ok=True)

    for index, record in enumerate(records):
        url = record["url"]
        if not isinstance(url, str):
            # No link on the card: keep the base record, skip the detail page.
            continue
        driver.get(url)

        # Keep a raw copy of the speech page.
        link_html_filename = os.path.join(link_html_folder, f"html_page_{page_num}_url_{index}.html")
        with open(link_html_filename, "w", encoding="utf-8") as file:
            file.write(driver.page_source)

        record["tag"] = _text_by_class_or_na(driver, "vp-item-tag")
        record["speaker"] = _extract_speakers(driver)

        trailing = _extract_trailing_field(driver)
        if trailing is not None:
            column_name, column_value = trailing
            # Never let the dynamic column clobber a field already extracted.
            if column_name not in record:
                record[column_name] = column_value

        record["text"] = _text_by_class_or_na(driver, "field--name-field-texte-integral")

        keyword_elements = driver.find_elements(By.CLASS_NAME, "fr-tag--green-emeraude")
        record["keywords"] = " ; ".join(element.text for element in keyword_elements) or "NA"

    # Save the ENRICHED records; the detail fields live on these dicts.
    save_data_to_csv(records, page_num, data_folder)

def save_data_to_csv(data, page_num, data_folder):
    """Write one page's records to ``<data_folder>/csv_page/csv_page_<page_num>.csv``.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.
        page_num: Page number embedded in the output file name.
        data_folder: Root directory; the ``csv_page`` sub-folder is created
            if it does not exist.

    Returns:
        None. The CSV is written without the index column.
    """
    frame = pd.DataFrame(data)
    target_dir = os.path.join(data_folder, "csv_page")
    os.makedirs(target_dir, exist_ok=True)
    frame.to_csv(os.path.join(target_dir, f"csv_page_{page_num}.csv"), index=False)

# Main code: scrape the listing pages start_page..end_page (inclusive) with a
# small pool of worker threads, one Chrome instance per page.
data_folder = "data"
start_page = 5000
end_page = 5050

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    pages = range(start_page, end_page + 1)
    futures = [executor.submit(scrape_page, page_num, data_folder) for page_num in pages]

    # Future.exception() blocks until the task is done. Reading it is what
    # surfaces failures that escaped scrape_page; otherwise they would stay
    # hidden inside the Future objects.
    for page_num, future in zip(pages, futures):
        error = future.exception()
        if error is not None:
            print(f"Error processing page {page_num}: {error}")
