In [1]:
import pandas as pd
import numpy as np
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import dateparser
import unidecode
import time
import random

# Chrome driver configuration
# ChromeDriverManager().install() supplies the chromedriver location that is wrapped in a
# Selenium Service (webdriver_manager presumably downloads/caches the binary on first
# use - confirm for your environment).
service = Service(ChromeDriverManager().install())
# Start the Chrome session through that service; `driver` is the single browser instance
# used by the scraping code below and is shut down with driver.quit() once scraping ends.
driver = webdriver.Chrome(service=service)

def find_element_text_or_nan(driver, selector):
    """Return the text of the first element matching a CSS selector.

    Args:
        driver: Any object exposing ``find_element`` (the code in this notebook
            passes individual card elements).
        selector: CSS selector of the element whose text is wanted.

    Returns:
        The element's ``.text``, or the placeholder string ``"NA"`` when no
        element matches. Despite the function's name, the placeholder is the
        string ``"NA"``, not a float NaN.
    """
    try:
        matched_element = driver.find_element(By.CSS_SELECTOR, selector)
        element_text = matched_element.text
        return element_text
    except NoSuchElementException:
        # Missing element: hand back the same placeholder used elsewhere in the notebook.
        return "NA"

# Root folder for everything the scraper writes (HTML snapshots and per-page CSVs).
data_folder = "data"
# exist_ok=True makes the creation idempotent: re-running the cell is safe and there is
# no window between an "exists?" check and the creation call.
os.makedirs(data_folder, exist_ok=True)

# Loop through pages (insert numbers)
# The whole crawl runs inside try/finally so the browser is always closed, even when a
# page fails half-way through.
try:
    # The output folders are the same for every page, so create them once up front.
    html_folder = os.path.join(data_folder, "html_page")
    link_html_folder = os.path.join(data_folder, "url_html_page")
    csv_folder = os.path.join(data_folder, "csv_page")
    for output_folder in (html_folder, link_html_folder, csv_folder):
        os.makedirs(output_folder, exist_ok=True)

    for page_num in range(11549, 11550):
        url = f"https://www.vie-publique.fr/discours?page={page_num}"
        driver.get(url)

        # Save HTML content of the list page
        html_filename = os.path.join(html_folder, f"html_page_{page_num}.html")
        with open(html_filename, "w", encoding="utf-8") as file:
            file.write(driver.page_source)

        # Find all elements with the class 'fr-card__content'
        cards = driver.find_elements(By.CLASS_NAME, "fr-card__content")

        # List to store initial data (one dict per card)
        data = []

        for card in cards:
            nature = find_element_text_or_nan(card, ".fr-card__start .field__item")
            title = find_element_text_or_nan(card, ".fr-card__title a")

            # Guard the link lookup like the other card fields: a card without a title
            # link must not abort the whole page.
            try:
                link = card.find_element(By.CSS_SELECTOR, ".fr-card__title a").get_attribute("href") or "NA"
            except NoSuchElementException:
                link = "NA"

            # The date lives in a <time> tag, or in a <p class="vp-date-box"> when <time>
            # is absent. The first selector that matches decides the result; an
            # unparseable date yields "NA".
            date = "NA"
            for date_selector in (".fr-card__end time", ".fr-card__end p.vp-date-box"):
                try:
                    date_text = card.find_element(By.CSS_SELECTOR, date_selector).text
                except NoSuchElementException:
                    continue
                parsed_date = dateparser.parse(date_text)  # parse once, reuse the result
                if parsed_date:
                    date = parsed_date.strftime("%d/%m/%Y")
                break

            data.append({"nature": nature, "title": title, "url": link, "date": date})

        # Create a DataFrame
        df = pd.DataFrame(data)

        # Loop through each link to extract additional information
        for index, row in df.iterrows():
            if row["url"] == "NA":
                # No detail page to visit; the row keeps only its list-page fields.
                continue
            driver.get(row["url"])

            # Politeness delay between requests is currently disabled; uncomment to sleep
            # for a random 1-2 seconds per detail page.
            #time.sleep(random.uniform(1, 2))

            # Save HTML content of the current link page
            link_html_filename = os.path.join(link_html_folder, f"html_page_{page_num}_url_{index}.html")
            with open(link_html_filename, "w", encoding="utf-8") as file:
                file.write(driver.page_source)

            # Extract 'tag' ("NA" when the element is absent)
            df.at[index, 'tag'] = find_element_text_or_nan(driver, ".vp-item-tag")

            # Extract 'speaker'
            # find_elements returns an empty list when nothing matches, so the except
            # branch is only reached when an <li> has no <a> child.
            try:
                speakers_elements = driver.find_elements(By.CSS_SELECTOR, ".vp-intervenant .line-intervenant li")
                speakers = []
                for speaker_element in speakers_elements:
                    speaker_name = speaker_element.find_element(By.CSS_SELECTOR, "a").text.strip()
                    speaker_title = speaker_element.text.replace(speaker_name, '').strip(" -;")  # Remove the name and extra characters
                    speaker_info = f"{speaker_name} - {speaker_title}" if speaker_title else speaker_name
                    if speaker_info not in speakers:  # Check for duplicates
                        speakers.append(speaker_info)
                df.at[index, 'speaker'] = " ; ".join(speakers)
            except NoSuchElementException:
                df.at[index, 'speaker'] = "NA"

            # Check if 'vp-intervenant' exists and get its vertical position
            try:
                intervenant_position = driver.find_element(By.CLASS_NAME, "vp-intervenant").location['y']
            except NoSuchElementException:
                intervenant_position = None

            # Find the last <p> tag located below 'vp-intervenant'
            last_p_text = None
            if intervenant_position is not None:  # a position of 0 is still a valid position
                for p_element in reversed(driver.find_elements(By.CSS_SELECTOR, ".vp-discours-details p")):
                    if p_element.location['y'] > intervenant_position:
                        last_p_text = p_element.text
                        break

            # That paragraph has the form "<Label> : <value>". The first word of the label
            # (accents removed, lower-cased) becomes the column name. No column is created
            # when the paragraph is missing or has an empty label.
            if last_p_text is not None:
                label, separator, value = last_p_text.partition(":")
                label_words = label.split()
                if label_words:
                    column_name = unidecode.unidecode(label_words[0].lower())  # Remove accents + lower case
                    # partition keeps everything after the FIRST colon, so values that
                    # themselves contain ":" are no longer truncated.
                    df.at[index, column_name] = value.strip() if separator else "NA"

            # Extract 'text': full text first, then the shorter summary field, else "NA"
            text = "NA"
            for text_class in ("field--name-field-texte-integral", "field--name-field-resume"):
                try:
                    text = driver.find_element(By.CLASS_NAME, text_class).text
                    break
                except NoSuchElementException:
                    continue
            # Store the text; without this assignment the column never reaches the CSV.
            df.at[index, 'text'] = text

            # Extract 'keywords'
            # find_elements never raises NoSuchElementException; no match gives an empty
            # list and therefore an empty string.
            keywords_elements = driver.find_elements(By.CLASS_NAME, "fr-tag--green-emeraude")
            df.at[index, 'keywords'] = " ; ".join(keyword.text for keyword in keywords_elements)

        # Save the DataFrame in CSV in the 'data/csv_page' subfolder
        csv_filename = os.path.join(csv_folder, f"csv_page_{page_num}.csv")
        df.to_csv(csv_filename, index=False)
finally:
    # Close the browser
    driver.quit()

Let's display the DataFrame to check that the scraper worked.

In [6]:
# Show at most the first 10 rows as the cell's rich output instead of dumping the
# whole frame.
df.head(10)

Unnamed: 0,nature,title,url,date,tag,speaker,text,keywords,circonstance
0,Communiqué,Communiqué du ministère des affaires étrangère...,https://www.vie-publique.fr/discours/127937-co...,12/12/1986,Aide France,,L'évolution de la situation au Tchad se caract...,Aide France ; Aide militaire,
1,Communiqué,Communiqué final diffusé à l'issue de la réuni...,https://www.vie-publique.fr/discours/208888-co...,12/12/1986,International,,Communiqué final\n- Le conseil de l'Atlantique...,International ; Relations internationales ; Di...,
2,Conférence de presse,"Déclarations de M. Adrien Zeller, secrétaire d...",https://www.vie-publique.fr/discours/217221-de...,11/12/1986,Société,Adrien Zeller,,Société ; Santé - Protection sociale ; Assuran...,Réunion du Haut conseil médical de la sécurité...
3,Interview,"Interview de M. Bernard Pons, ministre des DOM...",https://www.vie-publique.fr/discours/215287-in...,11/12/1986,,Bernard Pons,,,"Déplacement officiel de M. Bernard Pons, minis..."
4,Communiqué,Communiqué du ministère des affaires étrangère...,https://www.vie-publique.fr/discours/208890-co...,11/12/1986,International,,Le ministre de la coopération de la République...,International ; Relations internationales ; Re...,
5,Communiqué,Communiqué du ministère des affaires étrangère...,https://www.vie-publique.fr/discours/208889-co...,11/12/1986,International,,"M. Michel Aurillac, ministre de la coopération...",International ; Relations internationales ; Re...,
6,Déclaration,"Discours de M. Adrien Zeller, secrétaire d'Eta...",https://www.vie-publique.fr/discours/217233-di...,11/12/1986,,Adrien Zeller,,,Remise des insignes de chevalier de la légion ...
7,Déclaration,"Allocution de M. François Mitterrand, Présiden...",https://www.vie-publique.fr/discours/138981-al...,11/12/1986,International,François Mitterrand,"Monsieur le président,\n- Madame,\n- C'est pou...",International ; Relations internationales ; Re...,Visite officielle en France du président égypt...
8,Communiqué,"Déclaration de M. Jean-Marie Spaeth, secrétair...",https://www.vie-publique.fr/discours/245436-de...,11/12/1986,,Jean-Marie Spaeth,,,Adoption du projet de loi portant diverses dis...
9,Déclaration,"Allocution de M. Jacques Chirac, Premier minis...",https://www.vie-publique.fr/discours/252859-al...,11/12/1986,International,Jacques Chirac,,International ; Relations internationales ; Re...,Visite officielle du président égyptien Moubar...


## Verify missing files

In [2]:
# Set the path to the folder containing your CSV files
# Built with os.path.join so it works on every OS (a literal 'data\csv_page' contains an
# invalid "\c" escape and only resolves on Windows).
folder_path = os.path.join('data', 'csv_page')  # Replace with your actual folder path
output_file = 'missing_files.txt'  # Name of the output file to store missing file names


def page_numbers_from_files(file_list, prefix='csv_page_', suffix='.csv'):
    """Return the sorted, de-duplicated page numbers found in ``file_list``.

    Only names of the exact form ``<prefix><digits><suffix>`` are counted; anything
    else (other files, copies such as ``csv_page_1 (copy).csv``) is ignored instead
    of raising ``ValueError``.
    """
    numbers = set()
    for file_name in file_list:
        if file_name.startswith(prefix) and file_name.endswith(suffix):
            stem = file_name[len(prefix):len(file_name) - len(suffix)]
            if stem.isdecimal():
                numbers.add(int(stem))
    return sorted(numbers)


def missing_page_files(file_numbers, prefix='csv_page_', suffix='.csv'):
    """Return the file names missing from the sequence 1..max(file_numbers).

    An empty ``file_numbers`` yields an empty list.
    """
    present = set(file_numbers)  # set membership keeps the gap search linear
    max_number = max(present, default=0)
    return [f'{prefix}{i}{suffix}' for i in range(1, max_number + 1) if i not in present]


try:
    # List all files in the directory
    file_list = os.listdir(folder_path)

    # Extract the numbers from the file names and sort them
    file_numbers = page_numbers_from_files(file_list)

    # Find the missing files in the sequence
    # NOTE(review): the sequence is checked from 1; if the site's first page is page 0,
    # a missing csv_page_0.csv would not be reported - confirm the expected start.
    max_number = max(file_numbers) if file_numbers else 0
    missing_files = missing_page_files(file_numbers)

    # Write the missing files to a text file (the file is written even when empty)
    with open(output_file, 'w') as file:
        file.writelines(f'{file_name}\n' for file_name in missing_files)

    # Output message
    if missing_files:
        print(f"Missing files written to {output_file}")
    else:
        print("No missing files found.")

except FileNotFoundError as e:
    print(f"Error: {e}")


No missing files found.
