In [69]:
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import os
import json
from tqdm import tqdm

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Set up Splinter
# service = webdriver.chrome.service.Service(ChromeDriverManager().install())
browser = Browser('chrome', headless=False)

In [70]:
def get_insee_url(taille=1000, debut=0, categorie=2):
    return f'https://www.insee.fr/fr/statistiques?taille={taille}&debut={debut}&categorie={categorie}'

# Generate array of URLs for all 27000 studies
total_studies = 27000
page_size = 1000  # Maximum number of results per page
urls = []

for offset in range(0, total_studies, page_size):
    url = get_insee_url(taille=page_size, debut=offset, categorie=2)
    urls.append(url)
    
print(f"Generated {len(urls)} URLs to fetch all {total_studies} studies")

Generated 27 URLs to fetch all 27000 studies


In [74]:
# Wait for the table with id "documents" to appear
all_articles = []  # Initialize the main array to store all article data

for url in urls:
    try:
        # Reset variables for each URL
        max_attempts = 10
        attempts = 0
        table_found = False
        url_articles = []  # Articles found on this specific URL
        
        browser.visit(url)
        time.sleep(15)
        while attempts < max_attempts and not table_found:
            soup = BeautifulSoup(browser.html, 'html.parser')
            table = soup.find('table', {'id': 'documents'})
        
            if table:
                table_found = True
                # print("Table with ID 'documents' found!")
                
                # Find all rows with class "cliquable"
                cliquable_rows = table.find_all('tr')
                # print(f"Found {len(cliquable_rows)} cliquable rows")
                
                # Extract and print data from each row
                for row in cliquable_rows:
                    row_id = row.get('id')
                    row_content = {}
                    if row_id:
                        # Extract data from the row
                        row_content['id'] = row_id
                        
                        # Extract link
                        link_element = row.select_one('a.echo-lien')
                        if link_element:
                            row_content['url'] = link_element.get('href')
                        
                        # Extract category and date
                        category_element = row.select_one('.echo-categorie-publication')
                        date_element = row.select_one('.echo-date')
                        if category_element:
                            row_content['category'] = category_element.text.strip()
                        if date_element:
                            row_content['date'] = date_element.text.strip()
                        
                        # Extract title
                        title_element = row.select_one('.echo-titre')
                        if title_element:
                            row_content['title'] = title_element.text.strip()
                        
                        # Extract summary
                        summary_element = row.select_one('.echo-chapo')
                        if summary_element:
                            row_content['summary'] = summary_element.text.strip()
                        
                        # Add the row data to the URL articles array
                        url_articles.append(row_content)
                        
                        # print(f"Saved data for article {row_id} to data/articles.json")
            else:
                print(f"Waiting for table to appear... (Attempt {attempts+1}/{max_attempts})")
                time.sleep(2)  # Wait 2 seconds before checking again
                attempts += 1

        if not table_found:
            print(f"Table with ID 'documents' did not appear for URL: {url}")
        else:
            # Add this URL's articles to our main list
            all_articles.extend(url_articles)
            print(f"Found {len(url_articles)} articles on {url}")

        # Save the data to a JSON file after processing each URL
        # First check if file exists to determine if we're appending
        existing_articles = []
        if os.path.exists('data/articles.json'):
            with open('data/articles.json', 'r', encoding='utf-8') as f:
                existing_articles = json.load(f)
            
        # Combine existing articles with new ones, avoiding duplicates
        article_ids = {article['id'] for article in existing_articles}
        new_articles_added = 0
        
        for article in all_articles:
            if article['id'] not in article_ids:
                existing_articles.append(article)
                article_ids.add(article['id'])
                new_articles_added += 1
                
        # Write the combined data back to the file
        with open('data/articles.json', 'w', encoding='utf-8') as f:
            json.dump(existing_articles, f, ensure_ascii=False, indent=4)
    except Exception as e:
        print(f"Error processing URL {url}: {str(e)}")

    print(f'Saved all articles. Total length: {len(existing_articles)} (Added {new_articles_added} new articles)')

Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=0&categorie=2
Saved all articles. Total length: 1000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=1000&categorie=2
Saved all articles. Total length: 2000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=2000&categorie=2
Saved all articles. Total length: 3000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=3000&categorie=2
Saved all articles. Total length: 4000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=4000&categorie=2
Saved all articles. Total length: 5000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr/fr/statistiques?taille=1000&debut=5000&categorie=2
Saved all articles. Total length: 6000 (Added 1000 new articles)
Found 1000 articles on https://www.insee.fr

In [77]:
# Load the articles data from the JSON file
with open('data/articles.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(len(data))

if not os.path.exists('data/soup'):
    os.makedirs('data/soup')


# Visit each article URL and save the HTML content
for article in tqdm(data, desc="Processing articles"):
    article_id = article['id']
    article_url = 'https://www.insee.fr' + article['url']

    if '/fr/statistiques' not in article_url:
        continue
    
    # Skip if already downloaded
    if os.path.exists(f'data/soup/{article_id}.html'):
        continue
    
    try:
        browser.visit(article_url)
        
        # Wait for title element to appear
        max_attempts = 300
        attempts = 0
        title_found = False
        
        while attempts < max_attempts and not title_found:
            title_element = browser.find_by_css('.titre-titre')
            if title_element:
                title_found = True
            else:
                time.sleep(0.1)  # Wait a short time before checking again
                attempts += 1
        time.sleep(1)
        # Get the HTML content
        html_content = browser.html
        
        # Save the HTML content to a file
        with open(f'data/soup/{article_id}.html', 'w', encoding='utf-8') as f:
            f.write(html_content)
        
    except Exception as e:
        print(f"Error processing article {article_id}: {str(e)}")

26014


Processing articles:   4%|▍         | 1121/26014 [52:03<19:16:09,  2.79s/it]


KeyboardInterrupt: 

## Open all pages with soup

In [78]:
import os
import glob
from bs4 import BeautifulSoup

# List all HTML files in the data/soup directory
soup_files = glob.glob('data/soup/*.html')

# Create an array to store all soup objects
all_soups = []

# Process each HTML file
for file_path in tqdm(soup_files, desc="Loading HTML files"):
    try:
        # Read the HTML content
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Add the soup object to the array along with the file ID
        article_id = os.path.basename(file_path).replace('.html', '')
        all_soups.append({
            'id': article_id,
            'soup': soup
        })
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")

print(f"Loaded {len(all_soups)} HTML files as soup objects")


Loading HTML files: 100%|██████████| 1111/1111 [01:26<00:00, 12.88it/s]

Loaded 1111 HTML files as soup objects





## Extract functions

### Function to list all children elements from parent

In [112]:
from tqdm import tqdm

def analyze_element_distribution(all_soups, selector, max_depth=1):
    """
    Analyze the distribution of HTML elements across all soups.
    
    Args:
        all_soups: List of all soup objects
        selector: CSS selector to find elements
        max_depth: Maximum depth of children to display (default: 1)
    """
    element_counts = {}
    element_first_id = {}  # Store the first ID for each element type
    
    def count_elements(element, current_depth=0, path="", soup_id=None):
        if current_depth >= max_depth:
            return
            
        element_key = f"{path}:{element.name}:{','.join(element.get('class', []))}"
        element_counts[element_key] = element_counts.get(element_key, 0) + 1
        
        # Store the first soup ID where this element appears
        if element_key not in element_first_id and soup_id:
            element_first_id[element_key] = soup_id
        
        for child in element.children:
            if child.name is not None:  # Skip NavigableString objects
                child_path = f"{path}/{child.name}"
                count_elements(child, current_depth + 1, child_path, soup_id)
    
    # Process all soups with tqdm progress bar
    for soup_obj in tqdm(all_soups, desc="Analyzing soups"):
        soup = soup_obj['soup']
        soup_id = soup_obj['id']
        elements = soup.select(selector)
        
        for element in elements:
            count_elements(element, 0, element.name, soup_id)
    
    # Print results sorted by frequency
    print(f"Element distribution for selector '{selector}' across {len(all_soups)} pages:")
    for element, count in sorted(element_counts.items(), key=lambda x: x[1], reverse=True):
        first_id = element_first_id.get(element, "N/A")
        print(f"{element}: {count} ({count/len(all_soups)*100:.1f}%) - First ID: {first_id}")

# Example usage
# analyze_element_distribution([all_soups[0]], 'main.main')

# Premier pave bleu .titre-page


# Analyze across all soups
print("\nAnalyzing across all soups:")
analyze_element_distribution(all_soups, 'div#contenu.contenu', max_depth=1)


Analyzing across all soups:


Analyzing soups: 100%|██████████| 1111/1111 [00:04<00:00, 225.84it/s]

Element distribution for selector 'div#contenu.contenu' across 1111 pages:
div:div:contenu,template-impression,impression-nouvelle: 481 (43.3%) - First ID: 8205287
div:div:contenu,information-rapide,template-impression,impression-nouvelle: 275 (24.8%) - First ID: 8259140
div:div:contenu: 65 (5.9%) - First ID: 8264844





In [142]:
def process_graphique_element(graphique, target_id, index=0):
    """
    Process a graphique element, display and save its content.
    
    Args:
        graphique: BeautifulSoup element representing the graphique
        target_id: ID of the parent document for file naming
        index: Index of the graphique for naming (default: 0)
    
    Returns:
        Dictionary with extracted information about the graphique
    """
    import os
    import textwrap
    import re
    import requests
    from IPython.display import SVG, Image, display
    
    # Extract title more robustly - look in parent containers if needed
    title_element = graphique.select_one('h3, h2, figcaption, .title')
    if not title_element:
        # Look in parent containers for title
        parent = graphique.parent
        for _ in range(3):  # Check up to 3 levels up
            if parent:
                title_element = parent.select_one('h3, h2, figcaption, .title')
                if title_element:
                    break
                parent = parent.parent
    
    title_text = title_element.get_text().strip() if title_element else f"Graphique {index+1}"
    
    # Extract all paragraphs that might contain captions
    all_text_elements = graphique.find_all(['p', 'div.note', 'div.caption', 'figcaption'])
    
    # If nothing found in direct children, look in siblings or parent's children
    if not all_text_elements:
        # Try siblings
        for sibling in graphique.find_next_siblings(['div', 'p', 'figcaption']):
            all_text_elements.append(sibling)
        
        # If still nothing, try parent's children
        if not all_text_elements and graphique.parent:
            parent_elements = graphique.parent.find_all(['p', 'div.note', 'div.caption', 'figcaption'])
            for elem in parent_elements:
                if elem != graphique:
                    all_text_elements.append(elem)
    
    # Process text elements to extract lecture, champ, source
    captions = []
    for elem in all_text_elements:
        text = elem.get_text().strip()
        if text and not elem.find('svg'):  # Skip if it contains SVG
            captions.append(text)
    
    caption_text = "\n".join(captions)
    
    # Create directory if it doesn't exist
    os.makedirs('saved_images', exist_ok=True)
    
    result = {
        'title': title_text,
        'captions': caption_text,
        'files': []
    }
    
    # Try to display visual content separately if needed
    svg_content = graphique.select_one('svg')
    if svg_content:
        
        # Save SVG with title and caption included
        # Get the SVG as string
        svg_str = str(svg_content)
        
        # Parse SVG to get its viewBox dimensions
        viewbox_match = re.search(r'viewBox=["\']([\d\s.-]+)["\']', svg_str)
        if viewbox_match:
            viewbox = viewbox_match.group(1)
            viewbox_values = [float(v) for v in viewbox.split()]
            svg_width = viewbox_values[2]
            svg_height = viewbox_values[3]
        else:
            # Default dimensions if viewBox not found
            svg_width = 1000
            svg_height = 600
        
        # Calculate additional height needed for title and captions
        title_lines = len(textwrap.wrap(title_text, width=60))
        caption_lines = len(caption_text.split('\n'))
        
        total_height = svg_height + (title_lines * 30) + (caption_lines * 25) + 100  # Extra padding
        
        # Create a new SVG with title and caption
        svg_with_title = f'''<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" width="{svg_width}" height="{total_height}" viewBox="0 0 {svg_width} {total_height}">
  <!-- Title and Caption -->
  <style>
    .title {{ font-family: Arial, sans-serif; font-size: 18px; font-weight: bold; }}
    .caption {{ font-family: Arial, sans-serif; font-size: 14px; }}
  </style>
  
  <!-- Title with wrapping -->
'''
        
        # Add wrapped title
        wrapped_title = textwrap.wrap(title_text, width=60)
        for idx, line in enumerate(wrapped_title):
            svg_with_title += f'  <text x="50" y="{30 + idx * 25}" class="title">{line}</text>\n'
        
        # Calculate position for the original SVG content
        title_height = 20 + (len(wrapped_title) * 25)
        
        # Extract just the content of the original SVG (without the outer <svg> tags)
        svg_content_only = svg_str[svg_str.find('<svg')+4:svg_str.rfind('</svg>')]
        
        svg_with_title += f'''
  <!-- Original SVG content -->
  <g transform="translate(0, {title_height})">
    {svg_content_only}
  </g>
  
  <!-- Captions -->
'''
        
        # Add captions with wrapping
        y_pos = title_height + svg_height + 30
        for caption in caption_text.split('\n'):
            if caption.strip():
                wrapped_caption = textwrap.wrap(caption, width=90)
                for line in wrapped_caption:
                    svg_with_title += f'  <text x="50" y="{y_pos}" class="caption">{line}</text>\n'
                    y_pos += 20
                y_pos += 10  # Extra spacing between paragraphs
        
        svg_with_title += '</svg>'
        
        # Save the enhanced SVG
        filename = f'saved_images/graphique_{target_id}_{index+1}_with_title.svg'
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(svg_with_title)
        
        result['files'].append(filename)
    
    # If it contains an image instead of SVG
    img_content = graphique.select_one('img')
    if img_content and 'src' in img_content.attrs:
        img_src = img_content['src']
        # Handle relative URLs if needed
        if img_src.startswith('/'):
            img_src = f"https://dares.travail-emploi.gouv.fr{img_src}"
        
        # Download and save the image
        response = requests.get(img_src)
        if response.status_code == 200:
            filename = f'saved_images/image_{target_id}_{index+1}.png'
            with open(filename, 'wb') as f:
                f.write(response.content)
            result['files'].append(filename)
    
    return result


# Find the soup with id 8380766
target_id = '8380766'
target_soup = next((soup_obj for soup_obj in all_soups if soup_obj['id'] == target_id), None)

# Display the target soup
if target_soup:
    # Check if the soup contains any graphique element
    graphique_elements = target_soup['soup'].select('div[id^="graphique-figure"]')
    
    if graphique_elements:
        for i, graphique in enumerate(graphique_elements):
            process_graphique_element(graphique, target_id, i)

In [152]:
# Find the soup with id 8380766
target_id = '8380766'
target_soup = next((soup_obj for soup_obj in all_soups if soup_obj['id'] == target_id), None)['soup']
target_soup

<html class="safari" lang="fr" mozdisallowselectionprint="" moznomarginboxes=""><head>
<script async="" src="https://cdn.matomo.cloud/insee.matomo.cloud/container_j86K86K5.js"></script><script>
        var _mtm = window._mtm || [];
        var _paq = window._paq || [];
        _mtm.push({ 'mtm.startTime': (new Date().getTime()), event: 'mtm.Start' });
        _paq.push(['HeatmapSessionRecording::disable']);
        (function() {
            var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
            g.async=true; g.src='https://cdn.matomo.cloud/insee.matomo.cloud/container_j86K86K5.js'; s.parentNode.insertBefore(g,s);
        })();
    </script>
<title>Les pensions de retraite des femmes inférieures de 31 % à celles des hommes - Insee Analyses Provence-Alpes-Côte d'Azur - 141</title>
<meta content="Fin 2020 en Provence-Alpes-Côte d’Azur, près de 1,4 million de personnes perçoivent une pension de retraite. Les femmes, qui représentent plus de la moiti

In [None]:



# Find the soup with id 8380766
target_id = '8380766'
target_soup = next((soup_obj for soup_obj in all_soups if soup_obj['id'] == target_id), None)['soup'].select('div[id^="tableau-figure"]')
target_soup


markdown_table = html_table_to_markdown(html)
print(markdown_table)


### Article info

In [None]:
# Function to extract article information from the page
def extract_article_info(soup):
    article_info = {}
    
    # Extract title
    title_element = soup.select_one('.titre-titre')
    if title_element:
        article_info['title'] = title_element.text.strip()
    
    # Extract authors
    authors_element = soup.select_one('.auteurs')
    if authors_element:
        article_info['authors'] = authors_element.text.strip()
    
    # Extract description/chapeau
    description_element = soup.select_one('.paragraphe-chapeau')
    if description_element:
        article_info['description'] = description_element.text.strip()
    
    # Extract category
    category_element = soup.select_one('.categorie')
    if category_element:
        article_info['category'] = category_element.text.strip()
    
    # Extract number
    number_element = soup.select_one('.famille-numero')
    if number_element:
        article_info['number'] = number_element.text.strip()
    
    # Extract date
    date_element = soup.select_one('.date-diffusion')
    if date_element:
        # Extract only the date part, not the "Paru le" text
        date_text = date_element.text.strip()
        # Remove "Paru le" if present
        if "Paru le" in date_text:
            date_text = date_text.split("Paru le")[-1].strip()
        article_info['date'] = date_text
    
    # Extract link to collection
    collection_link = soup.select_one('.lien-details a')
    if collection_link:
        article_info['collection_link'] = 'https://www.insee.fr' + collection_link['href']
    
    # Extract downloadable files
    downloadable_files = []
    for file_element in soup.select('.donnee-telechargeable'):
        file_info = {}
        
        # Extract file name/type
        libelle_element = file_element.select_one('.libelle')
        if libelle_element:
            file_info['name'] = libelle_element.text.strip()
        
        # Extract file size
        size_element = file_element.select_one('.taille')
        if size_element:
            file_info['size'] = size_element.text.strip().replace('(', '').replace(')', '')
        
        # Extract file link
        link_element = file_element.find_parent('a')
        if link_element:
            file_info['link'] = 'https://www.insee.fr' + link_element['href']
        
        # Extract file type (pdf, xlsx, etc.)
        if 'link' in file_info:
            file_extension = file_info['link'].split('.')[-1]
            file_info['type'] = file_extension
        
        downloadable_files.append(file_info)
    
    article_info['downloadable_files'] = downloadable_files
    
    return article_info

### Convert table to markdown

In [173]:
def html_table_to_markdown(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    table = soup.find('table')
    
    if not table:
        return "No table found in the HTML."
    
    # Initialize the grid structure
    grid = []
    max_cols = 0
    
    # First pass: determine table dimensions and create grid
    rows = table.find_all('tr')
    # Track cells that are occupied due to rowspan
    occupied_cells = {}  # Format: {(row_idx, col_idx): True}
    
    # Process all rows to build the grid
    for row_idx, row in enumerate(rows):
        grid_row = []
        col_idx = 0
        
        cells = row.find_all(['th', 'td'])
        for cell in cells:
            # Skip occupied positions (from previous rowspans)
            while (row_idx, col_idx) in occupied_cells:
                grid_row.append(None)  # Placeholder for occupied cell
                col_idx += 1
            
            # Get cell content
            content = cell.get_text().strip().replace('\n', ' ')
            
            # Get rowspan and colspan (default to 1)
            rowspan = int(cell.get('rowspan', 1))
            colspan = int(cell.get('colspan', 1))
            
            # Add the content to the current position
            grid_row.append(content)
            
            # Mark positions as occupied for rowspan/colspan
            for r in range(rowspan):
                for c in range(colspan):
                    if r == 0 and c == 0:
                        continue  # Skip the current cell
                    occupied_pos = (row_idx + r, col_idx + c)
                    occupied_cells[occupied_pos] = True
            
            # Move column index forward by colspan
            col_idx += colspan
        
        # Update maximum columns if needed
        max_cols = max(max_cols, col_idx)
        grid.append(grid_row)
    
    # Second pass: normalize the grid (fill None values with empty strings)
    normalized_grid = []
    for row in grid:
        normalized_row = row + ['' for _ in range(max_cols - len(row))]
        normalized_grid.append(normalized_row)
    
    # Convert to markdown
    markdown_table = []
    
    # Determine if first row is header
    has_header = any(cell.name == 'th' for cell in rows[0].find_all(['th', 'td'])) if rows else False
    
    # Add rows to markdown table
    for i, row in enumerate(normalized_grid):
        markdown_row = '| ' + ' | '.join([str(cell) if cell is not None else '' for cell in row]) + ' |'
        markdown_table.append(markdown_row)
        
        # Add separator after header
        if i == 0 and has_header:
            markdown_table.append('| ' + ' | '.join(['---' for _ in range(max_cols)]) + ' |')
    
    return '\n'.join(markdown_table)


def extract_table_with_metadata(tableau_figure):
    """
    Extract title, table in markdown format, and caption from a tableau-figure div
    
    Args:
        tableau_figure: BeautifulSoup object representing a div with id starting with 'tableau-figure'
    
    Returns:
        dict: Dictionary containing title, table in markdown, and caption
    """
    result = {}
    
    # Extract title
    title_element = tableau_figure.select_one('.titre-figure')
    if title_element:
        result['title'] = title_element.text.strip()
    
    # Extract table and convert to markdown
    table_element = tableau_figure.select_one('table.tableau-produit')
    if table_element:
        result['table_markdown'] = html_table_to_markdown(str(table_element))
    
    # Extract caption
    caption_element = tableau_figure.select_one('figcaption ul.notes')
    if caption_element:
        notes = [li.text.strip() for li in caption_element.find_all('li')]
        result['caption'] = '\n'.join(notes)
    
    return result
    return result

### Graph to svg

In [172]:
def process_graphique_element(graphique, target_id, index=0):
    """
    Process a graphique element, extract and save its content.
    
    Args:
        graphique: BeautifulSoup element representing the graphique
        target_id: ID of the parent document for file naming
        index: Index of the graphique for naming (default: 0)
    
    Returns:
        Dictionary with extracted information about the graphique
    """
    import os
    import textwrap
    import re
    import requests
    
    # Extract title more robustly - look in parent containers if needed
    title_element = graphique.select_one('h3, h2, figcaption, .title')
    if not title_element:
        # Look in parent containers for title
        parent = graphique.parent
        for _ in range(3):  # Check up to 3 levels up
            if parent:
                title_element = parent.select_one('h3, h2, figcaption, .title')
                if title_element:
                    break
                parent = parent.parent
    
    title_text = title_element.get_text().strip() if title_element else f"Graphique {index+1}"
    
    # Extract all paragraphs that might contain captions
    all_text_elements = graphique.find_all(['p', 'div.note', 'div.caption', 'figcaption'])
    
    # If nothing found in direct children, look in siblings or parent's children
    if not all_text_elements:
        # Try siblings
        for sibling in graphique.find_next_siblings(['div', 'p', 'figcaption']):
            all_text_elements.append(sibling)
        
        # If still nothing, try parent's children
        if not all_text_elements and graphique.parent:
            parent_elements = graphique.parent.find_all(['p', 'div.note', 'div.caption', 'figcaption'])
            for elem in parent_elements:
                if elem != graphique:
                    all_text_elements.append(elem)
    
    # Process text elements to extract lecture, champ, source
    captions = []
    for elem in all_text_elements:
        text = elem.get_text().strip()
        if text and not elem.find('svg'):  # Skip if it contains SVG
            captions.append(text)
    
    caption_text = "\n".join(captions)
    
    # Create directory if it doesn't exist
    os.makedirs('saved_images', exist_ok=True)
    
    result = {
        'title': title_text,
        'captions': caption_text,
        'files': []
    }
    
    # Try to display visual content separately if needed
    svg_content = graphique.select_one('svg')
    if svg_content:
        
        # Save SVG with title and caption included
        # Get the SVG as string
        svg_str = str(svg_content)
        
        # Parse SVG to get its viewBox dimensions
        viewbox_match = re.search(r'viewBox=["\']([\d\s.-]+)["\']', svg_str)
        if viewbox_match:
            viewbox = viewbox_values = [float(v) for v in viewbox_match.group(1).split()]
            svg_width = viewbox_values[2]
            svg_height = viewbox_values[3]
        else:
            # Default dimensions if viewBox not found
            svg_width = 1000
            svg_height = 600
        
        # Calculate additional height needed for title and captions
        title_lines = len(textwrap.wrap(title_text, width=60))
        caption_lines = len(caption_text.split('\n'))
        
        total_height = svg_height + (title_lines * 30) + (caption_lines * 25) + 100  # Extra padding
        
        # Create a new SVG with title and caption
        svg_with_title = f'''<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" width="{svg_width}" height="{total_height}" viewBox="0 0 {svg_width} {total_height}">
  <!-- Title and Caption -->
  <style>
    .title {{ font-family: Arial, sans-serif; font-size: 18px; font-weight: bold; }}
    .caption {{ font-family: Arial, sans-serif; font-size: 14px; }}
  </style>
  
  <!-- Title with wrapping -->
'''
        
        # Add wrapped title
        wrapped_title = textwrap.wrap(title_text, width=60)
        for idx, line in enumerate(wrapped_title):
            svg_with_title += f'  <text x="50" y="{30 + idx * 25}" class="title">{line}</text>\n'
        
        # Calculate position for the original SVG content
        title_height = 20 + (len(wrapped_title) * 25)
        
        # Extract just the content of the original SVG (without the outer <svg> tags)
        svg_content_only = svg_str[svg_str.find('<svg')+4:svg_str.rfind('</svg>')]
        
        svg_with_title += f'''
  <!-- Original SVG content -->
  <g transform="translate(0, {title_height})">
    {svg_content_only}
  </g>
  
  <!-- Captions -->
'''
        
        # Add captions with wrapping
        y_pos = title_height + svg_height + 30
        for caption in caption_text.split('\n'):
            if caption.strip():
                wrapped_caption = textwrap.wrap(caption, width=90)
                for line in wrapped_caption:
                    svg_with_title += f'  <text x="50" y="{y_pos}" class="caption">{line}</text>\n'
                    y_pos += 20
                y_pos += 10  # Extra spacing between paragraphs
        
        svg_with_title += '</svg>'
        
        # Save the enhanced SVG
        filename = f'saved_images/graphique_{target_id}_{index+1}_with_title.svg'
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(svg_with_title)
        
        result['files'].append(filename)
    
    # If it contains an image instead of SVG
    img_content = graphique.select_one('img')
    if img_content and 'src' in img_content.attrs:
        img_src = img_content['src']
        # Handle relative URLs if needed
        if img_src.startswith('/'):
            img_src = f"https://dares.travail-emploi.gouv.fr{img_src}"
        
        # Download and save the image
        response = requests.get(img_src)
        if response.status_code == 200:
            filename = f'saved_images/image_{target_id}_{index+1}.png'
            with open(filename, 'wb') as f:
                f.write(response.content)
            result['files'].append(filename)
    
    return result

### Extract and remove graph and table

In [174]:

def extract_content_from_soup(soup, target_id):
    """
    Extract tables and figures from soup, save them, and return cleaned soup
    
    Args:
        soup: BeautifulSoup object to process
        target_id: ID for naming saved files
    
    Returns:
        tuple: (cleaned_soup, tables_markdown, figures_info)
    """
    # Create a copy of the soup to modify
    soup_copy = copy.copy(soup)
    
    # Extract tables
    tables_markdown = []
    table_elements = soup_copy.select('div[id^="tableau-figure"]')
    
    for i, table_element in enumerate(table_elements):
        table_data = extract_table_with_metadata(table_element)
        tables_markdown.append(table_data)
        # Remove the table from the soup
        table_element.decompose()
    
    # Extract figures/graphiques
    figures_info = []
    graphique_elements = soup_copy.select('div[id^="graphique"], div.graphique, figure')
    
    for i, graphique in enumerate(graphique_elements):
        figure_data = process_graphique_element(graphique, target_id, i)
        figures_info.append(figure_data)
        # Remove the figure from the soup
        graphique.decompose()
    
    return soup_copy, tables_markdown, figures_info


# Find the soup with id 8380766
target_id = '8380766'
target_soup = next((soup_obj for soup_obj in all_soups if soup_obj['id'] == target_id), None)['soup']

# Process the soup to extract tables and figures
import copy
cleaned_soup, tables_markdown, figures_info = extract_content_from_soup(target_soup, target_id)

In [182]:
# Find the soup with id 8380766
target_id = '8380766'
target_soup = next((soup_obj for soup_obj in all_soups if soup_obj['id'] == target_id), None)['soup']

# Select only the main content area
main_content = target_soup.select_one('main#contenu.main[role="main"]')

# Process the soup to extract tables and figures
import copy
cleaned_soup, tables_markdown, figures_info = extract_content_from_soup(main_content, target_id)

figures_info

[{'title': 'graphiqueFigure\xa01 – Décomposition de la pension de retraite selon le sexe, fin 2020',
  'captions': 'Note\xa0: Le champ «\xa0Ensemble des retraités\xa0» correspond aux retraités percevant un droit\n                                 direct ou un droit dérivé. Le champ «\xa0Retraités percevant un droit direct\xa0» correspond\n                                 aux retraités percevant au moins un droit direct, et respectivement au moins un droit\n                                 dérivé pour les «\xa0Retraités percevant un droit dérivé\xa0».\nLecture\xa0: Fin 2020, parmi l’ensemble des retraités percevant un droit direct ou un\n                                 droit dérivé, les femmes perçoivent en moyenne 1\xa0060\xa0euros bruts mensuels de droit\n                                 direct.\nChamp\xa0: Retraités bénéficiaires d’un droit direct ou d’un droit dérivé, résidant en\n                                 Provence-Alpes-Côte d’Azur, vivants au 31 décembre 2020.\nSource\xa0: 

## Rest

In [34]:
# Function to extract article information from the page
def extract_article_info(soup):
    article_info = {}
    
    # Extract title
    title_element = soup.select_one('.titre-titre')
    if title_element:
        article_info['title'] = title_element.text.strip()
    
    # Extract authors
    authors_element = soup.select_one('.auteurs')
    if authors_element:
        article_info['authors'] = authors_element.text.strip()
    
    # Extract description/chapeau
    description_element = soup.select_one('.paragraphe-chapeau')
    if description_element:
        article_info['description'] = description_element.text.strip()
    
    # Extract category
    category_element = soup.select_one('.categorie')
    if category_element:
        article_info['category'] = category_element.text.strip()
    
    # Extract number
    number_element = soup.select_one('.famille-numero')
    if number_element:
        article_info['number'] = number_element.text.strip()
    
    # Extract date
    date_element = soup.select_one('.date-diffusion')
    if date_element:
        # Extract only the date part, not the "Paru le" text
        date_text = date_element.text.strip()
        # Remove "Paru le" if present
        if "Paru le" in date_text:
            date_text = date_text.split("Paru le")[-1].strip()
        article_info['date'] = date_text
    
    # Extract link to collection
    collection_link = soup.select_one('.lien-details a')
    if collection_link:
        article_info['collection_link'] = 'https://www.insee.fr' + collection_link['href']
    
    # Extract downloadable files
    downloadable_files = []
    for file_element in soup.select('.donnee-telechargeable'):
        file_info = {}
        
        # Extract file name/type
        libelle_element = file_element.select_one('.libelle')
        if libelle_element:
            file_info['name'] = libelle_element.text.strip()
        
        # Extract file size
        size_element = file_element.select_one('.taille')
        if size_element:
            file_info['size'] = size_element.text.strip().replace('(', '').replace(')', '')
        
        # Extract file link
        link_element = file_element.find_parent('a')
        if link_element:
            file_info['link'] = 'https://www.insee.fr' + link_element['href']
        
        # Extract file type (pdf, xlsx, etc.)
        if 'link' in file_info:
            file_extension = file_info['link'].split('.')[-1]
            file_info['type'] = file_extension
        
        downloadable_files.append(file_info)
    
    article_info['downloadable_files'] = downloadable_files
    
    return article_info

def extract_breadcrumb(soup):
    """
    Extract the breadcrumb navigation path from the page.
    
    Args:
        soup: BeautifulSoup object containing the page HTML
        
    Returns:
        list: A list of dictionaries containing breadcrumb items with text and links
    """
    breadcrumb = []
    breadcrumb_items = soup.select('.fil-ariane li')
    
    for item in breadcrumb_items:
        breadcrumb_item = {}
        
        # Get the text content
        breadcrumb_item['text'] = item.text.strip()
        
        # Check if the item has a link
        link = item.select_one('a')
        if link:
            breadcrumb_item['link'] = 'https://www.insee.fr' + link['href'] if link['href'].startswith('/') else link['href']
        else:
            breadcrumb_item['link'] = None  # Current page has no link
        
        breadcrumb.append(breadcrumb_item)
    
    return breadcrumb



def extract_tabs(soup):
    """
    Extract tabs names and their href attributes from the page.
    
    Args:
        soup: BeautifulSoup object containing the page HTML
        
    Returns:
        list: A list of dictionaries containing tab names and their href attributes
    """
    tabs = []
    
    # Look specifically for the tabs with class "onglets" and only direct li children
    tab_elements = soup.select('ul.onglets > li.onglet')
    
    for tab in tab_elements:
        tab_info = {}
        link = tab.find('a')
        
        # Get the tab name/text and href if it contains 'onglet'
        if link and 'onglet' in link['href']:
            tab_info['name'] = link.text.strip()
            tab_info['href'] = link['href']
            tabs.append(tab_info)
    
    return tabs


# Extract information from the current page
soup = BeautifulSoup(browser.html, 'html.parser')
breadcrumb = extract_breadcrumb(soup)
article_info = extract_article_info(soup)
tabs = extract_tabs(soup)

# Display the extracted information
for tab in tabs:
    print(tab)
# print("breadcrumb", breadcrumb)
# print("Article Information:")
# article_info

{'name': 'Infographie', 'href': '#onglet-1'}
{'name': 'Étude', 'href': '#onglet-2'}
{'name': 'Documentation', 'href': '#onglet-3'}


In [56]:
def extract_sommaire(soup):
    tab_content = {}
    # Get the table of contents container
    bloc_sommaire = soup.select_one('div.bloc.sommaire#sommaire')
    
    # Extract the title
    title = bloc_sommaire.select_one('div.titre-sommaire').text.strip()
    
    # Extract all list items
    toc_items = []
    for li in bloc_sommaire.select('ol > li'):
        link = li.find('a')
        if link:
            toc_items.append({
                'title': link.text.strip(),
                'href': link.get('href', '')
            })
    
    
    return {
        'title': title,
        'items': toc_items
    }

extracted_content = extract_content(soup)

In [63]:
for element in soup.select('div.corps-publication'):
    # Get all child elements and print their type and class names
    for child in element.find_all(recursive=False):
        element_type = child.name
        class_names = child.get('class', [])
        class_str = ' '.join(class_names) if class_names else 'No class'
        print(f"Element: {element_type}, Class: {class_str}")

Element: div, Class: bloc images
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: div, Class: bloc figure
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: div, Class: bloc figure
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: div, Class: bloc figure
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: div, Class: bloc figure
Element: div, Class: bloc paragraphes
Element: div, Class: bloc figure
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element: div, Class: bloc paragraphes
Element: h2, Class: bloc intertitre-impression
Element:

| Secteur d’activité - Grand Est - Nombre - Travaux d’installation électrique, plomberie et autres travaux d’installation - Travaux de finition - Autres travaux de construction spécialisés (dont couverture et étanchéification) - Ensemble des travaux de construction spécialisés - Autres secteurs d’activité - Total | Établissements labellisés RGE - Grand Est - Répartition par secteur (en %) - 2 270 - 1 580 - 990 - 4 840 - 520 - 5 360 | Établissements labellisés RGE - Grand Est - Part des établissements RGE dans le secteur (en %) - 42 - 30 - 18 - 90 - 10 - 100 | Établissements labellisés RGE - Province - Part des établissements RGE dans le secteur (en %) - 13 - 6 - 6 - 8 - 0 - 1 | Établissements labellisés RGE - Grand Est - Nombre - 11 - 5 - 4 - 7 - 0 - 1 | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Répartition par secteur (en %) - 12 000 - 9 700 - 7 200 - 28 900 - 5 700 - 34 600 | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Part des sala

| Secteur d’activité - Grand Est - Nombre - Travaux d’installation électrique, plomberie et autres travaux d’installation - Travaux de finition - Autres travaux de construction spécialisés (dont couverture et étanchéification) - Ensemble des travaux de construction spécialisés - Autres secteurs d’activité - Total | Établissements labellisés RGE - Grand Est - Répartition par secteur (en %) - 2 270 - 1 580 - 990 - 4 840 - 520 - 5 360 | Établissements labellisés RGE - Grand Est - Part des établissements RGE dans le secteur (en %) - 42 - 30 - 18 - 90 - 10 - 100 | Établissements labellisés RGE - Province - Part des établissements RGE dans le secteur (en %) - 13 - 6 - 6 - 8 - 0 - 1 | Établissements labellisés RGE - Grand Est - Nombre - 11 - 5 - 4 - 7 - 0 - 1 | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Répartition par secteur (en %) - 12 000 - 9 700 - 7 200 - 28 900 - 5 700 - 34 600 | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Part des salariés RGE dans le secteur (en %) - 35 - 28 - 21 - 84 - 16 - 100 | Salariés en ETP dans les établissements labellisés RGE - Province - Part des salariés RGE dans le secteur (en %) - 38 - 38 - 31 - 36 - 0 - 2 | Salariés en ETP dans les établissements labellisés RGE - 36 - 35 - 23 - 31 - 0 - 2 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Travaux d’installation électrique, plomberie et autres travaux d’installation | 2 270 | 42 | 13 | 11 | 12 000 | 35 | 38 | 36 |
| Travaux de finition | 1 580 | 30 | 6 | 5 | 9 700 | 28 | 38 | 35 |
| Autres travaux de construction spécialisés (dont couverture et étanchéification) | 990 | 18 | 6 | 4 | 7 200 | 21 | 31 | 23 |
| Ensemble des travaux de construction spécialisés | 4 840 | 90 | 8 | 7 | 28 900 | 84 | 36 | 31 |
| Autres secteurs d’activité | 520 | 10 | 0 | 0 | 5 700 | 16 | 0 | 0 |
| Total | 5 360 | 100 | 1 | 1 | 34 600 | 100 | 2 | 2 |

In [38]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import json

def html_table_to_flattened_structure(html_string):
    """Convert HTML table with complex headers to flattened DataFrame and JSON"""
    soup = BeautifulSoup(html_string, 'html.parser')
    table = soup.find('table')
    
    # Get table caption/description
    caption = table.find('caption')
    description = caption.text.strip() if caption else ""
    
    # Track cell positions and content
    header_data = []
    max_rows = len(table.select('thead tr'))
    
    # Process header rows to build complete header structure
    for row_idx, row in enumerate(table.select('thead tr')):
        col_pos = 0
        for cell in row.find_all(['th', 'td']):
            # Get colspan and rowspan
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            cell_text = cell.get_text().strip()
            
            # Add cell info to our tracking structure
            header_data.append({
                'text': cell_text,
                'row': row_idx,
                'col': col_pos,
                'rowspan': rowspan,
                'colspan': colspan
            })
            
            col_pos += colspan
    
    # Determine total columns in table
    max_cols = max(cell['col'] + cell['colspan'] for cell in header_data)
    
    # Create a matrix to represent the header structure
    header_matrix = [[None for _ in range(max_cols)] for _ in range(max_rows)]
    
    # Fill the matrix with header values
    for cell in header_data:
        for r in range(cell['row'], cell['row'] + cell['rowspan']):
            for c in range(cell['col'], cell['col'] + cell['colspan']):
                if r < max_rows and c < max_cols:
                    header_matrix[r][c] = cell['text']
    
    # Generate flattened column names
    flattened_headers = []
    for col in range(max_cols):
        # Combine all non-None header values for this column
        header_parts = [header_matrix[row][col] for row in range(max_rows) 
                        if header_matrix[row][col] is not None]
        flattened_headers.append(' - '.join(header_parts))
    
    # Extract data rows
    rows_data = []
    for row in table.select('tbody tr'):
        cells = row.find_all(['th', 'td'])
        row_data = [cell.get_text().strip() for cell in cells]
        
        # Handle missing cells
        while len(row_data) < max_cols:
            row_data.append('')
            
        rows_data.append(row_data)
    
    # Create pandas DataFrame
    df = pd.DataFrame(rows_data, columns=flattened_headers)
    
    # Clean data - remove non-breaking spaces
    df = df.replace('\xa0', ' ', regex=True)
    # Remove spaces in numbers
    df = df.apply(lambda x: x.str.replace('\s+', '', regex=True) 
                  if x.dtype == 'object' else x)
    
    return {
        'description': description,
        'dataframe': df,
        'markdown': df.to_markdown(index=False),
        'json': json.loads(df.to_json(orient='records')),
        'csv': df.to_csv(index=False)
    }

# Example usage
result = html_table_to_flattened_structure(html)
print("MARKDOWN TABLE:")
print(result['markdown'])print("\nJSON (first record):")
print(json.dumps(result['json'][0], indent=2, ensure_ascii=False))


MARKDOWN TABLE:
| Secteur d’activité - Grand Est - Nombre                                  |   Établissements labellisés RGE - Grand Est - Répartition par secteur (en %) |   Établissements labellisés RGE - Grand Est - Part des établissements RGE dans le secteur (en %) |   Établissements labellisés RGE - Province - Part des établissements RGE dans le secteur (en %) |   Établissements labellisés RGE - Grand Est - Nombre |   Salariés en ETP dans les établissements labellisés RGE - Grand Est - Répartition par secteur (en %) |   Salariés en ETP dans les établissements labellisés RGE - Grand Est - Part des salariés RGE dans le secteur (en %) |   Salariés en ETP dans les établissements labellisés RGE - Province - Part des salariés RGE dans le secteur (en %) |   Salariés en ETP dans les établissements labellisés RGE |
|:-------------------------------------------------------------------------|-----------------------------------------------------------------------------:|-----------------------

  df = df.apply(lambda x: x.str.replace('\s+', '', regex=True)


In [40]:
def html_table_to_markdown(html_string):
    """Convert HTML table with complex headers to clean markdown format"""
    soup = BeautifulSoup(html_string, 'html.parser')
    table = soup.find('table')
    
    # Track header cells with their positions
    header_data = []
    header_rows = table.select('thead tr')
    max_rows = len(header_rows)
    
    # Process header rows to build complete header structure
    for row_idx, row in enumerate(header_rows):
        col_pos = 0
        for cell in row.find_all(['th', 'td']):
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            cell_text = cell.get_text().strip().replace('\xa0', ' ')
            
            header_data.append({
                'text': cell_text,
                'row': row_idx,
                'col': col_pos,
                'rowspan': rowspan,
                'colspan': colspan
            })
            col_pos += colspan
    
    # Determine total columns in table
    max_cols = max(cell['col'] + cell['colspan'] for cell in header_data)
    
    # Build a matrix representing the header structure
    header_matrix = [[None for _ in range(max_cols)] for _ in range(max_rows)]
    
    # Fill the matrix with header values
    for cell in header_data:
        for r in range(cell['row'], cell['row'] + cell['rowspan']):
            for c in range(cell['col'], cell['col'] + cell['colspan']):
                if r < max_rows and c < max_cols:
                    header_matrix[r][c] = cell['text']
    
    # Generate flattened column names
    flattened_headers = []
    for col in range(max_cols):
        # Combine all non-None header values for this column
        header_parts = [header_matrix[row][col] for row in range(max_rows) 
                        if header_matrix[row][col] is not None]
        flattened_headers.append(' - '.join(header_parts))
    
    # Start building markdown table
    markdown_table = []
    markdown_table.append('| ' + ' | '.join(flattened_headers) + ' |')
    markdown_table.append('| ' + ' | '.join(['---' for _ in range(max_cols)]) + ' |')
    
    # Extract data rows
    for row in table.select('tbody tr'):
        cells = row.find_all(['th', 'td'])
        row_data = [cell.get_text().strip().replace('\xa0', ' ').replace('\n', ' ') for cell in cells]
        
        # Handle case where cells are fewer than headers
        while len(row_data) < max_cols:
            row_data.append('')
            
        markdown_table.append('| ' + ' | '.join(row_data) + ' |')
    
    return '\n'.join(markdown_table)

# Convert the table and print the result
markdown_result = html_table_to_markdown(html)

| Secteur d’activité - Grand Est - Nombre | Établissements labellisés RGE - Grand Est - Répartition par secteur (en %) | Établissements labellisés RGE - Grand Est - Part des établissements RGE dans le secteur (en %) | Établissements labellisés RGE - Province - Part des établissements RGE dans le secteur (en %) | Établissements labellisés RGE - Grand Est - Nombre | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Répartition par secteur (en %) | Salariés en ETP dans les établissements labellisés RGE - Grand Est - Part des salariés RGE dans le secteur (en %) | Salariés en ETP dans les établissements labellisés RGE - Province - Part des salariés RGE dans le secteur (en %) | Salariés en ETP dans les établissements labellisés RGE |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Travaux d’installation électrique, plomberie et autres travaux d’installation | 2 270 | 42 | 13 | 11 | 12 000 | 35 | 38 | 36 |
| Travaux de finition | 1 580 | 30 | 6 | 5 | 9 700 | 28 | 