# Scraping the Legislative Observatory: scraping all information available on each procedure except the legislative texts themselves

## 1. Library import

In [1]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import ast

## 2. Chrome driver initialization and creation of a list of urls to scrape

In [None]:
# Dataframe containing only one row, whose 'urls' column holds the full list
# of procedure URLs stored as a stringified Python list
df = pd.read_csv('list_urls_2025.csv') # Update : December 2025

In [3]:
# The CSV's single row stores the URL list as a string: parse it back into a Python list
urls_total = ast.literal_eval(df.loc[0, 'urls'])

# Keep only the ordinary legislative procedures, i.e. URLs containing "COD"
cod_pattern = re.compile(r'COD')
urls_cod = [candidate for candidate in urls_total if cod_pattern.search(candidate)]

# Split the filtered URLs into consecutive batches of `list_size` URLs
# (the last batch holds whatever remains and may be shorter)
list_size = 50
lists = [
    urls_cod[start:start + list_size]
    for start in range(0, len(urls_cod), list_size)
]
print(len(lists))

3


In [4]:
# Start a Chrome session: ChromeDriverManager().install() provides the
# chromedriver path that the Selenium Service is built from
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))

## 3. Scraping the Legislative Observatory

In [5]:
# Function to extract the reference from the URL
def extract_reference(url):
    """Extract the procedure reference (e.g. "2025/0255(COD)") from a procedure-file URL.

    The reference is read from the ``reference=`` query parameter of an
    ``en/procedure-file`` URL. The procedure number may carry a single letter
    suffix (e.g. "2011/0901B(COD)"), which is kept in the returned reference.

    Parameters
    ----------
    url : str
        URL of a procedure file.

    Returns
    -------
    str or None
        The reference formatted as "<year>/<number>(<type>)", or None when
        `url` is not a string or does not contain a recognisable reference.
    """
    # Non-string values (None, NaN, ...) cannot hold a reference; report "not found"
    # instead of letting re.search raise a TypeError.
    if not isinstance(url, str):
        return None

    # Groups: year / number (digits + optional letter suffix) / procedure type in parentheses
    match = re.search(
        r'en/procedure-file\?reference=(\d+)/(\d+[A-Za-z]?)\(([A-Za-z]+)\)',
        url,
    )
    if match is None:
        return None

    # Reconstruct the reference string in the same format as it appears in the URL
    year, number, procedure_type = match.groups()
    return f"{year}/{number}({procedure_type})"
    

# Function to extract texts and links from a cell into a list or string
def extract_cell_content(cell):
    """Extract the text and hyperlinks of a table cell.

    Parameters
    ----------
    cell : bs4 Tag-like object
        The cell to read; must provide ``find_all``, ``contents`` and ``get_text``.

    Returns
    -------
    str or list of dict
        - If the cell contains no ``<a>`` element: the cell text, with text
          fragments joined by a single space.
        - Otherwise: a list, in document order, with one dict per element found:
          ``{"text": ..., "href": ...}`` for each link and ``{"text": ...}``
          for each non-empty piece of text outside a link.
    """
    # No link anywhere in the cell: plain text is enough
    if not cell.find_all("a"):
        return cell.get_text(" ", strip=True)

    values = []
    # find_all() detects links at any depth, so the walk must also descend into
    # wrapper elements (<span>, <p>, ...); looking only at direct children would
    # silently drop links and text nested inside them.
    # Explicit stack = depth-first traversal in document order.
    pending = list(reversed(cell.contents))
    while pending:
        node = pending.pop()
        # getattr with a default: text nodes may not expose a `name` attribute
        if getattr(node, "name", None) == "a":
            # Link: keep its visible text and its target
            values.append({
                "text": node.get_text(strip=True),
                "href": node.get("href")
            })
        elif isinstance(node, str):
            # Plain text node: keep it only if something is left after stripping
            text = node.strip()
            if text:
                values.append({"text": text})
        else:
            # Any other element: visit its children next, preserving their order
            pending.extend(reversed(getattr(node, "contents", None) or []))
    return values
    
def clean_text(text):
    """Return `text` as a string without square brackets or surrounding whitespace.

    Any falsy input (None, "", empty list, 0, ...) yields an empty string.
    Non-string input is converted with str() before cleaning.
    """
    if not text:
        return ""
    as_string = str(text)
    # Drop every "[" and "]" character, then trim both ends
    without_brackets = as_string.replace("[", "").replace("]", "")
    return without_brackets.strip()


def parse_table_doc_gateway(table, institution_name):
    """
    Parse one "documentation gateway" table into a list of cleaned row dicts.

    Every returned dict starts with an "Institution" entry set to
    `institution_name`; all other values are the cell contents passed through
    extract_cell_content() and then clean_text().

    - Table with <thead> headers: one dict per <tbody> row that has at least
      as many cells as there are headers (shorter rows are skipped; extra
      cells are ignored).
    - Table without headers: rows made of a <th> and a <td> are read as
      key/value pairs into a single dict; an empty list is returned when no
      such pair is found.
    """
    column_names = [th.get_text(strip=True) for th in table.select("thead th")]

    # Key-value layout: no header row at all
    if not column_names:
        base_entry = {"Institution": institution_name}
        pairs = dict(base_entry)
        for tr in table.select("tr"):
            label_cell = tr.find("th")
            value_cell = tr.find("td")
            if not (label_cell and value_cell):
                continue
            label = label_cell.get_text(strip=True)
            pairs[label] = clean_text(extract_cell_content(value_cell))
        # Nothing beyond the institution itself means the table held no data
        if pairs == base_entry:
            return []
        return [pairs]

    # Row-based layout: map each header to the cell at the same position
    parsed_rows = []
    for tr in table.select("tbody tr"):
        cells = tr.find_all(["td", "th"])
        if len(cells) < len(column_names):
            continue
        entry = {"Institution": institution_name}
        for column_name, cell in zip(column_names, cells):
            entry[column_name] = clean_text(extract_cell_content(cell))
        parsed_rows.append(entry)
    return parsed_rows


# Function to parse a table into a list of dicts or a dict
def parse_table(table):
    """
    Convert an HTML table into Python data.

    Returns
      - a list of dicts (one per <tbody> row) when the table has <thead>
        headers,
      - a single dict when it has none, built from rows that pair a <th>
        (key) with a <td> (value).

    Cell values are produced by extract_cell_content().
    """
    column_names = [th.get_text(strip=True) for th in table.select("thead th")]

    # No header row: treat the table as key-value pairs
    if not column_names:
        pairs = {}
        for tr in table.select("tr"):
            label_cell, value_cell = tr.find("th"), tr.find("td")
            if label_cell and value_cell:
                pairs[label_cell.get_text(strip=True)] = extract_cell_content(value_cell)
        return pairs

    # Header row present: one dict per body row
    n_columns = len(column_names)
    parsed_rows = []
    for tr in table.select("tbody tr"):
        cells = tr.find_all(["td", "th"])

        # A lone cell that does not line up with the headers is stored under "Note"
        if len(cells) != n_columns and len(cells) == 1:
            parsed_rows.append({"Note": extract_cell_content(cells[0])})
            continue

        # Otherwise pair each cell with its header; cells beyond the last
        # header get a positional "Column N" label
        record = {}
        for position, cell in enumerate(cells):
            if position < n_columns:
                label = column_names[position]
            else:
                label = f"Column {position+1}"
            record[label] = extract_cell_content(cell)
        parsed_rows.append(record)
    return parsed_rows


def scrape_key_players(soup):
    """Extract the "Key players" accordion, including shadow rapporteurs.

    Returns the string 'N/A' when the page has no '#erplAccordionKeyPlayers'
    element. Otherwise returns a dict mapping each accordion item's button
    text to a list of row dicts (keyed by the table headers) collected from
    all tables of that item; an item without tables maps to an empty list.

    NOTE(review): an accordion item without a <button> is not handled here
    (the button lookup result is used directly) - confirm this cannot occur.
    """
    section = soup.select_one('#erplAccordionKeyPlayers')
    if not section:
        return 'N/A'

    players_by_institution = {}
    for item in section.select('ul > li.es_accordion-item'):
        institution = item.find('button').get_text(strip=True)
        entries = []

        for table in item.select('table'):
            column_names = [th.get_text(" ", strip=True) for th in table.select('thead th')]
            # Most recent regular row of this table: shadow rapporteurs attach to it
            previous_entry = None

            for row in table.select('tbody tr'):
                if "Pending final decision" in row.get_text():
                    # Pending row: every column carries the same placeholder text
                    entries.append({name: "Pending final decision" for name in column_names})
                elif row.select_one('#collapseShadowRapporteur'):
                    # Shadow rapporteur row: merged into the previous regular entry
                    shadows = [a.get_text(" ", strip=True) for a in row.select('a.rapporteur')]
                    if previous_entry and shadows:
                        previous_entry["Shadow rapporteurs"] = shadows
                else:
                    # Regular row: pair headers with cells, up to the shorter of the two
                    cells = row.find_all(['th', 'td'])
                    if cells:
                        entry = {
                            name: " ".join(cell.stripped_strings)
                            for name, cell in zip(column_names, cells)
                        }
                        entries.append(entry)
                        previous_entry = entry

        players_by_institution[institution] = entries
    return players_by_institution

In [6]:
# Bookkeeping for URLs that were not scraped correctly
url_not_found = []            # URLs with no parsable reference, or whose scraping raised
url_no_title = []             # URLs whose page had no title element
url_missing_title_span = {}   # dict with the following form: {url: [sections_with_missing_title_span]}

# Empty results frame: one column per scraped field, filled batch by batch
result_columns = [
    'url',
    'reference',
    'title',
    'subjects',
    'key_players',
    'key_events',
    'technical_info',
    'documentation_gateway',
    'transparency',
    'final_act',
]
df = pd.DataFrame(columns=result_columns)

In [None]:
# Scrape every batch of URLs.
# One result row is produced per URL: the scraped fields, or an all-'N/A'
# placeholder when the reference cannot be parsed or scraping raises.
# The cumulative results are written to a progress CSV after each batch.


def _placeholder_row(url):
    """Return an all-'N/A' result row for a URL that could not be scraped."""
    return {
        'url': url,
        'reference': 'N/A',
        'title': 'N/A',
        'subjects': 'N/A',
        'key_players': 'N/A',
        'key_events': 'N/A',
        'technical_info': 'N/A',
        'documentation_gateway': 'N/A',
        'transparency': 'N/A',
        'final_act': 'N/A'
    }


def _as_row_list(parsed):
    """Normalize parse_table() output to a list of row dicts.

    parse_table() returns a list for header-based tables but a single dict for
    key-value tables; iterating over (or extend()-ing with) that dict would
    only see its keys, so it is wrapped in a list here.
    """
    if isinstance(parsed, dict):
        return [parsed] if parsed else []
    return parsed


def _doc_sort_key(doc):
    """Sort key for documentation rows: the 'Date' field parsed as dd/mm/YYYY.

    Rows with a missing, empty or unparsable date sort as 01/01/1900, so one
    malformed date cannot make the whole page fail.
    """
    try:
        return datetime.strptime(doc.get("Date") or "01/01/1900", "%d/%m/%Y")
    except (TypeError, ValueError):
        return datetime(1900, 1, 1)


try:
    # Loop over each list of URLs (j is the 1-based batch number)
    for j, urls in enumerate(lists, start=1):
        print(f'Processing list {j} of {len(lists)}')

        # Temporary storage for rows collected from this batch
        batch_rows = []

        # Loop through each URL in the current list
        for url in urls:
            try:
                # Extract reference from the URL
                reference_ = extract_reference(url)

                # If no reference found, log and skip to next
                if not reference_:
                    url_not_found.append(url)
                    batch_rows.append(_placeholder_row(url))
                    continue

                # Open the webpage using Selenium
                driver.get(url)
                time.sleep(1)  # Allow time for page to load fully
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Scrape the title of the procedure
                title_element = soup.find('h2', class_="es_title-h2 mb-3")
                title = title_element.text.strip() if title_element else 'N/A'
                if title == 'N/A':
                    url_no_title.append(url)

                # Scrape the subjects of the procedure
                subjects = 'N/A'
                subj_label = soup.find("p", class_="font-weight-bold mb-1", string="Subject")
                if subj_label:
                    subj_value = subj_label.find_next_sibling()
                    if subj_value:
                        # ", " as separator inserts a comma between each text
                        # element found (the codes and labels)
                        subjects = subj_value.get_text(", ", strip=True)

                # Scrape the key players section (institutions involved)
                key_players = scrape_key_players(soup)

                # Scrape key events (chronological information)
                key_events = 'N/A'
                key_events_section = soup.find("h2", string=lambda x: x and "Key events" in x)
                if key_events_section:
                    key_events = {}
                    table = key_events_section.find_next("table")
                    if table:
                        # Group rows by date; rows without a "Date" get a generated key
                        for row in _as_row_list(parse_table(table)):
                            date_key = row.get("Date", f"event_{len(key_events)+1}")
                            key_events.setdefault(date_key, []).append(row)

                # Scrape technical information section
                tech_info = 'N/A'
                tech_info_section = soup.find("h2", string=lambda x: x and "Technical information" in x)
                if tech_info_section:
                    table = tech_info_section.find_next("table")
                    if table:
                        tech_info = parse_table(table)

                # Scrape documentation gateway (one accordion item per institution)
                section_doc = soup.find("div", id="section6")
                doc_list = []
                if section_doc:
                    for li in section_doc.find_all("li", class_="es_accordion-item"):
                        title_span = li.select_one("button span.t-x")
                        if not title_span:
                            # Record the skipped item, as done for the transparency section
                            url_missing_title_span.setdefault(url, []).append("Documentation gateway")
                            continue

                        inst_name = title_span.get_text(strip=True)
                        for table in li.select("table"):
                            # Returns a list of clean dicts including the institution
                            doc_list.extend(parse_table_doc_gateway(table, inst_name))

                    # Sort the documents chronologically before saving to the DF
                    doc_list.sort(key=_doc_sort_key)

                documentation = doc_list if doc_list else 'N/A'

                # Scrape transparency section (if available)
                transp = 'N/A'
                section_transp = soup.find("div", id="section8")
                if section_transp:
                    transp = {}
                    for li in section_transp.find_all("li", class_="es_accordion-item"):
                        title_span = li.select_one("button span.t-x")
                        if not title_span:
                            url_missing_title_span.setdefault(url, []).append("Transparency")
                            continue
                        name = title_span.get_text(strip=True)
                        transp[name] = []
                        for table in li.select("table"):
                            transp[name].extend(_as_row_list(parse_table(table)))

                # Scrape final act section (links to final documents)
                final_act = 'N/A'
                section_final_act = soup.find("div", id="section9")
                if section_final_act:
                    final_act = {}
                    for li in section_final_act.select("div.es_links-list ul li"):
                        for link in li.find_all("a", href=True):
                            key = link.get_text(strip=True)
                            final_act[key] = link["href"]

                # Store all scraped data into one dictionary for this URL
                batch_rows.append({
                    'url': url,
                    'reference': reference_,
                    'title': title,
                    'subjects': subjects,
                    'key_players': key_players,
                    'key_events': key_events,
                    'technical_info': tech_info,
                    'documentation_gateway': documentation,
                    'transparency': transp,
                    'final_act': final_act
                })

            # Best-effort scraping: any failure on one URL is reported, logged in
            # url_not_found and replaced by a placeholder row, then the loop goes on
            except Exception as e:
                print(f"Error processing {url}: {e}")
                url_not_found.append(url)
                batch_rows.append(_placeholder_row(url))

        # After processing one list of URLs, save intermediate progress
        if batch_rows:
            batch_df = pd.DataFrame(batch_rows)
            # Append current batch to the main dataframe
            df = pd.concat([df, batch_df], ignore_index=True)

            # Checkpoint: the file holds all rows scraped so far, not only this batch
            df.to_csv(f"scrape_progress_batch_2025_{j}.csv", index=False)
            print(f"Batch {j} saved with {len(batch_rows)} URLs")
finally:
    # Always close the browser, even if the run is interrupted or fails;
    # the driver initialization cell must be re-run before scraping again
    driver.quit()

print('Scraping complete.')

Processing list 1 of 3
Batch 1 saved with 50 URLs
Processing list 2 of 3
Batch 2 saved with 50 URLs
Processing list 3 of 3
Batch 3 saved with 19 URLs
Scraping complete.


In [8]:
# Save the full scraped dataset (all batches) to a single CSV file, without the index column
df.to_csv("final_scrape_2025.csv", index=False)

In [9]:
# Turn the two URL tracking lists into single-column DataFrames
df_not_found = pd.DataFrame({'url': url_not_found})
df_no_title = pd.DataFrame({'url': url_no_title})

# Flatten {url: [sections]} into one (url, section) record per missing title span
missing_span_records = [
    (page_url, section_name)
    for page_url, section_names in url_missing_title_span.items()
    for section_name in section_names
]
df_missing_title_span = pd.DataFrame(
    missing_span_records,
    columns=['url', 'missing_section'],
)

# Write each tracking table to its own CSV file (no index column)
df_not_found.to_csv("urls_not_found_2025.csv", index=False)
df_no_title.to_csv("urls_no_title_2025.csv", index=False)
df_missing_title_span.to_csv("urls_missing_title_span_2025.csv", index=False)

In [10]:
# Display the scraped data (one row per URL; the notebook shows a truncated view of long frames)
df

Unnamed: 0,url,reference,title,subjects,key_players,key_events,technical_info,documentation_gateway,transparency,final_act
0,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0255(COD),Justice programme 2028-2034,7.40.02 Judicial cooperation in civil and comm...,{'European Parliament': [{'Joint committee res...,"{'03/09/2025': [{'Date': '03/09/2025', 'Event'...","{'Procedure reference': '2025/0255(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,
1,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0405(COD),Placing on the market of genetically modified ...,"3.10.09.06 Agro-genetics, GMOs, 4.60.02 Consum...",{'European Parliament': [{'Joint committee res...,"{'16/12/2025': [{'Date': '16/12/2025', 'Event'...","{'Procedure reference': '2025/0405(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,
2,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0540(COD),"Union support for asylum, migration and integr...",7.10 Free movement and integration of third-co...,{'European Parliament': [{'Committee responsib...,"{'16/07/2025': [{'Date': '16/07/2025', 'Event'...","{'Procedure reference': '2025/0540(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",{'Other Members': [{'Name': [{'text': 'ASENS L...,
3,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0550(COD),'AgoraEU' programme 2028–2034,"1.20 Citizen's rights, 4.10.03 Child protectio...",{'European Parliament': [{'Joint committee res...,"{'16/07/2025': [{'Date': '16/07/2025', 'Event'...","{'Procedure reference': '2025/0550(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...","{'Rapporteurs, Shadow Rapporteurs and Committe...",
4,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0227(COD),Global Europe,"6.20 Common commercial policy in general, 8.70...",{'European Parliament': [{'Joint committee res...,"{'16/07/2025': [{'Date': '16/07/2025', 'Event'...","{'Procedure reference': '2025/0227(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,
...,...,...,...,...,...,...,...,...,...,...
114,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0232(COD),Protection of workers from the risks related t...,"4.15.15 Health and safety at work, occupationa...",{'European Parliament': [{'Committee responsib...,"{'18/07/2025': [{'Date': '18/07/2025', 'Event'...","{'Procedure reference': '2025/0232(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...","{'Rapporteurs, Shadow Rapporteurs and Committe...",
115,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0074(COD),Extension of the timeframe for the establishme...,"7.30.30 Action to combat crime, 7.40.04 Judici...",{'European Parliament': [{'Committee responsib...,"{'02/04/2025': [{'Date': '02/04/2025', 'Event'...","{'Procedure reference': '2025/0074(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,{'pdfFinal act': '/oeil/en/procedure-file/pdf?...
116,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0056(COD),Common rules for imports: suspension of certai...,"6.20.02 Export/import control, trade defence, ...",{'European Parliament': [{'Committee responsib...,"{'07/03/2025': [{'Date': '07/03/2025', 'Event'...","{'Procedure reference': '2025/0056(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,{'pdfFinal act': '/oeil/en/procedure-file/pdf?...
117,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0404(COD),Simplifying and reducing the burden of the rul...,3.30.06 Information and communication technolo...,{'European Parliament': [{'Committee responsib...,"{'16/12/2025': [{'Date': '16/12/2025', 'Event'...","{'Procedure reference': '2025/0404(COD)', 'Pro...","[{'Institution': 'European Commission', 'Docum...",,
