# Scraping the URLs: year 2025

## 1. Library import

In [1]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## 2. Chrome initializer and creation of a list of URLs to scrape

In [None]:
# Values for the `procedureStage` query parameter of the search URL. The search
# is run once per stage so that each request returns a smaller result list
# (the intent being to scrape sequentially and avoid being blocked).
# NOTE(review): '472' used to appear twice in this list; the second occurrence
# only repeated an identical request and has been removed. If it was a typo
# for another stage code, add the intended code here.
PROCEDURE_STAGE = [
    '452', '472', '475', '594000', '595000', '478', '600000', '10618',
    '479', '496', '489', '490', '556001', '556002', '491', '495',
]

In [None]:
# Collect the URL of every procedure listed by the search page, issuing one
# search request per (year, procedure stage) pair.
BASE_URL = "https://oeil.europarl.europa.eu/oeil/en/search"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 10)

# A set, so a URL returned by more than one search is stored only once.
all_links = set()

# try/finally guarantees the browser is closed even when a request fails or
# the kernel is interrupted mid-scrape.
try:
    for year in range(2025, 2026):
        print(f"Scraping {year}\n")
        for ptype in PROCEDURE_STAGE:
            print(f"Scraping stage n°{ptype}")

            url = f"{BASE_URL}?fullText.mode=EXACT_WORD&year={year}&procedureType=573004&procedureStage={ptype}"
            driver.get(url)
            time.sleep(1)

            # Keep clicking "load more" until the button is no longer clickable.
            while True:
                try:
                    btn = wait.until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "button.oei-loadmore-button")
                        )
                    )
                    # JavaScript click, as in the original cell.
                    driver.execute_script("arguments[0].click();", btn)
                    time.sleep(1.2)
                except TimeoutException:
                    # No clickable button appeared within the wait timeout:
                    # treated as the end of the result list.
                    break
                except WebDriverException as exc:
                    # Best effort: keep whatever was loaded so far, but make
                    # the failure visible instead of swallowing it silently.
                    print(f"Stopped loading more results for stage {ptype}: {exc!r}")
                    break

            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Result titles link to the procedure pages with site-relative hrefs.
            for a in soup.select("h3.es_document-title a[href]"):
                href = a["href"]
                if href.startswith("/oeil/"):
                    all_links.add("https://oeil.europarl.europa.eu" + href)

            print(f"Number of links scraped for now: {len(all_links)}")
finally:
    driver.quit()

print(f"\nFinal number of links scraped: {len(all_links)}")

Scraping 2025

Scraping stage n°452
Number of links scraped for now: 35
Scraping stage n°472
Number of links scraped for now: 79
Scraping stage n°475
Number of links scraped for now: 79
Scraping stage n°594000
Number of links scraped for now: 92
Scraping stage n°595000
Number of links scraped for now: 92
Scraping stage n°478
Number of links scraped for now: 96
Scraping stage n°600000
Number of links scraped for now: 96
Scraping stage n°10618
Number of links scraped for now: 96
Scraping stage n°479
Number of links scraped for now: 96
Scraping stage n°496
Number of links scraped for now: 96
Scraping stage n°489
Number of links scraped for now: 100
Scraping stage n°490
Number of links scraped for now: 119
Scraping stage n°472
Number of links scraped for now: 119
Scraping stage n°556001
Number of links scraped for now: 119
Scraping stage n°556002
Number of links scraped for now: 119
Scraping stage n°491
Number of links scraped for now: 119
Scraping stage n°495
Number of links scraped for n

In [4]:
# Save the data: one URL per row in a single "urls" column.
# The previous `pd.DataFrame([[all_links]], ...)` produced a 1x1 frame whose
# only cell held the whole set, so the CSV contained a single Python set repr.
# Sorting makes the file content deterministic across runs (sets are unordered).
# The default index column is still written, as in the original call.
df = pd.DataFrame(sorted(all_links), columns=["urls"])
df.to_csv('list_urls_2025.csv')