
# 📊 EFSA Dossier Scraper – Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pieterwindels/EFSA_dossier/blob/main/efsa_dossier_scraper.ipynb)

🔗 View the code on GitHub: [github.com/pieterwindels/EFSA_dossier](https://github.com/pieterwindels/EFSA_dossier)

---


In [None]:

# Mount Google Drive
# The CSV written at the end of this cell lives under /content/drive/MyDrive,
# so the mount has to succeed before anything else is worth running.
from google.colab import drive
drive.mount('/content/drive')

# Install necessary packages
# NOTE: the lines starting with `!` are notebook shell escapes, not Python;
# they only work when this cell is executed inside an IPython/Colab kernel.
!apt-get update
!apt install chromium-chromedriver
!pip install --upgrade selenium

# Import libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import csv
import pandas as pd

# Set up the Selenium Chrome WebDriver
def web_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with a fixed set of command-line flags (headless,
    no sandbox, no GPU, no /dev/shm usage, 1920x1200 window) and its default
    download directory is pointed at the mounted Google Drive folder.

    Returns:
        A ``selenium.webdriver.Chrome`` instance; the caller is responsible
        for calling ``quit()`` on it.
    """
    chrome_flags = (
        '--verbose',
        '--headless',
        '--no-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--window-size=1920,1200',
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # Chrome preference that redirects downloads into Drive.
    chrome_options.add_experimental_option(
        'prefs',
        {'download.default_directory': "/content/drive/MyDrive"},
    )
    return webdriver.Chrome(options=chrome_options)

# Dossiers and Nodes
# The scraping loops below build one URL per (dossier, node) pair: the dossier
# number goes into the URL path and the node id into the `key=` query
# parameter. The "old" node list is only combined with the "old" dossiers and
# the "new" node list only with the "new" dossiers.
# NOTE(review): "old"/"new" presumably refer to two versions of the dossier
# template with different node ids -- confirm.

# Node ids visited for every dossier in `dossiernrs_old`.
final_nodes_old = ['31830', '31836', '31840', '31844', '31849', '31853', '31857', '31861', '31865', '31869',
                   '31873', '31877', '31881', '31886', '31891', '31895', '31899', '31903', '31907', '31912',
                   '31916', '31920', '31925', '31929', '31933', '31937', '31942', '31946', '31950', '31954',
                   '31959', '31963', '31967', '31972', '31976', '31980', '31984', '31989', '31993', '31997',
                   '32001', '32005', '32009', '32013', '32017', '32021', '32025', '32028', '32032']

# Node ids visited for every dossier in `dossiernrs_new`.
final_nodes_new = ['37374', '37378', '37382', '37386', '37390', '37394',
                   '37398', '37402', '37406', '37410', '37414', '37416']

# Dossier numbers scraped with the `final_nodes_old` node ids.
dossiernrs_old = ['GMFF-2022-6232', 'GMFF-2021-0071', 'GMFF-2022-11530', 'GMFF-2022-6595',
                  'GMFF-2021-1530', 'GMFF-2023-17394', 'GMFF-2023-14732', 'GMFF-2022-5890',
                  'GMFF-2021-2473', 'GMFF-2022-10651']

# Dossier numbers scraped with the `final_nodes_new` node ids.
dossiernrs_new = ['GMFF-2024-21774', 'GMFF-2023-21116', 'GMFF-2024-24650',
                  'GMFF-2023-21132', 'GMFF-2024-22091', 'GMFF-2024-22152']

# Start browser session
# Imported here so this step is self-contained; TimeoutException is a subclass
# of WebDriverException, so it must be handled first.
from selenium.common.exceptions import TimeoutException, WebDriverException

driver = web_driver()
driver.get('https://open.efsa.europa.eu/dossier/GMFF-2022-9450')

# Accept the cookie-consent dialog if one is shown. This stays best-effort
# (the scrape continues either way), but the button is given time to become
# clickable instead of being looked up once right after navigation, and a
# failure is reported rather than silently swallowed by a bare `except`.
try:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//button[@id="ppms_cm_agree-to-all"]'))
    ).click()
except TimeoutException:
    print("Cookie consent button did not appear; continuing without accepting.")
except WebDriverException as e:
    print(f"Could not click cookie consent button: {e}")

# Data scraping logic
def _count_sections(driver, dossier, nodes, selected_xpath, label):
    """Visit every node of one dossier and count the typography elements shown.

    Args:
        driver: Selenium WebDriver used for navigation.
        dossier: Dossier number placed in the URL path.
        nodes: Node ids, each placed in the ``key=`` query parameter in turn.
        selected_xpath: XPath of the element that is waited for and clicked
            before counting.
        label: Tag printed in front of error messages (e.g. ``"OLD"``).

    Returns:
        A list starting with today's date (ISO string) followed by exactly one
        entry per node, in node order: the element count, or ``None`` when the
        node could not be scraped.
    """
    row = [str(datetime.date.today())]
    for node in nodes:
        try:
            # Navigation is inside the try so a single failing page load does
            # not abort the whole run.
            driver.get(f'https://open.efsa.europa.eu/dossier/{dossier}?type=template&key={node}')
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, selected_xpath))
            )
            element.click()
            row.append(len(driver.find_elements(By.XPATH, '//div[@class="ant-typography"]')))
        except Exception as e:
            print(f"[{label}] {dossier} - {node}: {str(e)}")
            # Keep the column for this node. Appending nothing would shift all
            # later counts one column to the left and make rows differ in
            # length. csv.writer emits None as an empty field.
            row.append(None)
    return row


data_dict = {}
try:
    for dossier in dossiernrs_old:
        data_dict[dossier] = _count_sections(
            driver, dossier, final_nodes_old,
            '//div[contains(@class, "selected")]/span[2]/span', "OLD",
        )

    for dossier in dossiernrs_new:
        data_dict[dossier] = _count_sections(
            driver, dossier, final_nodes_new,
            '//div[contains(@class, "selected")]', "NEW",
        )
finally:
    # Always shut the browser down, even if scraping is interrupted.
    driver.quit()

# Write results to CSV
csv_path = '/content/drive/MyDrive/Colab Notebooks/BASF_EFSA/efsalinks.csv'

# Append mode: every run adds one line per dossier after the lines of earlier
# runs, laid out as dossier number, run date, then the per-node counts.
with open(csv_path, mode='a', newline='') as out_file:
    csv.writer(out_file).writerows(
        [dossier, *counts] for dossier, counts in data_dict.items()
    )

# Load CSV and compare data
# Rows in the file do not all have the same number of fields (the two dossier
# groups use node lists of different length). Without explicit column names
# pandas sizes the table from the first row and fails on a later, longer row,
# so determine the widest row first and name that many columns; shorter rows
# are padded with NaN.
with open(csv_path, newline='') as file:
    n_cols = max((len(row) for row in csv.reader(file)), default=0)
column_names = ['key'] + [f'value_{i}' for i in range(1, n_cols)]
df = pd.read_csv(csv_path, header=None, names=column_names)
items = df['key'].unique()

for i in items:
    df_n = df[df['key'] == i]
    if len(df_n) < 2:
        print(f"Dossier {i} does not have enough rows for comparison.")
        continue
    # Only the two most recent rows matter. Dropping incomplete columns after
    # selecting them means a gap in some older run no longer hides that
    # section from the comparison. 'value_1' holds the run date, which is
    # expected to differ between runs, so it is excluded.
    df_subset = (
        df_n.tail(2)
        .dropna(axis='columns')
        .drop(columns=['value_1'], errors='ignore')
    )
    comparison = df_subset.iloc[-1] == df_subset.iloc[-2]
    if comparison.all():
        print(f"Dossier {i}: The last row is identical to the previous row.")
    else:
        print(f"New info available for dossier {i}!")
        different_columns = comparison.index[~comparison]
        print(f"Differences found in section(s): {list(different_columns)}")
