# NDC 3.0 Analysis 
## Building an NLP pipeline to analyse NDCs and track contributions over time.

Countries submit their updated Nationally Determined Contributions (NDCs), known as NDC 3.0, in 2025. Each round of NDCs should be more ambitious than the last, to put us on track to limit warming to well below 2 °C. The next [Global Stocktake](https://unfccc.int/documents/631600) will take place in 2028 and is critical for checking progress toward the Paris Agreement.

**Given the urgency of climate action required, can we check on the progress towards emissions reductions and resilience in each commitment in real time?**

**What can we learn from NDCs to understand how to address financing gaps?**

**How do mitigation, adaptation and loss and damage (L&D) feature in NDCs?**

## Method
- Step 1: Collect the NDC Data --> Scrape the NDC Registry for links to NDCs
- Step 2: Download the NDC texts --> Download from the scraped links
- Step 3: Preprocess the Text Data --> Convert unstructured NDC text into clean, structured format for analysis.
- Step 4: Compare Ambition Over Time --> Track changes in emissions reduction targets over time.
- Step 5: Identify Key Sectors & Financial Instruments --> Extract mentions of economic sectors (e.g., energy, transport) and financial instruments (e.g., carbon pricing, green bonds).
- Step 6: Visualise the Findings on a Global Map --> Show how each country's ambition score has changed across NDC updates.
- Step 7: Automate & Scale the Pipeline --> Make this process reproducible for future NDC updates.

In [None]:
# Print the `python` found on the shell PATH and, below it, the interpreter
# that is running this kernel, so the two can be compared.
!which python
import sys
print(sys.executable)

In [None]:
# Step 1: Collect the NDC Data
# Check for bulk download of NDCs
!pip install requests beautifulsoup4 pandas tqdm
import requests
from bs4 import BeautifulSoup

# Define the URL for the NDC Registry
ndc_url = "https://unfccc.int/NDCREG"

# Bind `soup` up front so it exists (as None) even when the request fails,
# instead of being left undefined.
soup = None

# Send a request to fetch the webpage. Without a timeout, requests waits
# indefinitely if the server never responds.
response = requests.get(ndc_url, timeout=30)

# Check if the request was successful (Status Code 200 = OK)
if response.status_code == 200:
    print("Successfully accessed the NDC Registry page!")
    soup = BeautifulSoup(response.text, "html.parser")  # Parse the HTML
else:
    print(f"Failed to access the page. Status Code: {response.status_code}")


In [None]:
!pip install selenium webdriver-manager


In [None]:
# Step 1: Scrape the NDC Registry page
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (no browser window)
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# Start a new browser session
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# try/finally guarantees the browser process is shut down even if loading,
# scrolling or link extraction raises.
try:
    # Open the UNFCCC NDC Registry page
    driver.get("https://unfccc.int/NDCREG")
    time.sleep(5)  # Wait for JavaScript to load the page

    # Scroll down to load more content (if necessary). The loop stops as soon
    # as the page height stops growing; max_scrolls is a safety cap so a page
    # that keeps growing cannot hang the cell forever.
    scroll_pause_time = 2  # Pause time to allow the page to load content
    max_scrolls = 100
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_scroll_height = driver.execute_script("return document.body.scrollHeight")
        if new_scroll_height == scroll_height:
            break
        scroll_height = new_scroll_height

    # Find all <a> tags on the page
    all_links = driver.find_elements(By.TAG_NAME, "a")

    # Keep the links that lead to PDF documents. Each href is read once (every
    # get_attribute call is a round trip to the browser) and the extension
    # check is case-insensitive so ".PDF" links are kept too.
    ndc_urls = []
    for link in all_links:
        href = link.get_attribute("href")
        if href and href.lower().endswith(".pdf"):
            ndc_urls.append(href)
finally:
    # Close the browser session
    driver.quit()

# Show the first 10 NDC document links
print(f"Found {len(ndc_urls)} NDC document links.")
print(ndc_urls[:10])  # Display first 10 links


In [None]:
# Extract unique PDF links. dict.fromkeys drops duplicates while keeping the
# first-seen order, so the list (and the download order) is the same on every
# run; list(set(...)) would return the links in arbitrary order.
ndc_urls = list(dict.fromkeys(ndc_urls))

# Show the cleaned-up number of links
print(f"Found {len(ndc_urls)} unique NDC document links.")
print(ndc_urls[:10])  # Display first 10 links to verify

In [None]:
# Step 2: Download the NDCs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import os
import time

# Set the download directory (modify this path to your actual folder)
download_directory = os.path.expanduser("~/Desktop/Projects/NDC/ndc_downloads")

# Ensure the directory exists
os.makedirs(download_directory, exist_ok=True)

# Configure Chrome options for automatic PDF download
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_directory,  # Set default download folder
    "download.prompt_for_download": False,  # Disable download prompt
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,  # Bypass Chrome's PDF viewer
})


def _wait_for_downloads(folder, timeout=120, poll_interval=1):
    """Wait until no partial Chrome downloads remain in *folder*.

    Chrome writes in-progress downloads to files ending in ".crdownload" and
    renames them when they complete.

    Returns True once no such file is present, or False if *timeout* seconds
    elapse first.
    """
    deadline = time.monotonic() + timeout
    while True:
        if not any(name.endswith(".crdownload") for name in os.listdir(folder)):
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


# Start a new browser session
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# try/finally guarantees the browser is closed even if a download raises.
try:
    # Visit each NDC link and download the PDF
    for pdf_url in ndc_urls:
        print(f"Downloading: {pdf_url}")
        driver.get(pdf_url)
        time.sleep(5)  # Give Chrome time to start writing the file
        # A fixed sleep alone cannot tell whether a large file has finished;
        # wait for the partial file to disappear before moving on or quitting.
        if not _wait_for_downloads(download_directory):
            print(f"Timed out waiting for download to finish: {pdf_url}")
finally:
    # Close the browser session
    driver.quit()

print("All downloads completed.")


In [None]:
# Step 3: Preprocess the Text Data
!pip install pdfplumber
import pdfplumber
import os
import re
import pandas as pd

In [None]:
# Step 3A: Preprocess the Text Data --> store in list version
# Path where your NDC PDFs are saved. Resolved against the current user's home
# directory (the same folder Step 2 downloads into) rather than hard-coding
# one person's absolute path.
pdf_folder = os.path.expanduser("~/Desktop/Projects/NDC/ndc_downloads")

# Create a list to hold the extracted data
ndc_data = []

# Iterate through all PDFs in the folder. Sorting makes the order of ndc_data
# reproducible; os.listdir returns names in arbitrary order.
for file_name in sorted(os.listdir(pdf_folder)):
    # Case-insensitive so files saved as ".PDF" are not skipped.
    if not file_name.lower().endswith(".pdf"):
        continue
    file_path = os.path.join(pdf_folder, file_name)

    # Open the PDF and extract text
    with pdfplumber.open(file_path) as pdf:
        # `or ""` guards against a page yielding None instead of a string
        # (possible for pages without a text layer in some pdfplumber
        # versions), which would make string concatenation raise TypeError.
        # Pages are joined with a newline so the last word of one page is not
        # fused with the first word of the next.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    # Store the text. NOTE: "country" holds the PDF file name, not a parsed
    # country name.
    ndc_data.append({"country": file_name, "text": text})

# Preview the extracted data
print(ndc_data[:2])  # Show the first 2 entries for example


In [None]:
# Step 3B: Preprocess the Text Data --> store as txt version for later use
import pdfplumber
import os

# Folder where PDFs are stored, and folder that receives the extracted .txt
# files. Both are anchored to the user's home directory so they point at the
# same location Step 2 downloads into, whatever the notebook's working
# directory is (a bare relative path only works when launched from home).
pdf_folder = os.path.expanduser("~/Desktop/Projects/NDC/ndc_downloads")
text_folder = os.path.expanduser("~/Desktop/Projects/NDC/ndc_texts")

# Ensure the text output folder exists
os.makedirs(text_folder, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. A page that yields no text contributes only its newline. If the
        file cannot be opened or parsed, the error is printed and the text
        collected before the failure is returned (possibly an empty string).
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # `or ""` guards against extract_text() yielding None for a
                # page with no extractable text; None + "\n" would raise
                # TypeError and silently cut the document short.
                page_texts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF should not abort the
        # whole batch, so report it and return what was read.
        print(f"Error extracting {pdf_path}: {e}")
    return "".join(page_texts)

# Process each PDF file
for pdf_file in os.listdir(pdf_folder):
    # Case-insensitive so files saved as ".PDF" are not skipped.
    if not pdf_file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_folder, pdf_file)
    text = extract_text_from_pdf(pdf_path)

    # Swap only the trailing 4-character ".pdf" extension for ".txt".
    # str.replace would also rewrite any ".pdf" that appears earlier in the
    # file name.
    text_file_name = pdf_file[:-len(".pdf")] + ".txt"
    text_file_path = os.path.join(text_folder, text_file_name)

    # Save the extracted text to a file
    with open(text_file_path, "w", encoding="utf-8") as text_file:
        text_file.write(text)

    print(f"Extracted text from {pdf_file} and saved to {text_file_path}")


In [1]:
# Step 4: Compare Ambition Over Time

import re

# Define keywords related to our three topics.
# Each list is passed to extract_info, which runs a case-insensitive regex
# search for every keyword in a document's text.
emissions_keywords = ["emission", "carbon", "CO2", "GHG", "reduction", "net-zero", "mitigation"]
sector_keywords = ["energy", "transport", "agriculture", "industry", "waste", "forestry", "buildings"]
finance_keywords = ["investment", "funding", "finance", "carbon market", "green bond", "climate finance"]

# Function to extract relevant info
def extract_info(text, keywords):
    """Return the keywords that occur in *text*.

    Matching is case-insensitive and anchored on word boundaries; an optional
    trailing "s" is accepted so a keyword such as "emission" is also found in
    "emissions".

    Args:
        text: Document text to search. None or an empty string yields [].
        keywords: Iterable of keyword strings, treated literally (regex
            metacharacters are escaped).

    Returns:
        The matching keywords, without duplicates, in the order they appear
        in *keywords*.
    """
    if not text:
        return []

    found = []
    # dict.fromkeys removes duplicate keywords while keeping their order, so
    # the result is deterministic (list(set(...)) is not).
    for word in dict.fromkeys(keywords):
        # re.escape keeps characters like "+" or "." in a keyword from being
        # interpreted as regex syntax.
        pattern = rf"\b{re.escape(word)}s?\b"
        if re.search(pattern, text, re.IGNORECASE):
            found.append(word)
    return found

# Build one summary record per NDC document.
# Requires `ndc_data` (the list of {"country", "text"} dicts built in Step 3A)
# to exist in the current kernel session.
structured_data = [
    {
        "country": doc["country"],
        # Keywords from each topic list that were found in this document
        "emissions_focus": extract_info(doc["text"], emissions_keywords),
        "sectors_mentioned": extract_info(doc["text"], sector_keywords),
        "finance_mentions": extract_info(doc["text"], finance_keywords)
    }
    for doc in ndc_data
]

# Show a preview of the results
print(structured_data[:3])  # First 3 results for inspection


NameError: name 'ndc_data' is not defined