# 1. Set up the Huggingface Climate Policy Radar dataset.

In [43]:
import os
import regex as re
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import pgai
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from datasets import load_dataset, Features, Value
from functions import generate_embeddings_for_text
import glob

tqdm.pandas()

In [2]:
# Login using e.g. `huggingface-cli login` in command line to access this dataset

ds = load_dataset("ClimatePolicyRadar/all-document-text-data")
ds = ds["train"]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
ds.features

{'document_id': Value(dtype='string', id=None),
 'document_metadata.collection_summary': Value(dtype='string', id=None),
 'document_metadata.collection_title': Value(dtype='string', id=None),
 'document_metadata.corpus_type_name': Value(dtype='string', id=None),
 'document_metadata.corpus_import_id': Value(dtype='string', id=None),
 'document_metadata.category': Value(dtype='string', id=None),
 'document_metadata.description': Value(dtype='string', id=None),
 'document_metadata.document_title': Value(dtype='string', id=None),
 'document_metadata.family_import_id': Value(dtype='string', id=None),
 'document_metadata.family_slug': Value(dtype='string', id=None),
 'document_metadata.geographies': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.import_id': Value(dtype='string', id=None),
 'document_metadata.languages': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.metadata': {'author': Sequence(feature=Va

In [4]:
flat_ds = ds.flatten()

In [5]:
for key in flat_ds.features.keys():
    print(type(flat_ds[18:19][key][0]))

<class 'str'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'bool'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'list'>
<class 'int'>
<class 'str'>
<class 'int'>


## Save 100000 chunks in Postgres

In [None]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

load_dotenv # This loads the .env file into os.environ
batch_df = pd.DataFrame(flat_ds[:100000])
# Set up the database connection using SQLAlchemy
engine = create_engine(os.getenv("DB_URL"))

# Write the pandas DataFrame to the PostgreSQL database
batch_df.to_sql('climate_policy_radar', engine, if_exists='replace', index=False)

615

## Attempt to change arrays to strings for entire dataset

Just keeping this here for future reference. I don't think it's needed

In [21]:
def array_to_string(batch):
    keys_to_process = [
        "document_metadata.geographies",
        "document_metadata.languages",
        "languages",
        "pdf_data_page_metadata.dimensions",
        "text_block.coords"
    ]
    for key in keys_to_process:
        # Convert numpy arrays to lists
        batch[key] = [str(x) for x in batch[key]]
    return batch

In [None]:
# Apply the function to the dataset. This takes the longest time
flat_ds = flat_ds.map(array_to_string, batched = True)

## Read the data from the database

So it's easier to access the data in case the kernel crashes and had to re-run the codes again

In [7]:
# Read the table
df = pd.read_sql("SELECT * FROM climate_policy_radar", engine)
df.head()


Unnamed: 0,document_id,document_metadata.collection_summary,document_metadata.collection_title,document_metadata.corpus_type_name,document_metadata.corpus_import_id,document_metadata.category,document_metadata.description,document_metadata.document_title,document_metadata.family_import_id,document_metadata.family_slug,...,pipeline_metadata.parser_metadata.azure_model_id,pipeline_metadata.parser_metadata.parsing_date,text_block.text_block_id,text_block.language,text_block.type,text_block.type_confidence,text_block.coords,text_block.page_number,text_block.text,text_block.index
0,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,0,en,title,1.0,"{{70.452,123.7392},{524.1816,123.7392},{524.18...",0.0,Draft of the National Energy and Climate Plan ...,0
1,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,1,en,Text,1.0,"{{69.7176,208.4256},{124.21440000000001,208.79...",0.0,July 2021,1
2,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,2,en,Text,1.0,"{{217.1952,685.1376},{286.1856,685.1376},{286....",0.0,REPUBLIKA SHOIPERISE,2
3,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,3,en,Text,1.0,"{{195.2928,690.6096},{308.448,690.6096},{308.4...",0.0,MINISTRIA E TURIZMIT DHE MJEDISIT,3
4,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,4,en,Text,1.0,"{{76.9968,708.5015999999999},{182.88,708.1344}...",0.0,MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE,4


# 2. Embeddings generation

## 2.1 Load climateBERT

In [8]:
EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

In [None]:
# Download
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)
model = AutoModelForMaskedLM.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)

# Save it to a  local_models folder
tokenizer.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)



In [9]:
# Load the embedding model
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Checking existing documents' country


In [10]:
query = """
SELECT DISTINCT "document_metadata.geographies"
FROM climate_policy_radar
WHERE "document_metadata.geographies" IS NOT NULL;
"""

geos = pd.read_sql(query, engine)
print(geos)


   document_metadata.geographies
0                          {SRB}
1                          {MKD}
2                          {GBR}
3                          {TUV}
4                          {FRA}
5                          {ALB}
6                          {EUR}
7                          {MNE}
8                          {AZE}
9                          {CAN}
10                         {JPN}
11                         {BRA}
12                         {DEU}
13                         {XKX}
14                         {CHN}
15                         {ZAF}
16                         {BIH}
17                         {IRL}


## 2.2 Embedding all documents for all countries

Generate embeddings for all documents and upload them into the database.

A new table is needed, this will be created through the create_table.sql file.So go to create_table.sql and run the query to create the table. Remember to select the Postgres Server at the bottom, and highlight the code and right click to run query. This will create a new table in the database.

In [None]:
# Make sure column is string for filtering
df["document_metadata.geographies"] = df["document_metadata.geographies"].astype(str)

# Extract 3-letter codes like 'ALB', 'DEU', etc.
df["country_code"] = df["document_metadata.geographies"].str.extract(r"\{(\w+)\}")

# Get all unique codes
country_codes = df["country_code"].dropna().unique()

# Store each country chunk in a dictionary
country_chunks = {}

for code in tqdm(country_codes, desc="Filtering by country"):
    country_chunks[code] = df[df["country_code"] == code]

Filtering by country: 100%|██████████| 18/18 [00:01<00:00, 13.82it/s]


In [25]:
print(len(country_chunks))

#how many documents for each country
for code, chunk in country_chunks.items():
    print(f"{code}: {len(chunk)} documents")

18
ALB: 13797 documents
BIH: 16552 documents
ZAF: 11132 documents
FRA: 351 documents
MNE: 3152 documents
XKX: 5314 documents
MKD: 14146 documents
SRB: 9976 documents
EUR: 3840 documents
TUV: 4896 documents
BRA: 1067 documents
CAN: 91 documents
DEU: 126 documents
CHN: 644 documents
AZE: 7206 documents
JPN: 119 documents
IRL: 3 documents
GBR: 249 documents


In [None]:
# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Batch size for embedding
batch_size = 10000

from sqlalchemy.orm import sessionmaker


load_dotenv()
engine = create_engine(os.getenv("DB_URL"))
Session = sessionmaker(bind=engine)
session = Session()

# Loop through each country's data in the dictionary
for code, chunk in tqdm(country_chunks.items(), desc="Processing all countries"):
    original_texts = chunk["text_block.text"]
    doc_ids = chunk["document_id"]
    source_urls = chunk["document_metadata.source_url"]  # ✅ new

    num_batches = (len(texts) + batch_size - 1) // batch_size

    all_embeddings = []
    all_doc_ids = []
    all_texts = []
    all_urls = []

    for i in tqdm(range(num_batches), desc=f"Embedding {code}", leave=False):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(texts))

        batch_texts = texts.iloc[start_idx:end_idx].reset_index(drop=True)
        batch_ids = doc_ids.iloc[start_idx:end_idx].reset_index(drop=True)
        batch_urls = source_urls.iloc[start_idx:end_idx].reset_index(drop=True)

        mask = batch_texts.apply(lambda x: isinstance(x, str) and x.strip() != "")
        batch_texts = batch_texts[mask]
        batch_ids = batch_ids[mask]
        batch_urls = batch_urls[mask]

        batch_embeddings = batch_texts.progress_apply(
            lambda text: generate_embeddings_for_text(text, model, tokenizer)
        )

        all_embeddings.extend(batch_embeddings)
        all_doc_ids.extend(batch_ids)
        all_texts.extend(batch_texts)
        all_urls.extend(batch_urls)

    # Upload to DB
    for doc_id, embedding, original_text, url in tqdm(
        zip(all_doc_ids, all_embeddings, all_texts, all_urls),
        total=len(all_doc_ids),
        desc=f"Uploading {code}"
    ):
        stmt = text("""
            INSERT INTO document_embeddings (document_id, country_code, original_text, source_hyperlink, embedding)
            VALUES (:document_id, :country_code, :original_text, :source_hyperlink, :embedding)
        """)
        session.execute(stmt, {
            "document_id": doc_id,
            "country_code": code,
            "original_text": original_text,
            "source_hyperlink": url,
            "embedding": embedding
        })


    session.commit()


print("✅ All embeddings and original texts uploaded directly.")


Processing all countries:   0%|          | 0/18 [00:00<?, ?it/s]

Embedding ALB:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading ALB:   0%|          | 0/13797 [00:00<?, ?it/s]

Embedding BIH:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading BIH:   0%|          | 0/13797 [00:00<?, ?it/s]

Embedding ZAF:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading ZAF:   0%|          | 0/11132 [00:00<?, ?it/s]

Embedding FRA:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading FRA:   0%|          | 0/351 [00:00<?, ?it/s]

Embedding MNE:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading MNE:   0%|          | 0/3152 [00:00<?, ?it/s]

Embedding XKX:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading XKX:   0%|          | 0/5314 [00:00<?, ?it/s]

Embedding MKD:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading MKD:   0%|          | 0/13797 [00:00<?, ?it/s]

Embedding SRB:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading SRB:   0%|          | 0/9976 [00:00<?, ?it/s]

Embedding EUR:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading EUR:   0%|          | 0/3840 [00:00<?, ?it/s]

Embedding TUV:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading TUV:   0%|          | 0/4896 [00:00<?, ?it/s]

Embedding BRA:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading BRA:   0%|          | 0/1067 [00:00<?, ?it/s]

Embedding CAN:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading CAN:   0%|          | 0/91 [00:00<?, ?it/s]

Embedding DEU:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading DEU:   0%|          | 0/126 [00:00<?, ?it/s]

Embedding CHN:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading CHN:   0%|          | 0/644 [00:00<?, ?it/s]

Embedding AZE:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading AZE:   0%|          | 0/7206 [00:00<?, ?it/s]

Embedding JPN:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading JPN:   0%|          | 0/119 [00:00<?, ?it/s]

Embedding IRL:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading IRL:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding GBR:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/3797 [00:00<?, ?it/s]

Uploading GBR:   0%|          | 0/249 [00:00<?, ?it/s]

✅ All embeddings and original texts uploaded directly.
