# 1. Set up the Huggingface Climate Policy Radar dataset.

In [36]:
import os
import regex as re
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import torch
import pgai
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from datasets import load_dataset, Features, Value
from functions import generate_embeddings_for_text

tqdm.pandas()

In [2]:
# Login using e.g. `huggingface-cli login` in command line to access this dataset

ds = load_dataset("ClimatePolicyRadar/all-document-text-data")
ds = ds["train"]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
ds.features

{'document_id': Value(dtype='string', id=None),
 'document_metadata.collection_summary': Value(dtype='string', id=None),
 'document_metadata.collection_title': Value(dtype='string', id=None),
 'document_metadata.corpus_type_name': Value(dtype='string', id=None),
 'document_metadata.corpus_import_id': Value(dtype='string', id=None),
 'document_metadata.category': Value(dtype='string', id=None),
 'document_metadata.description': Value(dtype='string', id=None),
 'document_metadata.document_title': Value(dtype='string', id=None),
 'document_metadata.family_import_id': Value(dtype='string', id=None),
 'document_metadata.family_slug': Value(dtype='string', id=None),
 'document_metadata.geographies': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.import_id': Value(dtype='string', id=None),
 'document_metadata.languages': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.metadata': {'author': Sequence(feature=Va

In [4]:
flat_ds = ds.flatten()

In [5]:
for key in flat_ds.features.keys():
    print(type(flat_ds[18:19][key][0]))

<class 'str'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'bool'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'list'>
<class 'int'>
<class 'str'>
<class 'int'>


## Save 100000 chunks in Postgres

In [10]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

load_dotenv # This loads the .env file into os.environ
batch_df = pd.DataFrame(flat_ds[:100000])
# Set up the database connection using SQLAlchemy
engine = create_engine(os.getenv("DB_URL"))

# Write the pandas DataFrame to the PostgreSQL database
batch_df.to_sql('climate_policy_radar', engine, if_exists='replace', index=False)

615

## Attempt to change arrays to strings for entire dataset

Just keeping this here for future reference. I don't think it's needed

In [None]:
def array_to_string(batch):
    keys_to_process = [
        "document_metadata.geographies",
        "document_metadata.languages",
        "languages",
        "pdf_data_page_metadata.dimensions",
        "text_block.coords"
    ]
    for key in keys_to_process:
        # Convert numpy arrays to lists
        batch[key] = [str(x) for x in batch[key]]
    return batch

In [None]:
# Apply the function to the dataset. This takes the longest time
flat_ds = flat_ds.map(array_to_string, batched = True)

## Read the data from the database

So it's easier to access the data in case the kernel crashes and had to re-run the codes again

In [23]:
# Read the table
df = pd.read_sql("SELECT * FROM climate_policy_radar", engine)
df.head()


Unnamed: 0,document_id,document_metadata.collection_summary,document_metadata.collection_title,document_metadata.corpus_type_name,document_metadata.corpus_import_id,document_metadata.category,document_metadata.description,document_metadata.document_title,document_metadata.family_import_id,document_metadata.family_slug,...,pipeline_metadata.parser_metadata.azure_model_id,pipeline_metadata.parser_metadata.parsing_date,text_block.text_block_id,text_block.language,text_block.type,text_block.type_confidence,text_block.coords,text_block.page_number,text_block.text,text_block.index
0,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,0,en,title,1.0,"{{70.452,123.7392},{524.1816,123.7392},{524.18...",0.0,Draft of the National Energy and Climate Plan ...,0
1,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,1,en,Text,1.0,"{{69.7176,208.4256},{124.21440000000001,208.79...",0.0,July 2021,1
2,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,2,en,Text,1.0,"{{217.1952,685.1376},{286.1856,685.1376},{286....",0.0,REPUBLIKA SHOIPERISE,2
3,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,3,en,Text,1.0,"{{195.2928,690.6096},{308.448,690.6096},{308.4...",0.0,MINISTRIA E TURIZMIT DHE MJEDISIT,3
4,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,4,en,Text,1.0,"{{76.9968,708.5015999999999},{182.88,708.1344}...",0.0,MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE,4


# 2. Embeddings generation

## 2.1 Load climateBERT

In [11]:
EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

In [12]:
# Download
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)
model = AutoModelForMaskedLM.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)

# Save it to a  local_models folder
tokenizer.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)



In [34]:
# Load the embedding model
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Checking existing documents' country


In [29]:
query = """
SELECT DISTINCT "document_metadata.geographies"
FROM climate_policy_radar
WHERE "document_metadata.geographies" IS NOT NULL;
"""

geos = pd.read_sql(query, engine)
print(geos)


   document_metadata.geographies
0                          {SRB}
1                          {MKD}
2                          {GBR}
3                          {TUV}
4                          {FRA}
5                          {ALB}
6                          {EUR}
7                          {MNE}
8                          {AZE}
9                          {CAN}
10                         {JPN}
11                         {BRA}
12                         {DEU}
13                         {XKX}
14                         {CHN}
15                         {ZAF}
16                         {BIH}
17                         {IRL}


### Attempt to select only documents by Albania
Testing using Albania for now - This takes a really long time

In [None]:
# This one worked
from tqdm import tqdm

geos = df["document_metadata.geographies"]
filter = ["ALB" in str(x) for x in tqdm(geos, desc="Filtering for ALB")]
alb_chunks = df[filter]

Filtering for ALB: 100%|██████████| 100000/100000 [00:00<00:00, 173917.21it/s]


In [32]:
alb_chunks[["document_id", "document_metadata.geographies", "text_block.text"]].head(10)

Unnamed: 0,document_id,document_metadata.geographies,text_block.text
0,CCLW.document.i00000002.n0000,{ALB},Draft of the National Energy and Climate Plan ...
1,CCLW.document.i00000002.n0000,{ALB},July 2021
2,CCLW.document.i00000002.n0000,{ALB},REPUBLIKA SHOIPERISE
3,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E TURIZMIT DHE MJEDISIT
4,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE
5,CCLW.document.i00000002.n0000,{ALB},german cooperation DEUTSCHE ZUSAMMENARBEIT
6,CCLW.document.i00000002.n0000,{ALB},Implemented by giz
7,CCLW.document.i00000002.n0000,{ALB},Deutsche Gesellschaft Für Internationale Zusam...
8,CCLW.document.i00000002.n0000,{ALB},Responsible for this document: Ministry of Inf...
9,CCLW.document.i00000002.n0000,{ALB},Purpose of this document: Submission to Energy...


In [None]:
#took too long for me
#DOUBLE CHECK LATER

filter = flat_ds["document_metadata.geographies"].astype(str).progress_apply(
    lambda x: bool(re.search(r"ALB", x))
)
alb_chunks = flat_ds[filter]

## 2.2 Save embeddings in csv
Embeddings generated in batches of 1000 chunks.
Note: obsolete, just ignore

In [None]:
# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Process embeddings in batches of 1000
batch_size = 1000
all_batches = (len(flat_ds) + batch_size - 1) // batch_size  # Calculate the number of batches
num_batches = 2

all_embeddings = []

for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(flat_ds))
    
    # Generate embeddings for the current batch
    batch_embeddings = flat_ds[start_idx:end_idx]["text_block.text"].progress_apply(
        lambda text: generate_embeddings_for_text(text, model, tokenizer)
    )
    
    all_embeddings.extend(batch_embeddings)

# Create a DataFrame for all embeddings
embeddings_df = pd.DataFrame({
    "document_id": flat_ds[:num_batches*1000]["document_id"],
    "embeddings": all_embeddings
})

# Save the DataFrame to a single CSV file
embeddings_df.to_csv("data/embeddings.csv", index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Use DPR for question answering, using chunks["text_block.text"] as context.

In [None]:
from tqdm import tqdm
import os
import pandas as pd

# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Use the filtered dataset, can add more columns if need to, for now only texts and doc_ids are present
texts = alb_chunks["text_block.text"]
doc_ids = alb_chunks["document_id"]

# Batch processing setup
batch_size = 1000
num_batches = (len(texts) + batch_size - 1) // batch_size

all_embeddings = []
all_doc_ids = []

for i in tqdm(range(num_batches), desc="Generating Embeddings"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(texts))
    
    batch_texts = texts.iloc[start_idx:end_idx]
    batch_ids = doc_ids.iloc[start_idx:end_idx]
    
    batch_embeddings = batch_texts.progress_apply(
        lambda text: generate_embeddings_for_text(text, model, tokenizer)
    )
    
    all_embeddings.extend(batch_embeddings)
    all_doc_ids.extend(batch_ids)

# Create a DataFrame for all embeddings
embeddings_df = pd.DataFrame({
    "document_id": all_doc_ids,
    "embedding": all_embeddings
})

# Save to CSV
embeddings_df.to_csv("data/albania_embeddings.csv", index=False)


Generating Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:   7%|▋         | 1/14 [00:37<08:07, 37.52s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  14%|█▍        | 2/14 [01:22<08:22, 41.87s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  21%|██▏       | 3/14 [01:51<06:37, 36.13s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  29%|██▊       | 4/14 [02:19<05:27, 32.79s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  36%|███▌      | 5/14 [02:44<04:29, 29.98s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  43%|████▎     | 6/14 [03:10<03:49, 28.74s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  50%|█████     | 7/14 [03:33<03:06, 26.63s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  57%|█████▋    | 8/14 [03:53<02:28, 24.78s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  64%|██████▍   | 9/14 [04:16<02:00, 24.18s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  71%|███████▏  | 10/14 [04:37<01:32, 23.09s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  79%|███████▊  | 11/14 [04:58<01:07, 22.45s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  86%|████████▌ | 12/14 [05:19<00:43, 21.98s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  93%|█████████▎| 13/14 [05:39<00:21, 21.55s/it]

  0%|          | 0/797 [00:00<?, ?it/s]

Generating Embeddings: 100%|██████████| 14/14 [06:04<00:00, 26.02s/it]


### Embedding all existing documents

Now that I've tested with Albania and it worked, I'll generate embeddings with all documents and save them into csv files first. I will then save them into the database.

This block of code takes a long time. It's a replication of the code for Albania but ensures each csv file has the country name and embeddings for that specific country is saved into their respective csv file.

In [40]:
# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Use the filtered dataset
texts = alb_chunks["text_block.text"]
doc_ids = alb_chunks["document_id"]

# Batch processing setup
batch_size = 1000
num_batches = (len(texts) + batch_size - 1) // batch_size

all_embeddings = []
all_doc_ids = []

for i in tqdm(range(num_batches), desc="Generating Embeddings"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(texts))
    
    # Extract batch slices
    batch_texts = texts.iloc[start_idx:end_idx].tolist()
    batch_ids = doc_ids.iloc[start_idx:end_idx].tolist()
    
    # Call the embedding function on the whole batch
    batch_embeddings = generate_embeddings_for_text(batch_texts, model, tokenizer)
    
    all_embeddings.extend(batch_embeddings)
    all_doc_ids.extend(batch_ids)

# Create DataFrame and save to CSV
embeddings_df = pd.DataFrame({
    "document_id": all_doc_ids,
    "embedding": all_embeddings
})

embeddings_df.to_csv("data/albania_embeddings.csv", index=False)


Generating Embeddings:   0%|          | 0/14 [12:48<?, ?it/s]


KeyboardInterrupt: 

# 3. Saving the embeddings into the database