# 1. Set up the Hugging Face Climate Policy Radar dataset

In [15]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
)

from functions import generate_embeddings_for_text

In [16]:


# Load the Climate Policy Radar document-text dataset.
# Log in first (e.g. with `huggingface-cli login`) to access this dataset.

ds = load_dataset("ClimatePolicyRadar/all-document-text-data")

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [17]:
# Have the dataset return pandas objects when it is indexed or sliced.
ds = ds.with_format("pandas")

In [18]:
# Work with the "train" split; each row is one text block of a document.
chunks = ds["train"]

In [19]:
# Inspect the first row to see which columns are available.
chunks[0]

Unnamed: 0,document_id,document_metadata.collection_summary,document_metadata.collection_title,document_metadata.corpus_type_name,document_metadata.corpus_import_id,document_metadata.category,document_metadata.description,document_metadata.document_title,document_metadata.family_import_id,document_metadata.family_slug,...,_html_data.has_valid_text,pipeline_metadata.parser_metadata,text_block.text_block_id,text_block.language,text_block.type,text_block.type_confidence,text_block.coords,text_block.page_number,text_block.text,text_block.index
0,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,,"{'azure_api_version': '2023-07-31', 'azure_mod...",0,en,title,1.0,"[[70.452, 123.7392], [524.1816, 123.7392], [52...",0,Draft of the National Energy and Climate Plan ...,0


In [20]:
# Preview the text of the first ten text blocks.
chunks[:10]["text_block.text"]

0    Draft of the National Energy and Climate Plan ...
1                                            July 2021
2                                 REPUBLIKA SHOIPERISE
3                    MINISTRIA E TURIZMIT DHE MJEDISIT
4            MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE
5           german cooperation DEUTSCHE ZUSAMMENARBEIT
6                                   Implemented by giz
7    Deutsche Gesellschaft Für Internationale Zusam...
8    Responsible for this document: Ministry of Inf...
9    Purpose of this document: Submission to Energy...
Name: text_block.text, dtype: object

In [21]:
# Display the dataset summary (its feature names).
chunks

Dataset({
    features: ['document_id', 'document_metadata.collection_summary', 'document_metadata.collection_title', 'document_metadata.corpus_type_name', 'document_metadata.corpus_import_id', 'document_metadata.category', 'document_metadata.description', 'document_metadata.document_title', 'document_metadata.family_import_id', 'document_metadata.family_slug', 'document_metadata.geographies', 'document_metadata.import_id', 'document_metadata.languages', 'document_metadata.metadata', 'document_metadata.family_title', 'document_metadata.publication_ts', 'document_metadata.slug', 'document_metadata.source', 'document_metadata.source_url', 'document_metadata.type', 'document_cdn_object', 'document_content_type', 'document_md5_sum', 'languages', 'document_metadata.translated', 'pdf_data_page_metadata.dimensions', '_html_data.detected_title', '_html_data.detected_date', '_html_data.has_valid_text', 'pipeline_metadata.parser_metadata', 'text_block.text_block_id', 'text_block.language', 'text

In [22]:
# Median character length of the text in the first 10,000 text blocks.
# `.str.len()` is the vectorised pandas equivalent of `.apply(len)`; it yields
# NaN (which `median` skips) for missing text instead of raising a TypeError.
chunks[:10000]["text_block.text"].str.len().median()

6.0

# Vector search

In [23]:
# Load variables from a local .env file into the process environment.
# (`load_dotenv` is imported with the other dependencies in the first code cell.)
load_dotenv()

True

In [24]:
# Embedding-model settings, read from the environment once load_dotenv() has run.
# Either value is None when the corresponding variable is not set.
EMBEDDING_MODEL_LOCAL_DIR = os.environ.get("EMBEDDING_MODEL_LOCAL_DIR")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")

In [25]:
# Download the embedding model and keep a copy in EMBEDDING_MODEL_LOCAL_DIR.
# Fail early with a readable message if the settings are missing, rather than
# with an opaque error from inside the library.
if not EMBEDDING_MODEL or not EMBEDDING_MODEL_LOCAL_DIR:
    raise ValueError(
        "EMBEDDING_MODEL and EMBEDDING_MODEL_LOCAL_DIR must be set (e.g. in .env)"
    )

# Skip the download when a saved copy already exists, so re-running the
# notebook does not fetch and rewrite the weights every time. `config.json`
# is written by `model.save_pretrained`, so its presence marks a saved model.
# The following cell loads the tokenizer and model from the local directory.
if not os.path.isfile(os.path.join(EMBEDDING_MODEL_LOCAL_DIR, "config.json")):
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)
    model = AutoModelForMaskedLM.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)

    # Save both to the local model directory.
    tokenizer.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
    model.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)



In [26]:
# Load the embedding model and its tokenizer from the local copy.
# NOTE(review): the checkpoint is saved with AutoModelForMaskedLM but loaded
# here with AutoModel, and the warning printed below reports that the pooler
# weights are newly initialized. If generate_embeddings_for_text reads the
# pooler output, those values come from untrained weights — TODO confirm which
# output it uses.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def contains_alb(example):
    """Return True when the row's geographies include the code "ALB".

    Args:
        example: Mapping for one dataset row; must have the key
            "document_metadata.geographies" (a container of codes, or None).

    Returns:
        False when the geographies value is None, otherwise whether "ALB"
        is a member of it.
    """
    geographies = example["document_metadata.geographies"]
    if geographies is None:
        return False
    return "ALB" in geographies

# Keep only the rows whose geographies include "ALB" (presumably Albania's code).
# NOTE(review): this evaluates contains_alb over the whole dataset (~34M rows
# per the progress bar) and the recorded run was interrupted. Restricting the
# columns read via `input_columns` may make it feasible — TODO confirm.
alb_chunks = chunks.filter(contains_alb)


Filter:   0%|          | 0/34185184 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [35]:
# Enable `Series.progress_apply` so each batch shows its own progress bar.
tqdm.pandas()

# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Process embeddings in batches of `batch_size` rows.
batch_size = 1000
all_batches = (len(chunks) + batch_size - 1) // batch_size  # batches needed for the full dataset
# Only embed a small sample for now; never ask for more batches than exist.
# Set this to `all_batches` to embed the whole dataset.
num_batches = min(2, all_batches)

# Ids and embeddings are collected from the same slice, so they stay aligned
# whatever `batch_size` and `num_batches` are set to.
document_ids = []
all_embeddings = []

for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(chunks))
    batch = chunks[start_idx:end_idx]

    # Generate embeddings for the current batch
    batch_embeddings = batch["text_block.text"].progress_apply(
        lambda text: generate_embeddings_for_text(text, model, tokenizer)
    )

    document_ids.extend(batch["document_id"])
    all_embeddings.extend(batch_embeddings)

# Create a DataFrame for all embeddings
embeddings_df = pd.DataFrame({
    "document_id": document_ids,
    "embeddings": all_embeddings
})

# Save the DataFrame to a single CSV file.
# NOTE(review): to_csv writes each embedding as its string representation; if
# the vectors must be reloaded numerically, a binary format such as Parquet may
# be preferable — TODO confirm what generate_embeddings_for_text returns.
embeddings_df.to_csv("data/embeddings.csv", index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Next, use Dense Passage Retrieval (DPR) for question answering, with `chunks["text_block.text"]` as the context passages.