# 1. Set up the Huggingface Climate Policy Radar dataset.

In [43]:
import os
import regex as re
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import pgai
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from datasets import load_dataset, Features, Value
from functions import generate_embeddings_for_text
import glob

tqdm.pandas()

In [2]:
# Login using e.g. `huggingface-cli login` in command line to access this dataset

ds = load_dataset("ClimatePolicyRadar/all-document-text-data")
ds = ds["train"]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
ds.features

{'document_id': Value(dtype='string', id=None),
 'document_metadata.collection_summary': Value(dtype='string', id=None),
 'document_metadata.collection_title': Value(dtype='string', id=None),
 'document_metadata.corpus_type_name': Value(dtype='string', id=None),
 'document_metadata.corpus_import_id': Value(dtype='string', id=None),
 'document_metadata.category': Value(dtype='string', id=None),
 'document_metadata.description': Value(dtype='string', id=None),
 'document_metadata.document_title': Value(dtype='string', id=None),
 'document_metadata.family_import_id': Value(dtype='string', id=None),
 'document_metadata.family_slug': Value(dtype='string', id=None),
 'document_metadata.geographies': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.import_id': Value(dtype='string', id=None),
 'document_metadata.languages': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.metadata': {'author': Sequence(feature=Va

In [4]:
flat_ds = ds.flatten()

In [5]:
for key in flat_ds.features.keys():
    print(type(flat_ds[18:19][key][0]))

<class 'str'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'bool'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'list'>
<class 'int'>
<class 'str'>
<class 'int'>


## Save 100000 chunks in Postgres

In [None]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

load_dotenv # This loads the .env file into os.environ
batch_df = pd.DataFrame(flat_ds[:100000])
# Set up the database connection using SQLAlchemy
engine = create_engine(os.getenv("DB_URL"))

# Write the pandas DataFrame to the PostgreSQL database
batch_df.to_sql('climate_policy_radar', engine, if_exists='replace', index=False)

615

## Attempt to change arrays to strings for entire dataset

Just keeping this here for future reference. I don't think it's needed

In [21]:
def array_to_string(batch):
    keys_to_process = [
        "document_metadata.geographies",
        "document_metadata.languages",
        "languages",
        "pdf_data_page_metadata.dimensions",
        "text_block.coords"
    ]
    for key in keys_to_process:
        # Convert numpy arrays to lists
        batch[key] = [str(x) for x in batch[key]]
    return batch

In [None]:
# Apply the function to the dataset. This takes the longest time
flat_ds = flat_ds.map(array_to_string, batched = True)

## Read the data from the database

So it's easier to access the data in case the kernel crashes and had to re-run the codes again

In [7]:
# Read the table
df = pd.read_sql("SELECT * FROM climate_policy_radar", engine)
df.head()


Unnamed: 0,document_id,document_metadata.collection_summary,document_metadata.collection_title,document_metadata.corpus_type_name,document_metadata.corpus_import_id,document_metadata.category,document_metadata.description,document_metadata.document_title,document_metadata.family_import_id,document_metadata.family_slug,...,pipeline_metadata.parser_metadata.azure_model_id,pipeline_metadata.parser_metadata.parsing_date,text_block.text_block_id,text_block.language,text_block.type,text_block.type_confidence,text_block.coords,text_block.page_number,text_block.text,text_block.index
0,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,0,en,title,1.0,"{{70.452,123.7392},{524.1816,123.7392},{524.18...",0.0,Draft of the National Energy and Climate Plan ...,0
1,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,1,en,Text,1.0,"{{69.7176,208.4256},{124.21440000000001,208.79...",0.0,July 2021,1
2,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,2,en,Text,1.0,"{{217.1952,685.1376},{286.1856,685.1376},{286....",0.0,REPUBLIKA SHOIPERISE,2
3,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,3,en,Text,1.0,"{{195.2928,690.6096},{308.448,690.6096},{308.4...",0.0,MINISTRIA E TURIZMIT DHE MJEDISIT,3
4,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,4,en,Text,1.0,"{{76.9968,708.5015999999999},{182.88,708.1344}...",0.0,MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE,4


# 2. Embeddings generation

## 2.1 Load climateBERT

In [8]:
EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

In [None]:
# Download
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)
model = AutoModelForMaskedLM.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)

# Save it to a  local_models folder
tokenizer.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)



In [9]:
# Load the embedding model
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Checking existing documents' country


In [10]:
query = """
SELECT DISTINCT "document_metadata.geographies"
FROM climate_policy_radar
WHERE "document_metadata.geographies" IS NOT NULL;
"""

geos = pd.read_sql(query, engine)
print(geos)


   document_metadata.geographies
0                          {SRB}
1                          {MKD}
2                          {GBR}
3                          {TUV}
4                          {FRA}
5                          {ALB}
6                          {EUR}
7                          {MNE}
8                          {AZE}
9                          {CAN}
10                         {JPN}
11                         {BRA}
12                         {DEU}
13                         {XKX}
14                         {CHN}
15                         {ZAF}
16                         {BIH}
17                         {IRL}


### Attempt to select only documents by Albania
Testing using Albania for now - This takes a really long time

In [11]:
# This one worked
from tqdm import tqdm

geos = df["document_metadata.geographies"]
filter = ["ALB" in str(x) for x in tqdm(geos, desc="Filtering for ALB")]
alb_chunks = df[filter]

Filtering for ALB: 100%|██████████| 100000/100000 [00:00<00:00, 894725.68it/s]


In [12]:
alb_chunks[["document_id", "document_metadata.geographies", "text_block.text"]].head(10)

Unnamed: 0,document_id,document_metadata.geographies,text_block.text
0,CCLW.document.i00000002.n0000,{ALB},Draft of the National Energy and Climate Plan ...
1,CCLW.document.i00000002.n0000,{ALB},July 2021
2,CCLW.document.i00000002.n0000,{ALB},REPUBLIKA SHOIPERISE
3,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E TURIZMIT DHE MJEDISIT
4,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE
5,CCLW.document.i00000002.n0000,{ALB},german cooperation DEUTSCHE ZUSAMMENARBEIT
6,CCLW.document.i00000002.n0000,{ALB},Implemented by giz
7,CCLW.document.i00000002.n0000,{ALB},Deutsche Gesellschaft Für Internationale Zusam...
8,CCLW.document.i00000002.n0000,{ALB},Responsible for this document: Ministry of Inf...
9,CCLW.document.i00000002.n0000,{ALB},Purpose of this document: Submission to Energy...


In [None]:
# #took too long for me
# #DOUBLE CHECK LATER

# filter = flat_ds["document_metadata.geographies"].astype(str).progress_apply(
#     lambda x: bool(re.search(r"ALB", x))
# )
# alb_chunks = flat_ds[filter]

## 2.2 Save embeddings in csv
Embeddings generated in batches of 1000 chunks.
Note: obsolete, just ignore

In [None]:
# # Ensure the "data" directory exists
# os.makedirs("data", exist_ok=True)

# # Process embeddings in batches of 1000
# batch_size = 1000
# all_batches = (len(flat_ds) + batch_size - 1) // batch_size  # Calculate the number of batches
# num_batches = 2

# all_embeddings = []

# for i in tqdm(range(num_batches)):
#     start_idx = i * batch_size
#     end_idx = min((i + 1) * batch_size, len(flat_ds))
    
#     # Generate embeddings for the current batch
#     batch_embeddings = flat_ds[start_idx:end_idx]["text_block.text"].progress_apply(
#         lambda text: generate_embeddings_for_text(text, model, tokenizer)
#     )
    
#     all_embeddings.extend(batch_embeddings)

# # Create a DataFrame for all embeddings
# embeddings_df = pd.DataFrame({
#     "document_id": flat_ds[:num_batches*1000]["document_id"],
#     "embeddings": all_embeddings
#     "country_code": flat_ds[:num_batches*1000]["document_metadata.geographies"].str.extract(r"\{(\w+)\}")[0],
# })

# # Save the DataFrame to a single CSV file
# embeddings_df.to_csv("data/embeddings.csv", index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Use DPR for question answering, using chunks["text_block.text"] as context.

In [None]:

# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Use the filtered dataset, can add more columns if need to, for now only texts and doc_ids are present
texts = alb_chunks["text_block.text"]
doc_ids = alb_chunks["document_id"]

# Batch processing setup
batch_size = 1000
num_batches = (len(texts) + batch_size - 1) // batch_size

all_embeddings = []
all_doc_ids = []

for i in tqdm(range(num_batches), desc="Generating Embeddings"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(texts))
    
    batch_texts = texts.iloc[start_idx:end_idx]
    batch_ids = doc_ids.iloc[start_idx:end_idx]
    
    batch_embeddings = batch_texts.progress_apply(
        lambda text: generate_embeddings_for_text(text, model, tokenizer)
    )
    
    all_embeddings.extend(batch_embeddings)
    all_doc_ids.extend(batch_ids)

# Create a DataFrame for all embeddings
embeddings_df = pd.DataFrame({
    "document_id": all_doc_ids,
    "embedding": all_embeddings
})

# Save to CSV
embeddings_df.to_csv("data/albania_embeddings.csv", index=False)


Generating Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:   7%|▋         | 1/14 [00:30<06:32, 30.18s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  14%|█▍        | 2/14 [01:06<06:46, 33.91s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  21%|██▏       | 3/14 [01:30<05:23, 29.40s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  29%|██▊       | 4/14 [01:56<04:38, 27.87s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  36%|███▌      | 5/14 [02:19<03:55, 26.22s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  43%|████▎     | 6/14 [02:42<03:19, 24.98s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  50%|█████     | 7/14 [03:01<02:42, 23.15s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  57%|█████▋    | 8/14 [03:25<02:19, 23.26s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  64%|██████▍   | 9/14 [03:50<01:59, 23.83s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  71%|███████▏  | 10/14 [04:12<01:33, 23.46s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  79%|███████▊  | 11/14 [04:33<01:07, 22.49s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  86%|████████▌ | 12/14 [04:53<00:44, 22.00s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  93%|█████████▎| 13/14 [05:14<00:21, 21.49s/it]

  0%|          | 0/797 [00:00<?, ?it/s]

Generating Embeddings: 100%|██████████| 14/14 [05:36<00:00, 24.01s/it]


### Embedding all documents for all countries

Now that I've tested with Albania and it worked, I'll generate embeddings with all documents and save them into csv files first. I will then save them into the database.

This block of code takes a long time. It's a replication of the code for Albania but ensures each csv file has the country name and embeddings for that specific country is saved into their respective csv file.

The rows that do not have string values are skipped (it should not be a problem because the column selected should always have a string value normally)

In [None]:
# Make sure column is string for filtering
df["document_metadata.geographies"] = df["document_metadata.geographies"].astype(str)

# Extract 3-letter codes like 'ALB', 'DEU', etc.
df["country_code"] = df["document_metadata.geographies"].str.extract(r"\{(\w+)\}")

# Get all unique codes
country_codes = df["country_code"].dropna().unique()

# Store each country chunk in a dictionary
country_chunks = {}

for code in tqdm(country_codes, desc="Filtering by country"):
    country_chunks[code] = df[df["country_code"] == code]

Filtering by country: 100%|██████████| 18/18 [00:01<00:00, 13.82it/s]


In [25]:
print(len(country_chunks))

#how many documents for each country
for code, chunk in country_chunks.items():
    print(f"{code}: {len(chunk)} documents")

18
ALB: 13797 documents
BIH: 16552 documents
ZAF: 11132 documents
FRA: 351 documents
MNE: 3152 documents
XKX: 5314 documents
MKD: 14146 documents
SRB: 9976 documents
EUR: 3840 documents
TUV: 4896 documents
BRA: 1067 documents
CAN: 91 documents
DEU: 126 documents
CHN: 644 documents
AZE: 7206 documents
JPN: 119 documents
IRL: 3 documents
GBR: 249 documents


In [None]:
# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Batch size for embedding
batch_size = 10000

# Loop through each country's data in the dictionary
for code, chunk in tqdm(country_chunks.items(), desc="Generating country embeddings"):
    texts = chunk["text_block.text"]
    doc_ids = chunk["document_id"]

    num_batches = (len(texts) + batch_size - 1) // batch_size

    all_embeddings = []
    all_doc_ids = []

    for i in tqdm(range(num_batches), desc=f"Embedding {code}", leave=False):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(texts))

        batch_texts = texts.iloc[start_idx:end_idx]
        batch_ids = doc_ids.iloc[start_idx:end_idx]

        # Reset index so they stay aligned after filtering
        batch_texts = batch_texts.reset_index(drop=True)
        batch_ids = batch_ids.reset_index(drop=True)

        # Filter out invalid texts
        mask = batch_texts.apply(lambda x: isinstance(x, str) and x.strip() != "")
        batch_texts = batch_texts[mask]
        batch_ids = batch_ids[mask]


        batch_embeddings = batch_texts.progress_apply(
            lambda text: generate_embeddings_for_text(text, model, tokenizer)
        )

        all_embeddings.extend(batch_embeddings)
        all_doc_ids.extend(batch_ids)

    # Save to DataFrame
    embeddings_df = pd.DataFrame({
        "document_id": all_doc_ids,
        "embedding": all_embeddings
    })

    # Save to CSV using the country code (e.g. alb_embeddings.csv)
    filename = f"data/{code.lower()}_embeddings.csv"
    embeddings_df.to_csv(filename, index=False)


Generating country embeddings:   0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/3797 [00:00<?, ?it/s]

Generating country embeddings:   6%|▌         | 1/18 [06:03<1:42:53, 363.15s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/6552 [00:00<?, ?it/s]

Generating country embeddings:  11%|█         | 2/18 [14:03<1:55:17, 432.35s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/1132 [00:00<?, ?it/s]

Generating country embeddings:  17%|█▋        | 3/18 [20:48<1:44:54, 419.62s/it]

  0%|          | 0/351 [00:00<?, ?it/s]

Generating country embeddings:  22%|██▏       | 4/18 [21:14<1:01:40, 264.31s/it]

  0%|          | 0/3148 [00:00<?, ?it/s]

Generating country embeddings:  28%|██▊       | 5/18 [24:15<50:46, 234.38s/it]  

  0%|          | 0/5314 [00:00<?, ?it/s]

Generating country embeddings:  33%|███▎      | 6/18 [27:45<45:12, 226.03s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/4144 [00:00<?, ?it/s]

Generating country embeddings:  39%|███▉      | 7/18 [42:13<1:19:52, 435.69s/it]

  0%|          | 0/9974 [00:00<?, ?it/s]

Generating country embeddings:  44%|████▍     | 8/18 [53:55<1:26:44, 520.47s/it]

  0%|          | 0/3840 [00:00<?, ?it/s]

Generating country embeddings:  50%|█████     | 9/18 [57:09<1:02:47, 418.60s/it]

  0%|          | 0/4896 [00:00<?, ?it/s]

Generating country embeddings:  56%|█████▌    | 10/18 [59:07<43:25, 325.67s/it] 

  0%|          | 0/1065 [00:00<?, ?it/s]

Generating country embeddings:  61%|██████    | 11/18 [59:50<27:55, 239.32s/it]

  0%|          | 0/89 [00:00<?, ?it/s]

Generating country embeddings:  67%|██████▋   | 12/18 [59:55<16:47, 167.99s/it]

  0%|          | 0/126 [00:00<?, ?it/s]

Generating country embeddings:  72%|███████▏  | 13/18 [1:00:02<09:56, 119.28s/it]

  0%|          | 0/644 [00:00<?, ?it/s]

Generating country embeddings:  78%|███████▊  | 14/18 [1:01:03<06:46, 101.70s/it]

  0%|          | 0/7206 [00:00<?, ?it/s]

Generating country embeddings:  83%|████████▎ | 15/18 [1:05:45<07:48, 156.00s/it]

  0%|          | 0/119 [00:00<?, ?it/s]

Generating country embeddings:  89%|████████▉ | 16/18 [1:05:51<03:41, 110.77s/it]

  0%|          | 0/3 [00:00<?, ?it/s]

Generating country embeddings:  94%|█████████▍| 17/18 [1:05:51<01:17, 77.53s/it] 

  0%|          | 0/249 [00:00<?, ?it/s]

Generating country embeddings: 100%|██████████| 18/18 [1:06:00<00:00, 220.04s/it]


# 3. Saving the embeddings into the database

Now go to create_table.sql and run the query to create the table. Remember to select the Postgres Server at the bottom, and highlight the code and right click to run query.

In [69]:
# Make sure the dimentions of the embeddings are correct
df = pd.read_csv("data/alb_embeddings.csv")
print(len(eval(df["embedding"][0])))  # Check how many dimensions

#double checking how many embeddings each file has
csv_files = sorted(glob.glob("data/*_embeddings.csv"))
for file in csv_files:
    df=pd.read_csv(file)
    print(f"{os.path.basename(file)} has {len(df)} embeddings")


768
alb_embeddings.csv has 13797 embeddings
aze_embeddings.csv has 7206 embeddings
bih_embeddings.csv has 16552 embeddings
bra_embeddings.csv has 1065 embeddings
can_embeddings.csv has 89 embeddings
chn_embeddings.csv has 644 embeddings
deu_embeddings.csv has 126 embeddings
eur_embeddings.csv has 3840 embeddings
fra_embeddings.csv has 351 embeddings
gbr_embeddings.csv has 249 embeddings
irl_embeddings.csv has 3 embeddings
jpn_embeddings.csv has 119 embeddings
mkd_embeddings.csv has 14144 embeddings
mne_embeddings.csv has 3148 embeddings
srb_embeddings.csv has 9974 embeddings
tuv_embeddings.csv has 4896 embeddings
xkx_embeddings.csv has 5314 embeddings
zaf_embeddings.csv has 11132 embeddings


In [68]:
import glob
import ast
from sqlalchemy.orm import sessionmaker

# Connect to DB
load_dotenv()
engine = create_engine(os.getenv("DB_URL"))
Session = sessionmaker(bind=engine)
session = Session()

# Find all *_embeddings.csv files
csv_files = sorted(glob.glob("data/*_embeddings.csv"))

for file in csv_files:
    country_code = os.path.basename(file)[:3].upper()
    df = pd.read_csv(file)
    df["country_code"] = country_code

    # Parse embeddings from string to list[float]
    df["embedding"] = df["embedding"].apply(
        lambda x: ast.literal_eval(x.strip()) if isinstance(x, str) else x
    )

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Uploading {country_code}"):
        if not row["embedding"]:
            continue

        stmt = text("""
            INSERT INTO document_embeddings (document_id, country_code, embedding)
            VALUES (:document_id, :country_code, :embedding)
        """)

        session.execute(stmt, {
            "document_id": row["document_id"],
            "country_code": row["country_code"],
            "embedding": row["embedding"]
        })

    session.commit()

print("✅ All embeddings uploaded into document_embeddings table.")


Uploading ALB:   0%|          | 0/13797 [00:00<?, ?it/s]

Uploading AZE:   0%|          | 0/7206 [00:00<?, ?it/s]

Uploading BIH:   0%|          | 0/16552 [00:00<?, ?it/s]

Uploading BRA:   0%|          | 0/1065 [00:00<?, ?it/s]

Uploading CAN:   0%|          | 0/89 [00:00<?, ?it/s]

Uploading CHN:   0%|          | 0/644 [00:00<?, ?it/s]

Uploading DEU:   0%|          | 0/126 [00:00<?, ?it/s]

Uploading EUR:   0%|          | 0/3840 [00:00<?, ?it/s]

Uploading FRA:   0%|          | 0/351 [00:00<?, ?it/s]

Uploading GBR:   0%|          | 0/249 [00:00<?, ?it/s]

Uploading IRL:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading JPN:   0%|          | 0/119 [00:00<?, ?it/s]

Uploading MKD:   0%|          | 0/14144 [00:00<?, ?it/s]

Uploading MNE:   0%|          | 0/3148 [00:00<?, ?it/s]

Uploading SRB:   0%|          | 0/9974 [00:00<?, ?it/s]

Uploading TUV:   0%|          | 0/4896 [00:00<?, ?it/s]

Uploading XKX:   0%|          | 0/5314 [00:00<?, ?it/s]

Uploading ZAF:   0%|          | 0/11132 [00:00<?, ?it/s]

✅ All embeddings uploaded into document_embeddings table.
