# 1. Set up the Huggingface Climate Policy Radar dataset.

In [43]:
import os
import regex as re
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import pgai
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from datasets import load_dataset, Features, Value
from functions import generate_embeddings_for_text
import glob

tqdm.pandas()

In [2]:
# Login using e.g. `huggingface-cli login` in command line to access this dataset

ds = load_dataset("ClimatePolicyRadar/all-document-text-data")
ds = ds["train"]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
ds.features

{'document_id': Value(dtype='string', id=None),
 'document_metadata.collection_summary': Value(dtype='string', id=None),
 'document_metadata.collection_title': Value(dtype='string', id=None),
 'document_metadata.corpus_type_name': Value(dtype='string', id=None),
 'document_metadata.corpus_import_id': Value(dtype='string', id=None),
 'document_metadata.category': Value(dtype='string', id=None),
 'document_metadata.description': Value(dtype='string', id=None),
 'document_metadata.document_title': Value(dtype='string', id=None),
 'document_metadata.family_import_id': Value(dtype='string', id=None),
 'document_metadata.family_slug': Value(dtype='string', id=None),
 'document_metadata.geographies': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.import_id': Value(dtype='string', id=None),
 'document_metadata.languages': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'document_metadata.metadata': {'author': Sequence(feature=Va

In [4]:
flat_ds = ds.flatten()

In [5]:
for key in flat_ds.features.keys():
    print(type(flat_ds[18:19][key][0]))

<class 'str'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'list'>
<class 'bool'>
<class 'list'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'list'>
<class 'int'>
<class 'str'>
<class 'int'>


## Save 100000 chunks in Postgres

In [None]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

load_dotenv # This loads the .env file into os.environ
batch_df = pd.DataFrame(flat_ds[:100000])
# Set up the database connection using SQLAlchemy
engine = create_engine(os.getenv("DB_URL"))

# Write the pandas DataFrame to the PostgreSQL database
batch_df.to_sql('climate_policy_radar', engine, if_exists='replace', index=False)

615

## Attempt to change arrays to strings for entire dataset

Just keeping this here for future reference. I don't think it's needed

In [21]:
def array_to_string(batch):
    keys_to_process = [
        "document_metadata.geographies",
        "document_metadata.languages",
        "languages",
        "pdf_data_page_metadata.dimensions",
        "text_block.coords"
    ]
    for key in keys_to_process:
        # Convert numpy arrays to lists
        batch[key] = [str(x) for x in batch[key]]
    return batch

In [None]:
# Apply the function to the dataset. This takes the longest time
flat_ds = flat_ds.map(array_to_string, batched = True)

## Read the data from the database

So it's easier to access the data in case the kernel crashes and had to re-run the codes again

In [7]:
# Read the table
df = pd.read_sql("SELECT * FROM climate_policy_radar", engine)
df.head()


Unnamed: 0,document_id,document_metadata.collection_summary,document_metadata.collection_title,document_metadata.corpus_type_name,document_metadata.corpus_import_id,document_metadata.category,document_metadata.description,document_metadata.document_title,document_metadata.family_import_id,document_metadata.family_slug,...,pipeline_metadata.parser_metadata.azure_model_id,pipeline_metadata.parser_metadata.parsing_date,text_block.text_block_id,text_block.language,text_block.type,text_block.type_confidence,text_block.coords,text_block.page_number,text_block.text,text_block.index
0,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,0,en,title,1.0,"{{70.452,123.7392},{524.1816,123.7392},{524.18...",0.0,Draft of the National Energy and Climate Plan ...,0
1,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,1,en,Text,1.0,"{{69.7176,208.4256},{124.21440000000001,208.79...",0.0,July 2021,1
2,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,2,en,Text,1.0,"{{217.1952,685.1376},{286.1856,685.1376},{286....",0.0,REPUBLIKA SHOIPERISE,2
3,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,3,en,Text,1.0,"{{195.2928,690.6096},{308.448,690.6096},{308.4...",0.0,MINISTRIA E TURIZMIT DHE MJEDISIT,3
4,CCLW.document.i00000002.n0000,,,Laws and Policies,CCLW.corpus.i00000001.n0000,Executive,"<p><span style=""font-size: 10pt;font-family: A...",National Energy and Climate Plan 2019 Draft,CCLW.family.i00000001.n0000,national-energy-and-climate-plan_8a4f,...,prebuilt-document,2023-12-11T11:43:23.509480,4,en,Text,1.0,"{{76.9968,708.5015999999999},{182.88,708.1344}...",0.0,MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE,4


# 2. Embeddings generation

## 2.1 Load climateBERT

In [8]:
EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

In [None]:
# Download
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)
model = AutoModelForMaskedLM.from_pretrained(EMBEDDING_MODEL, use_auth_token=False)

# Save it to a  local_models folder
tokenizer.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model.save_pretrained(EMBEDDING_MODEL_LOCAL_DIR)



In [9]:
# Load the embedding model
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Checking existing documents' country


In [10]:
query = """
SELECT DISTINCT "document_metadata.geographies"
FROM climate_policy_radar
WHERE "document_metadata.geographies" IS NOT NULL;
"""

geos = pd.read_sql(query, engine)
print(geos)


   document_metadata.geographies
0                          {SRB}
1                          {MKD}
2                          {GBR}
3                          {TUV}
4                          {FRA}
5                          {ALB}
6                          {EUR}
7                          {MNE}
8                          {AZE}
9                          {CAN}
10                         {JPN}
11                         {BRA}
12                         {DEU}
13                         {XKX}
14                         {CHN}
15                         {ZAF}
16                         {BIH}
17                         {IRL}


### Attempt to select only documents by Albania
Testing using Albania for now - This takes a really long time

In [11]:
# This one worked
from tqdm import tqdm

geos = df["document_metadata.geographies"]
filter = ["ALB" in str(x) for x in tqdm(geos, desc="Filtering for ALB")]
alb_chunks = df[filter]

Filtering for ALB: 100%|██████████| 100000/100000 [00:00<00:00, 894725.68it/s]


In [12]:
alb_chunks[["document_id", "document_metadata.geographies", "text_block.text"]].head(10)

Unnamed: 0,document_id,document_metadata.geographies,text_block.text
0,CCLW.document.i00000002.n0000,{ALB},Draft of the National Energy and Climate Plan ...
1,CCLW.document.i00000002.n0000,{ALB},July 2021
2,CCLW.document.i00000002.n0000,{ALB},REPUBLIKA SHOIPERISE
3,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E TURIZMIT DHE MJEDISIT
4,CCLW.document.i00000002.n0000,{ALB},MINISTRIA E INFRASTRUKTURĒS DHE ENERGJISE
5,CCLW.document.i00000002.n0000,{ALB},german cooperation DEUTSCHE ZUSAMMENARBEIT
6,CCLW.document.i00000002.n0000,{ALB},Implemented by giz
7,CCLW.document.i00000002.n0000,{ALB},Deutsche Gesellschaft Für Internationale Zusam...
8,CCLW.document.i00000002.n0000,{ALB},Responsible for this document: Ministry of Inf...
9,CCLW.document.i00000002.n0000,{ALB},Purpose of this document: Submission to Energy...


In [None]:
# #took too long for me
# #DOUBLE CHECK LATER

# filter = flat_ds["document_metadata.geographies"].astype(str).progress_apply(
#     lambda x: bool(re.search(r"ALB", x))
# )
# alb_chunks = flat_ds[filter]

## 2.2 Save embeddings in csv
Embeddings generated in batches of 1000 chunks.
Note: obsolete, just ignore

In [None]:
# # Ensure the "data" directory exists
# os.makedirs("data", exist_ok=True)

# # Process embeddings in batches of 1000
# batch_size = 1000
# all_batches = (len(flat_ds) + batch_size - 1) // batch_size  # Calculate the number of batches
# num_batches = 2

# all_embeddings = []

# for i in tqdm(range(num_batches)):
#     start_idx = i * batch_size
#     end_idx = min((i + 1) * batch_size, len(flat_ds))
    
#     # Generate embeddings for the current batch
#     batch_embeddings = flat_ds[start_idx:end_idx]["text_block.text"].progress_apply(
#         lambda text: generate_embeddings_for_text(text, model, tokenizer)
#     )
    
#     all_embeddings.extend(batch_embeddings)

# # Create a DataFrame for all embeddings
# embeddings_df = pd.DataFrame({
#     "document_id": flat_ds[:num_batches*1000]["document_id"],
#     "embeddings": all_embeddings
#     "country_code": flat_ds[:num_batches*1000]["document_metadata.geographies"].str.extract(r"\{(\w+)\}")[0],
# })

# # Save the DataFrame to a single CSV file
# embeddings_df.to_csv("data/embeddings.csv", index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Use DPR for question answering, using chunks["text_block.text"] as context.

In [None]:

# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Use the filtered dataset, can add more columns if need to, for now only texts and doc_ids are present
texts = alb_chunks["text_block.text"]
doc_ids = alb_chunks["document_id"]

# Batch processing setup
batch_size = 1000
num_batches = (len(texts) + batch_size - 1) // batch_size

all_embeddings = []
all_doc_ids = []

for i in tqdm(range(num_batches), desc="Generating Embeddings"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(texts))
    
    batch_texts = texts.iloc[start_idx:end_idx]
    batch_ids = doc_ids.iloc[start_idx:end_idx]
    
    batch_embeddings = batch_texts.progress_apply(
        lambda text: generate_embeddings_for_text(text, model, tokenizer)
    )
    
    all_embeddings.extend(batch_embeddings)
    all_doc_ids.extend(batch_ids)

# Create a DataFrame for all embeddings
embeddings_df = pd.DataFrame({
    "document_id": all_doc_ids,
    "embedding": all_embeddings
})

# Save to CSV
embeddings_df.to_csv("data/albania_embeddings.csv", index=False)


Generating Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:   7%|▋         | 1/14 [00:30<06:32, 30.18s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  14%|█▍        | 2/14 [01:06<06:46, 33.91s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  21%|██▏       | 3/14 [01:30<05:23, 29.40s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  29%|██▊       | 4/14 [01:56<04:38, 27.87s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  36%|███▌      | 5/14 [02:19<03:55, 26.22s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  43%|████▎     | 6/14 [02:42<03:19, 24.98s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  50%|█████     | 7/14 [03:01<02:42, 23.15s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  57%|█████▋    | 8/14 [03:25<02:19, 23.26s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  64%|██████▍   | 9/14 [03:50<01:59, 23.83s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  71%|███████▏  | 10/14 [04:12<01:33, 23.46s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  79%|███████▊  | 11/14 [04:33<01:07, 22.49s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  86%|████████▌ | 12/14 [04:53<00:44, 22.00s/it]

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating Embeddings:  93%|█████████▎| 13/14 [05:14<00:21, 21.49s/it]

  0%|          | 0/797 [00:00<?, ?it/s]

Generating Embeddings: 100%|██████████| 14/14 [05:36<00:00, 24.01s/it]


### Embedding all documents for all countries

Now that I've tested with Albania and it worked, I'll generate embeddings with all documents and save them into csv files first. I will then save them into the database.

This block of code takes a long time. It's a replication of the code for Albania but ensures each csv file has the country name and embeddings for that specific country is saved into their respective csv file.

The rows that do not have string values are skipped (it should not be a problem because the column selected should always have a string value normally)

In [None]:
# Make sure column is string for filtering
df["document_metadata.geographies"] = df["document_metadata.geographies"].astype(str)

# Extract 3-letter codes like 'ALB', 'DEU', etc.
df["country_code"] = df["document_metadata.geographies"].str.extract(r"\{(\w+)\}")

# Get all unique codes
country_codes = df["country_code"].dropna().unique()

# Store each country chunk in a dictionary
country_chunks = {}

for code in tqdm(country_codes, desc="Filtering by country"):
    country_chunks[code] = df[df["country_code"] == code]

Filtering by country: 100%|██████████| 18/18 [00:01<00:00, 13.82it/s]


In [25]:
print(len(country_chunks))

#how many documents for each country
for code, chunk in country_chunks.items():
    print(f"{code}: {len(chunk)} documents")

18
ALB: 13797 documents
BIH: 16552 documents
ZAF: 11132 documents
FRA: 351 documents
MNE: 3152 documents
XKX: 5314 documents
MKD: 14146 documents
SRB: 9976 documents
EUR: 3840 documents
TUV: 4896 documents
BRA: 1067 documents
CAN: 91 documents
DEU: 126 documents
CHN: 644 documents
AZE: 7206 documents
JPN: 119 documents
IRL: 3 documents
GBR: 249 documents


In [None]:
# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Batch size for embedding
batch_size = 10000

# Loop through each country's data in the dictionary
for code, chunk in tqdm(country_chunks.items(), desc="Generating country embeddings"):
    texts = chunk["text_block.text"]
    doc_ids = chunk["document_id"]

    num_batches = (len(texts) + batch_size - 1) // batch_size

    all_embeddings = []
    all_doc_ids = []

    for i in tqdm(range(num_batches), desc=f"Embedding {code}", leave=False):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(texts))

        batch_texts = texts.iloc[start_idx:end_idx]
        batch_ids = doc_ids.iloc[start_idx:end_idx]

        # Reset index so they stay aligned after filtering
        batch_texts = batch_texts.reset_index(drop=True)
        batch_ids = batch_ids.reset_index(drop=True)

        # Filter out invalid texts
        mask = batch_texts.apply(lambda x: isinstance(x, str) and x.strip() != "")
        batch_texts = batch_texts[mask]
        batch_ids = batch_ids[mask]


        batch_embeddings = batch_texts.progress_apply(
            lambda text: generate_embeddings_for_text(text, model, tokenizer)
        )

        all_embeddings.extend(batch_embeddings)
        all_doc_ids.extend(batch_ids)

    # Save to DataFrame
    embeddings_df = pd.DataFrame({
        "document_id": all_doc_ids,
        "embedding": all_embeddings
    })

    # Save to CSV using the country code (e.g. alb_embeddings.csv)
    filename = f"data/{code.lower()}_embeddings.csv"
    embeddings_df.to_csv(filename, index=False)


Generating country embeddings:   0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/3797 [00:00<?, ?it/s]

Generating country embeddings:   6%|▌         | 1/18 [06:03<1:42:53, 363.15s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/6552 [00:00<?, ?it/s]

Generating country embeddings:  11%|█         | 2/18 [14:03<1:55:17, 432.35s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/1132 [00:00<?, ?it/s]

Generating country embeddings:  17%|█▋        | 3/18 [20:48<1:44:54, 419.62s/it]

  0%|          | 0/351 [00:00<?, ?it/s]

Generating country embeddings:  22%|██▏       | 4/18 [21:14<1:01:40, 264.31s/it]

  0%|          | 0/3148 [00:00<?, ?it/s]

Generating country embeddings:  28%|██▊       | 5/18 [24:15<50:46, 234.38s/it]  

  0%|          | 0/5314 [00:00<?, ?it/s]

Generating country embeddings:  33%|███▎      | 6/18 [27:45<45:12, 226.03s/it]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/4144 [00:00<?, ?it/s]

Generating country embeddings:  39%|███▉      | 7/18 [42:13<1:19:52, 435.69s/it]

  0%|          | 0/9974 [00:00<?, ?it/s]

Generating country embeddings:  44%|████▍     | 8/18 [53:55<1:26:44, 520.47s/it]

  0%|          | 0/3840 [00:00<?, ?it/s]

Generating country embeddings:  50%|█████     | 9/18 [57:09<1:02:47, 418.60s/it]

  0%|          | 0/4896 [00:00<?, ?it/s]

Generating country embeddings:  56%|█████▌    | 10/18 [59:07<43:25, 325.67s/it] 

  0%|          | 0/1065 [00:00<?, ?it/s]

Generating country embeddings:  61%|██████    | 11/18 [59:50<27:55, 239.32s/it]

  0%|          | 0/89 [00:00<?, ?it/s]

Generating country embeddings:  67%|██████▋   | 12/18 [59:55<16:47, 167.99s/it]

  0%|          | 0/126 [00:00<?, ?it/s]

Generating country embeddings:  72%|███████▏  | 13/18 [1:00:02<09:56, 119.28s/it]

  0%|          | 0/644 [00:00<?, ?it/s]

Generating country embeddings:  78%|███████▊  | 14/18 [1:01:03<06:46, 101.70s/it]

  0%|          | 0/7206 [00:00<?, ?it/s]

Generating country embeddings:  83%|████████▎ | 15/18 [1:05:45<07:48, 156.00s/it]

  0%|          | 0/119 [00:00<?, ?it/s]

Generating country embeddings:  89%|████████▉ | 16/18 [1:05:51<03:41, 110.77s/it]

  0%|          | 0/3 [00:00<?, ?it/s]

Generating country embeddings:  94%|█████████▍| 17/18 [1:05:51<01:17, 77.53s/it] 

  0%|          | 0/249 [00:00<?, ?it/s]

Generating country embeddings: 100%|██████████| 18/18 [1:06:00<00:00, 220.04s/it]


# 3. Saving the embeddings into the database

In [None]:
load_dotenv()
engine = create_engine(os.getenv("DB_URL"))

# Load and run SQL file
with open("create_table.sql", "r") as file:
    sql_commands = file.read()

with engine.begin() as conn:  # use begin() for implicit commit
    conn.execute(text(create_table_sql))

print("✅ Table 'document_embeddings' created (or already exists) in public schema.")

✅ Table 'document_embeddings' created (or already exists) in public schema.


In [None]:
# Make sure the dimentions of the embeddings are correct
df = pd.read_csv("data/alb_embeddings.csv")
print(len(eval(df["embedding"][0])))  # Check how many dimensions


768


In [53]:
# Find all CSV files like 'alb_embeddings.csv'
csv_files = glob.glob("data/*_embeddings.csv")

for file in csv_files:
    country_code = os.path.basename(file)[:3].upper()  # Get 'ALB', 'CHN', etc.

    # Load CSV
    df = pd.read_csv(file)

    # Add country_code column
    df["country_code"] = country_code

    # Convert embedding strings like "[0.1, 0.2]" → "ARRAY[0.1,0.2]"
    df["embedding"] = df["embedding"].apply(
        lambda x: f"ARRAY{x.strip('[]')}" if isinstance(x, str) else None
    )

    # Prepare and insert rows
    # with engine.begin() as conn:
    #     for _, row in df.iterrows():
    #         conn.execute(text("""
    #             INSERT INTO document_embeddings (document_id, country_code, embedding)
    #             VALUES (:document_id, :country_code, :embedding::vector)
    #             ON CONFLICT (document_id) DO NOTHING;
    #         """), {
    #             "document_id": row["document_id"],
    #             "country_code": row["country_code"],
    #             "embedding": row["embedding"]
    #         })
    
    from sqlalchemy import text

with engine.begin() as conn:
    for _, row in df.iterrows():
        if row["embedding"] is None:
            continue

        # Clean embedding to be valid ARRAY[...] format
        if isinstance(row["embedding"], str):
            # Ensure it's not double wrapped or malformed
            embedding_str = row["embedding"].strip("[]")  # removes outer []
        else:
            # In case it's already a list of floats
            embedding_str = ','.join(map(str, row["embedding"]))

        embedding_sql = f"ARRAY[{embedding_str}]::vector"

        sql = f"""
            INSERT INTO document_embeddings (document_id, country_code, embedding)
            VALUES (:document_id, :country_code, {embedding_sql})
            ON CONFLICT (document_id) DO NOTHING;
        """

        conn.execute(text(sql), {
            "document_id": row["document_id"],
            "country_code": row["country_code"]
        })

print("✅ All embeddings uploaded successfully.")

ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "-"
LINE 3: ...CCLW.document.i00000004.n0000', 'BIH', ARRAY[ARRAY-0.0395717...
                                                             ^

[SQL: 
            INSERT INTO document_embeddings (document_id, country_code, embedding)
            VALUES (%(document_id)s, %(country_code)s, ARRAY[ARRAY-0.03957175835967064, 0.06338946521282196, 0.028361350297927856, -0.10575797408819199, 0.01713615655899048, -0.016073910519480705, -0.0431273952126503, 0.01931213214993477, 0.09040458500385284, -0.07266988605260849, 0.016095779836177826, 0.08826948702335358, -0.001592898741364479, -0.055431511253118515, 0.008327774703502655, 0.09592875093221664, 0.006416813470423222, 0.08755200356245041, 0.05708659440279007, -0.0314825214445591, -0.11328034102916718, 0.07778917998075485, 0.09363071620464325, 0.07077360153198242, 0.03861020505428314, 0.026525933295488358, -0.008160434663295746, 0.045409973710775375, 0.06601568311452866, -0.01884964480996132, -0.10273757576942444, -0.08917813003063202, 0.07605835795402527, 0.0245533250272274, -0.010040080174803734, 0.07939748466014862, 0.010620202869176865, 0.007267214357852936, -0.06479272246360779, 0.03498169779777527, -0.03534463047981262, 0.02750881388783455, 0.03476047143340111, -0.01816025748848915, 0.08508478105068207, 0.06974437087774277, 0.07848687469959259, 0.018769096583127975, -0.0076122283935546875, -0.05208718776702881, -0.01594211906194687, 0.07711665332317352, 0.037863701581954956, -0.012768261134624481, -0.0409088097512722, 0.07133087515830994, 0.005411561578512192, 0.11100782454013824, 0.018701892346143723, -0.018510274589061737, -0.07646166533231735, -0.19356754422187805, -0.0336117185652256, 0.0021980255842208862, 0.035359807312488556, -0.04568193480372429, 0.0500054731965065, 0.030020780861377716, 0.010037560015916824, 0.024419676512479782, -0.02478320151567459, -0.027414916083216667, 0.0396273136138916, -0.0721483826637268, 0.006707243621349335, 0.0408756360411644, 0.005860380828380585, 0.9381948709487915, -0.0519334077835083, 0.07862873375415802, 0.04265187680721283, -0.09060950577259064, 0.3947744071483612, 0.09206067025661469, 0.014317236840724945, -0.06035977229475975, 0.06498116254806519, 0.048997532576322556, -0.017111830413341522, 0.03183643892407417, -0.0002550855278968811, 0.032208096235990524, -0.073503278195858, -0.01610301062464714, 0.01976410485804081, 0.03444892168045044, 0.01358981803059578, 0.05545547604560852, 0.023494940251111984, -0.0636565238237381, 0.04208434745669365, -0.060725197196006775, 0.03290194645524025, 0.06866736710071564, 0.005713760852813721, -0.02618156373500824, 0.04668118804693222, -0.04189913347363472, 0.05432543158531189, 0.04623384401202202, 0.00837107002735138, 0.055428843945264816, 0.06566374003887177, 0.028812553733587265, 0.060684338212013245, 0.01802719198167324, -0.06380288302898407, 0.03342659771442413, 0.1877727210521698, 0.0644659772515297, 0.08499184250831604, 0.09022685885429382, 0.05631563067436218, 0.006324291229248047, -0.06725937873125076, -0.0017078295350074768, -0.061463385820388794, 0.14378635585308075, -0.03707684203982353, -0.03580338507890701, 0.04241500794887543, -0.1317022293806076, 0.0906328409910202, 0.12080784142017365, 0.08145276457071304, 0.018607329577207565, 0.02535601705312729, -0.004765741527080536, 0.06346625089645386, 0.0647466704249382, 0.08209484070539474, -0.06650024652481079, 0.006017673760652542, -0.015025626868009567, 0.0973689928650856, 0.09164291620254517, -0.03977186605334282, -0.07211756706237793, -0.009448304772377014, -0.003253220347687602, 0.08249394595623016, -0.05990321561694145, -0.013864221051335335, -0.01646725833415985, -0.09965524822473526, 0.6349496245384216, 0.14885911345481873, -0.00846610963344574, -0.00474170595407486, 0.015021875500679016, 0.14330926537513733, 0.0196889229118824, 0.0055047329515218735, 0.013526074588298798, 0.027032136917114258, -0.008236546069383621, 0.02328966185450554, -0.006161842495203018, 0.030984897166490555, 0.0018306449055671692, 0.008810242637991905, 0.04211830347776413, -0.0944255143404007, 0.003260992467403412, -0.019277825951576233, 0.00796167179942131, 0.11685483157634735, -0.008612588047981262, -0.05491459369659424, 0.05529772862792015, -0.056134164333343506, 0.04465550556778908, -0.09087717533111572, 0.04059602692723274, -0.018167797476053238, -0.003004811704158783, -0.026568666100502014, 0.0228091012686491, 0.11004513502120972, 0.04261970520019531, 0.05435376241803169, -0.05884398892521858, 0.010817542672157288, 0.023300915956497192, -0.02199804037809372, 0.08745762705802917, -0.02612055093050003, -0.012252803891897202, 0.07665475457906723, 0.05630671977996826, -0.00869525596499443, -0.09492900967597961, 0.12237302958965302, -0.06929217278957367, 0.09516408294439316, 0.05638366937637329, -0.06117621809244156, 0.1375335156917572, -0.009560022503137589, -0.07113591581583023, -0.009812746196985245, 0.11152279376983643, 0.011607436463236809, 0.03347545489668846, 0.04289500042796135, -0.09001660346984863, 0.007114045321941376, 0.21338559687137604, 0.06233808025717735, 0.009754229336977005, 0.12603969871997833, 0.022849995642900467, -0.020478084683418274, 0.07118260860443115, 0.014592364430427551, 0.06710486114025116, 0.056377165019512177, 0.02920987457036972, 0.03419572487473488, 0.07435912638902664, -0.014294654130935669, -0.008036717772483826, -0.04944053664803505, 0.05906623601913452, 0.08459679037332535, -0.07761384546756744, -0.02822679653763771, 0.02288772538304329, -0.03655896335840225, -0.05415184423327446, -0.06546288728713989, 0.10750961303710938, 0.06791126728057861, 0.16194114089012146, 0.07065458595752716, 0.09624715149402618, 0.031461190432310104, 0.10624342411756516, 0.010582514107227325, 0.0606272853910923, 0.0019421037286520004, -0.03746689856052399, 0.08718664944171906, 0.002957519143819809, -0.0659080445766449, 0.0171841848641634, -0.06151130795478821, 0.013554129749536514, -0.0006283791735768318, -0.04995117336511612, 0.03573513403534889, -0.024263059720396996, 0.016900036484003067, 0.019153054803609848, -0.06950683146715164, -0.06266938149929047, -0.0757206529378891, 0.07078728079795837, 0.06141451373696327, -0.04612138494849205, 0.08813969045877457, -0.04774877056479454, 0.045716866850852966, -0.02440350502729416, 0.05067431926727295, 0.11102542281150818, 0.07198499143123627, 0.057745419442653656, 0.042383234947919846, -0.028417423367500305, -0.045622993260622025, -0.00020670145750045776, -0.1018238291144371, 0.09008244425058365, 0.025214146822690964, -0.032327115535736084, -0.007198397070169449, 0.030533842742443085, 0.04046804457902908, 0.0018157362937927246, 0.01493581011891365, 0.02554469183087349, -0.0674794614315033, 0.09423410892486572, -0.02225782535970211, -0.004341784864664078, 0.06814546883106232, -0.006032079458236694, -0.06067752093076706, -0.06936287879943848, -0.08534123748540878, -0.0904577448964119, -0.049484800547361374, -0.10174713283777237, 0.02513793110847473, -0.0058412738144397736, 0.010134276002645493, -0.01939595490694046, 0.041235096752643585, 0.07740148901939392, -0.11920865625143051, -0.07192986458539963, -0.006640871986746788, 0.09157858788967133, -0.03381969779729843, 0.03557167202234268, -0.015505265444517136, -0.03490489348769188, 0.08518590033054352, 0.03198358416557312, 0.016719412058591843, 0.05668283998966217, 0.06332175433635712, 0.03563288226723671, 0.04877549409866333, 0.13342009484767914, 0.08641549944877625, 0.07693961262702942, 0.023368943482637405, 0.530616283416748, -0.32494986057281494, -0.010798897594213486, 0.07356926798820496, 0.03048718348145485, 0.11398075520992279, 0.026948150247335434, 0.05998695641756058, 0.05995318666100502, 0.15932103991508484, 0.12414237856864929, -0.02633345127105713, -0.031076140701770782, -0.029038511216640472, 0.05623072385787964, 0.08156374096870422, 0.02224394865334034, 0.09539063274860382, 0.017712587490677834, -0.03299418464303017, 0.008230798877775669, 0.04969315603375435, 0.10082488507032394, 0.0002742372453212738, -0.016291411593556404, 0.024019695818424225, 0.06962649524211884, 0.04770928993821144, 0.0533561035990715, 0.04549004137516022, -0.05120089277625084, 0.05479113385081291, -0.031982917338609695, 0.04257182776927948, 0.006288901902735233, 0.10047949850559235, -0.037513431161642075, -0.01964637264609337, 0.0681658536195755, 0.05486041307449341, 0.04773133993148804, 0.04960736632347107, 0.022901836782693863, 0.06009671092033386, 0.028634333983063698, 0.05384881794452667, 0.015500679612159729, 0.02383986860513687, 0.044513039290905, -0.041236668825149536, 0.21220427751541138, 0.03225693106651306, 0.059554070234298706, -0.04055684059858322, 0.0035461299121379852, 0.174188032746315, 0.026010222733020782, 0.07735083997249603, -0.016869492828845978, 0.05575154721736908, -0.01968234032392502, 0.04968946427106857, 0.0036408454179763794, -0.03206793591380119, -0.03648626059293747, 0.08995530009269714, 0.05756811425089836, -0.05626624822616577, -0.07047021389007568, 0.003099631518125534, -0.06167234480381012, -0.017919234931468964, 0.02293502911925316, -0.0687122493982315, 0.03194352611899376, 0.08243967592716217, 0.07796937227249146, -0.06206688657402992, -0.044605400413274765, -0.0045350342988967896, -0.09029309451580048, 0.057078853249549866, 0.08350073546171188, 0.08581896871328354, -0.012010190635919571, 0.06348671019077301, -0.02005504071712494, -0.02936524711549282, 0.01960677281022072, -0.0038895420730113983, 0.0012574493885040283, 0.00425112247467041, 0.07101040333509445, 0.09849069267511368, -0.007175754755735397, -0.002396378666162491, -0.04840308800339699, 0.10975361615419388, -0.15741467475891113, -0.0441756471991539, -0.06122719496488571, 0.0016593486070632935, 0.06147242709994316, -0.10257621854543686, 0.026873257011175156, 0.05989387258887291, -0.025717608630657196, 0.030667513608932495, -0.022213008254766464, -0.0843997374176979, -0.06348282098770142, 0.041214436292648315, -0.045243725180625916, -0.008553430438041687, 0.1831098198890686, 0.02254212275147438, 0.02263501100242138, 0.1001041978597641, -0.010846827179193497, 0.050408847630023956, 0.037127476185560226, -0.06080145761370659, 0.06607002019882202, -0.001368863508105278, -0.24698707461357117, 0.10473926365375519, 0.19825634360313416, 0.05586211755871773, 0.07117919623851776, -0.10085460543632507, 0.017754800617694855, 0.02760046347975731, 0.17601919174194336, 0.006336864084005356, 0.00705583393573761, 0.017731554806232452, -0.0030527263879776, -0.0036849789321422577, 0.043964896351099014, 0.03862587362527847, 0.030167197808623314, 0.09248398244380951, 0.10776790976524353, -0.019394485279917717, -0.03224978595972061, -0.011558767408132553, 0.03637230768799782, -0.03499908745288849, -0.022624652832746506, 0.04547920823097229, -0.02355216071009636, -0.08669573068618774, 0.06563198566436768, 0.06826318055391312, 0.057934798300266266, 0.010228349827229977, -0.005859449505805969, 0.02312515303492546, 0.055970609188079834, 0.00992562621831894, -0.0014027506113052368, -0.10397161543369293, -0.022814728319644928, 0.11759190261363983, 0.13100621104240417, 0.5008354783058167, 0.1011701375246048, 0.3724170923233032, -0.014473605901002884, 0.07995954155921936, 0.00291481614112854, 0.040398359298706055, -0.03385539725422859, -0.000762108713388443, -0.05502067133784294, 0.024903440847992897, 0.12011533975601196, -0.12237170338630676, -0.05596155673265457, 0.11269714683294296, -0.0018630214035511017, -0.023201899603009224, -0.023302927613258362, 0.22692082822322845, 0.008876752108335495, 0.012258660048246384, -0.0036928877234458923, 0.12920986115932465, -0.053224049508571625, -0.04997982829809189, 0.053935423493385315, 0.03535677120089531, -0.0162096805870533, -0.016349326819181442, -0.02638918161392212, -0.03474506363272667, -0.020037977024912834, -0.0027733519673347473, 0.004120800644159317, 0.06458060443401337, -0.03201769292354584, -0.02657010778784752, -0.08473619073629379, 0.11455222964286804, 0.040673740208148956, 0.02296760305762291, 0.1033797413110733, 0.028986722230911255, -0.00218125618994236, -0.02179758995771408, -0.005173426121473312, -0.0508178174495697, 0.0748014822602272, 0.0710773766040802, 0.03862875699996948, 0.1360073983669281, 0.05922914668917656, -0.004011578857898712, -0.035407401621341705, 0.0953054130077362, 0.1072535514831543, 0.013868693262338638, -1.0404859781265259, 0.008572179824113846, 0.12625358998775482, -0.023907754570245743, 0.04206971079111099, 0.06515859067440033, 0.03463897481560707, -0.07333546876907349, 0.06661748886108398, 0.009866029024124146, 0.0840582475066185, -0.013203531503677368, 0.1023084968328476, -0.1014629602432251, 0.03071758709847927, 0.12163759768009186, -0.008407138288021088, -0.02470521256327629, 0.03376907482743263, -0.21764366328716278, 0.03834023326635361, 0.05393892526626587, 0.13364562392234802, -0.019977569580078125, 0.08102593570947647, 0.0785892978310585, -0.011250939220190048, 0.042702145874500275, 0.10782895237207413, -0.01920948550105095, 0.09689150750637054, 0.011797018349170685, -0.04092412441968918, 0.042842961847782135, 0.09228263795375824, 0.10148723423480988, 0.07021410018205643, 10.838075637817383, -0.03633015975356102, -0.0063912831246852875, 0.01366189494729042, 0.06772810220718384, -0.10232272744178772, 0.08365140855312347, -0.04652111977338791, 0.06340117752552032, 0.08948367089033127, -0.016687743365764618, -0.013631682842969894, 0.01786808855831623, -0.0017002522945404053, 0.0025266259908676147, -0.0084700807929039, 0.016989249736070633, -0.019615959376096725, 0.027798522263765335, -0.005369976162910461, -0.07135835289955139, 0.028503596782684326, 0.11511203646659851, -0.057109128683805466, -0.02424616925418377, 0.1532299816608429, 0.12073501199483871, -0.0077199786901474, -0.05797984078526497, 0.02074747532606125, 0.06074333190917969, 0.049487750977277756, 0.06330239772796631, 0.038706857711076736, 0.1045287549495697, -0.014578020200133324, -0.028517086058855057, 0.09507513791322708, 0.019363094121217728, 0.011663567274808884, 0.10847269743680954, 0.015901077538728714, 0.07749252766370773, 0.10642430186271667, 0.0285620279610157, 0.04083096981048584, 0.12117013335227966, 0.09223520755767822, 0.05740297958254814, 0.019477155059576035, 0.03634848818182945, -0.004934780299663544, 0.1362578421831131, 0.013179594650864601, -0.037638403475284576, -0.07164473831653595, 0.030592933297157288, -0.013279743492603302, 0.07429824769496918, 0.02461077645421028, -0.0003330595791339874, 0.09098503738641739, 0.03409966081380844, 0.13036203384399414, 0.047144122421741486, 0.05794946849346161, 0.09170106053352356, 0.106241375207901, -0.06811761856079102, -0.020869113504886627, 0.09878043085336685, -0.13613633811473846, 0.028444770723581314, -0.03426199406385422, -0.05079783499240875, 0.021771714091300964, 0.025238826870918274, 0.018371932208538055, -0.008234899491071701, -0.038696229457855225, 0.11967597901821136, 0.018246226012706757, -0.07709024101495743, -0.01842588558793068, 0.0697682648897171, 0.09480146318674088, -0.017784863710403442, 0.07524827122688293, 0.02816363424062729, -0.05999119207262993, 0.0686882957816124, -0.0449671670794487, -0.005431480705738068, -0.004616670310497284, 0.042956024408340454, -0.08259984105825424, 0.02837383933365345, -0.07588709890842438, 0.032089702785015106, 0.06417842209339142, 0.0035163648426532745, 0.04907029867172241, 0.05775686353445053, 0.12000459432601929, 0.023516077548265457, 0.03482275456190109, 0.0012039989233016968, 0.019557636231184006, 0.029281089082360268, 0.0834575891494751, 0.0021601635962724686, -0.005618158727884293, 0.0411161445081234, -0.11430571973323822, 0.0431954525411129, 0.013372255489230156, -0.03515911102294922, -0.037089671939611435, 0.06296312808990479, 0.091233029961586, -0.008356988430023193, -0.029102269560098648, -0.04753977060317993, 0.09020531177520752, 0.07632946223020554, -0.010097332298755646, -0.07338481396436691, 0.011153299361467361, -0.053036585450172424, -0.04483472928404808, -0.001207347959280014, 0.0785905048251152, 0.11368201673030853, -0.044870030134916306, 0.10633157938718796, 0.006839364767074585, -0.013609826564788818, 0.05563489347696304, 0.04543202370405197, 0.10772936046123505, -0.027662035077810287, 0.046452075242996216, 0.08127939701080322, -0.06132978945970535, 0.010663159191608429, 0.08945038914680481, -0.0028303563594818115, -0.01641407236456871, 0.10206350684165955, -0.03911895304918289, 0.012275174260139465, 0.054434649646282196, 0.0032044146209955215, 0.024296000599861145, 0.0010965950787067413, -0.06606739014387131, -0.07169681787490845, 0.1140751764178276, -0.005438387393951416, 0.011046167463064194, -0.012322865426540375, 0.021174971014261246, 0.10653138160705566, -0.0006009526550769806, -0.025326520204544067, 0.027917031198740005, -0.025899354368448257, 0.025248803198337555, 0.11777587234973907, -0.02980343997478485, 0.00048717111349105835, 0.001087278127670288, -0.1232876405119896, -0.0847553163766861, 0.009478297084569931, 0.06675538420677185, 0.08640553802251816, -0.21355432271957397, -0.07074439525604248, -0.013479523360729218]::vector)
            ON CONFLICT (document_id) DO NOTHING;
        ]
[parameters: {'document_id': 'CCLW.document.i00000004.n0000', 'country_code': 'BIH'}]
(Background on this error at: https://sqlalche.me/e/20/f405)