In [3]:
from pinecone import Pinecone
import os
import re
from tqdm import tqdm

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("public")

In [None]:
from google.cloud import storage
from google.oauth2 import service_account
import json

# Initialize GCS client
gcp_credentials_info = os.getenv("GCP_SERVICE_ACCOUNT_CREDENTIALS")
if gcp_credentials_info:
    gcp_credentials_info = json.loads(gcp_credentials_info)
    gcp_service_account_credentials = service_account.Credentials.from_service_account_info(gcp_credentials_info)
    storage_client = storage.Client(credentials=gcp_service_account_credentials)
else:
    storage_client = storage.Client()

# Fetch all JSON files from the articoli_nt+fisco folder and extract URLs
bucket = storage_client.bucket("loomy-jobs")
# bucket = storage_client.bucket("loomy-public-documents")

In [6]:
# Delete all files whose filename starts with a capital letter
blobs = list(bucket.list_blobs(prefix="nt_fisco/"))
deleted_count = 0

for blob in tqdm(blobs, desc="Deleting files starting with capital letter"):
    filename = blob.name.split("/")[-1]
    if filename and filename[0].isupper():
        blob.delete()
        deleted_count += 1

print(f"Done. Deleted {deleted_count} files.")

Deleting files starting with capital letter: 100%|██████████| 3517/3517 [17:31<00:00,  3.35it/s]

Done. Deleted 3512 files.





In [None]:
# Rename files by removing the trailing ID (e.g., "_5b07a2f9d2dcae34.json" -> ".json")
pattern = re.compile(r'_[a-f0-9]{16}\.json$')

blobs = list(bucket.list_blobs(prefix="articoli_nt+fisco/"))
renamed_count = 0

for blob in tqdm(blobs, desc="Renaming files"):
    if blob.name.endswith(".json"):
        filename = blob.name.split("/")[-1]
        folder = blob.name.split("/")[1]
        if pattern.search(filename):
            new_filename = pattern.sub('.json', filename)
            new_blob_name = f"articoli_nt+fisco/{folder}/{new_filename}"
            
            # Copy to new name and delete old
            bucket.copy_blob(blob, bucket, new_blob_name)
            blob.delete()
            renamed_count += 1

print(f"Done. Renamed {renamed_count} files.")

Renaming files: 100%|██████████| 1768/1768 [08:46<00:00,  3.36it/s] 

Done. Renamed 1767 files.





In [28]:
blobs = bucket.list_blobs(prefix="articoli_nt+fisco/")

# Dict with filename as key and url as value
url_by_filename = {}
for blob in blobs:
    if blob.name.endswith(".json"):
        # Extract filename from the blob path (e.g., "articoli_nt+fisco/somefile.json" -> "somefile.json")
        filename = blob.name.split("/")[-1]
        folder = blob.name.split("/")[1]
        content = blob.download_as_text()
        data = json.loads(content)
        if "url" in data:
            url_by_filename[filename] = data["url"]

print(f"Found {len(url_by_filename)} URLs from GCS bucket")

Found 1760 URLs from GCS bucket


In [5]:
# Create .txt files from JSON files in nt_fisco folder
blobs = list(bucket.list_blobs(prefix="nt_fisco/"))
created_count = 0

for blob in tqdm(blobs, desc="Creating txt files"):
    if blob.name.endswith(".json"):
        content = blob.download_as_text()
        data = json.loads(content)
        
        title = data.get("title", "")
        preview = data.get("preview", "")
        
        # Create txt content
        txt_content = f"""{title}

{preview}
"""
        
        # Get filename and create new blob in txt folder
        filename = blob.name.split("/")[-1].replace(".json", ".txt")
        txt_blob_name = f"nt_fisco/txt/{filename}"
        txt_blob = bucket.blob(txt_blob_name)
        txt_blob.upload_from_string(txt_content, content_type="text/plain")
        created_count += 1

print(f"Done. Created {created_count} txt files.")

Creating txt files: 100%|██████████| 1762/1762 [05:26<00:00,  5.39it/s]

Done. Created 1761 txt files.





Job cost estimation

In [6]:
# Read all the txt files in the nt_fisco/txt folder and estimate the tokens
blobs = list(bucket.list_blobs(prefix="nt_fisco/txt/"))
total_tokens = 0
for blob in tqdm(blobs, desc="Estimating tokens"):
    if blob.name.endswith(".txt"):
        content = blob.download_as_text()
        # Simple token estimation: 1 token = 0.75 words
        estimated_tokens = int(len(content.split()) * 0.75)
        total_tokens += estimated_tokens
total_tokens

Estimating tokens: 100%|██████████| 1759/1759 [02:34<00:00, 11.41it/s]
Estimating tokens: 100%|██████████| 1759/1759 [02:34<00:00, 11.41it/s]


39084

In [7]:
# Nova micro eu-central-1
input_price = 0.000046
output_price = 0.000184
total_cost = (total_tokens / 1000) * input_price + (total_tokens / 1000) * output_price
total_cost

0.00898932

I need to embed before

In [None]:
page_token = None
updated_count = 0
skipped_count = 0

while True:
    # fetch a page
    res = index.query(
        vector=[0] * 1536,                 # dummy vector
        filter={"source": "nt_fisco"},
        top_k=500,                         # page size
        include_metadata=True,
        include_values=False,
        next_page_token=page_token
    )

    matches = res.get("matches", [])
    if not matches:
        break

    updates = []
    for match in matches:
        doc_name = match["metadata"].get("doc_name")
        if doc_name and doc_name in url_by_filename:
            updates.append({
                "id": match["id"],
                "metadata": {
                    **match["metadata"],
                    "storage_path": url_by_filename[doc_name]
                }
            })
            updated_count += 1
        else:
            skipped_count += 1

    # push updates to Pinecone
    if updates:
        index.upsert(vectors=updates)

    # move to next page
    page_token = res.get("next_page_token")
    if not page_token:
        break

print(f"Done. Updated: {updated_count} vectors. Skipped: {skipped_count} (no matching URL).")