# Pinecone

In [1]:
# Indexing
import json
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import uuid
import os
import uuid
import humps
from datetime import datetime
from typing import Dict, List
from pydantic import BaseModel
from scripts.utils import get_firestore_client
from dotenv import load_dotenv
# Load environment variables from .env.development before any client is built;
# PINECONE_API_KEY is read from the environment when the Pinecone client is created.
load_dotenv(".env.development")

  from tqdm.autonotebook import tqdm


True

In [2]:
# Read the exported report entries. Each entry is expected to carry the
# 'id', 'page_content' and 'metadata' keys that the following cell reads.
path = "data/asii_2023_sustainability_report_output.json"
with open(path, mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

In [3]:
# One LangChain Document per report entry, with a parallel list of entry IDs.
docs = [
    Document(page_content=entry['page_content'], metadata=entry['metadata'])
    for entry in data
]
ids = [entry['id'] for entry in data]

# Invert the per-entry indicator lists: GRI indicator -> IDs of the entries
# that disclose it, in the order the entries appear in `data`.
disclosed_gri = {}
for doc in data:
    for indicator in doc['metadata']['disclosed_gri']:
        disclosed_gri.setdefault(indicator, []).append(doc['id'])

In [5]:
# Inspect the indicator -> document-ID mapping built from the report entries.
disclosed_gri

{'2-22': ['e6070f87-0545-4553-8afa-16a13022f6d3']}

In [6]:
# Embedding model used to vectorise the documents.
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

# Connect to the "single-sr-test" Pinecone index; the API key comes from the
# PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index("single-sr-test")
vector_store = PineconeVectorStore(index, embedding_function)

# Add the documents under their original IDs. This stays the last expression
# so the returned list of IDs is shown as the cell output.
vector_store.add_documents(documents=docs, ids=ids)

['873e45d6-de45-45f0-a840-9508ed9fc451',
 '996902e6-7b30-437c-8cff-6e09b5990fc5',
 '57cada26-9494-4737-ba8e-c93b988d0aa7',
 '40daa66e-a1b1-4654-acd1-bc85abd66388',
 '1a31516f-8875-45b7-a9ae-6cab6cb378a3',
 'cf53443e-8ea5-4d16-b2ca-861c25f24b4a',
 '2f14a853-6824-4ef2-a9a4-e13def71cd21',
 'e6070f87-0545-4553-8afa-16a13022f6d3',
 '17086faa-132e-4534-8021-ce3ac4de9ece',
 'c6a390a8-240e-43cf-aca7-e4d29a00352c']

In [21]:
# Initialise the Firestore client
fsclient = get_firestore_client()

# Model for a Sustainability Report document
class SRDocument(BaseModel):
    """Pydantic model describing one sustainability report record.

    ``company``, ``year``, ``ticker``, ``disclosed_gri`` and ``url`` are
    required; ``cid`` and both timestamps default to ``None`` and are filled
    in right before the record is written to Firestore.
    """
    company: str
    year: int
    ticker: str
    # In this notebook: GRI indicator code -> list of document IDs disclosing it
    disclosed_gri: Dict
    url: str
    # Record identifier; set to a UUID4 string and used as the Firestore document ID
    cid: str | None = None
    created_at: datetime | None = None
    updated_at: datetime | None = None

# Create and persist a Sustainability Report document
def create_sr_doc(item: SRDocument):
    """Store a sustainability report in the ``sustainabilityReports`` collection.

    Missing bookkeeping fields are filled in on ``item`` itself: ``cid`` gets a
    fresh UUID4 string when it is ``None``, and ``created_at`` / ``updated_at``
    get the same current timestamp when they are ``None``. The model is then
    dumped, its field names converted to camelCase, and written to Firestore
    under the document ID ``item.cid``.

    Args:
        item: The report to store. It is mutated in place.

    Returns:
        The same ``item`` instance, with ``cid`` and the timestamps populated.
    """
    # Generate a CID only when the caller did not supply one
    if item.cid is None:
        item.cid = str(uuid.uuid4())

    # Read the clock once so a brand-new record has created_at == updated_at
    now = datetime.now()
    if item.created_at is None:
        item.created_at = now
    if item.updated_at is None:
        item.updated_at = now

    payload = humps.camelize(item.model_dump())
    # camelize also rewrites the keys of nested dicts, which would corrupt the
    # GRI indicator codes (e.g. '2-22' loses its hyphen). Put the untouched
    # mapping back under the camelCased field name.
    payload["disclosedGri"] = item.disclosed_gri

    # Firestore document reference keyed by the record's CID
    doc_ref = fsclient.collection("sustainabilityReports").document(item.cid)

    # Write the record to Firestore
    doc_ref.set(payload)

    return item

# Report record for PT Astra International Tbk (ASII), carrying the GRI
# indicator mapping built earlier in the notebook.
# NOTE(review): year is 2022 while the loaded data file is named
# "asii_2023_sustainability_report_output.json" — confirm which year is intended.
sr_doc = SRDocument(
    company="PT Astra International Tbk",
    year=2022,
    ticker="ASII",
    disclosed_gri=disclosed_gri,
    url="",
)

In [23]:
# Report record to store in Firestore.
# NOTE(review): year is 2022 while the loaded data file is named
# "asii_2023_sustainability_report_output.json" — confirm which year is intended.
item = SRDocument(
    company="PT Astra International Tbk",
    year=2022,
    ticker="ASII",
    disclosed_gri=disclosed_gri,
    url=""
)

# Keep the original GRI mapping: humps.camelize rewrites the keys of nested
# dicts too, so the camelized dump cannot be trusted for this field.
_disclosed_gri = item.disclosed_gri

# Assign a fresh document ID
item.cid = str(uuid.uuid4())

# Stamp both timestamps from a single clock read so that a newly created
# record always has created_at == updated_at. The item was built just above
# with both fields left at their None default, so no guard is needed.
_now = datetime.now()
item.created_at = _now
item.updated_at = _now

item_dump = humps.camelize(item.model_dump())
# Restore the GRI mapping with its original indicator codes as keys
item_dump['disclosedGri'] = _disclosed_gri

# Firestore document reference keyed by the record's CID
doc_ref = fsclient.collection("sustainabilityReports").document(item.cid)

# Write the record to Firestore
doc_ref.set(item_dump)

item

SRDocument(company='PT Astra International Tbk', year=2022, ticker='ASII', disclosed_gri={'2-22': ['e6070f87-0545-4553-8afa-16a13022f6d3']}, url='', cid='811ccd63-6899-4fbf-99b1-90b7887a9c1d', created_at=datetime.datetime(2025, 1, 9, 20, 0, 35, 214302), updated_at=datetime.datetime(2025, 1, 9, 20, 0, 35, 214302))

# Firestore