# Pinecone

In [1]:
# Indexing
import json
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import uuid
import os
import uuid
import humps
from datetime import datetime
from typing import Dict, List
from pydantic import BaseModel
from scripts.utils import get_firestore_client
from dotenv import load_dotenv
# Load environment variables from .env.development; PINECONE_API_KEY is read
# from the environment later in this notebook via os.getenv.
load_dotenv(".env.development")

  from tqdm.autonotebook import tqdm


True

In [2]:
# Read the exported report chunks. Later cells iterate `data` and expect each
# entry to carry 'page_content', 'metadata' and 'id' keys.
path = "data/asii_2023_sustainability_report_output (1).json"
with open(path, mode="r", encoding="utf-8") as report_file:
    data = json.loads(report_file.read())

In [3]:
# Build three parallel structures from the loaded records:
#   docs          - LangChain Document objects to embed
#   ids           - the record ids, in the same order as docs
#   disclosed_gri - GRI indicator code -> list of record ids that disclose it
docs, ids = [], []
disclosed_gri = {}
for record in data:
    docs.append(
        Document(
            page_content=record['page_content'],
            metadata=record['metadata'],
        )
    )
    record_id = record['id']
    ids.append(record_id)

    # setdefault creates the list the first time an indicator is seen.
    for gri_code in record['metadata']['disclosed_gri']:
        disclosed_gri.setdefault(gri_code, []).append(record_id)

In [6]:
# Embed every document with OpenAI's text-embedding-3-large model and write the
# vectors into the existing "single-sr-test" Pinecone index.
# NOTE(review): presumably the index dimension matches this embedding model - confirm.
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("single-sr-test")
vector_store = PineconeVectorStore(index, embedding_function)

# Explicit ids keep the vector ids aligned with the ids recorded in
# disclosed_gri; the call's return value (the list of ids) is the cell output.
vector_store.add_documents(docs, ids=ids)

['04d53cd1-2143-4588-abc9-8725fac9ae12',
 '3631ab5f-af30-4559-a9d0-82f7c8b7bff6',
 'c4e0029d-03e4-493c-a6f5-a2ddddbdc928',
 '83e782ba-d3cf-4819-baab-676350543d86',
 '4a5da069-2022-485c-ad46-aad71cabe209',
 'c2282f53-e5c3-4e80-820c-3e3492107838',
 '8ae80b96-eb13-48fb-9a90-d39437a9b5b0',
 '2f318141-1573-4944-ac9f-eee509947852',
 '5aa3010c-dbf6-4179-a23e-511abc71bb5d',
 '7d465fa9-9d16-402c-bed7-391f965382a5',
 'a14e19b0-a65c-4e7e-a9e8-adea79642308',
 'e8e06b0d-2eeb-4b6c-a79c-3c98ac0ed853',
 'bc051374-4444-4a09-be4f-7771a8a62ff6',
 'ac18dea9-e941-4e66-94ec-53f14209db9b',
 '64c3f225-b791-4d28-8009-c30eb1d03e0b',
 '2677ee20-a569-48b3-8352-f37039915253',
 '18383b8b-ce55-4745-a5be-381581a68e53',
 'e529c9f6-3009-44e7-b7e4-9196b60e7382',
 'dea25bbf-bd98-4fa8-a929-10ee7ae40079',
 '736ddae3-dd88-4ea6-ab7c-8fc9665b6106',
 'e41e5fa6-a224-4513-8d9a-47a229887d6a',
 '6124bc12-b1ba-48ac-a986-dfb9c83f58bb',
 '8096edd2-71eb-485f-aed9-187081296395',
 '424cda46-3b4d-4cb5-b072-37fa15ae116b',
 '849329b3-70a1-

In [7]:
# Initialize the Firestore client
fsclient = get_firestore_client()

# Define the model for a Sustainability Report document
class SRDocument(BaseModel):
    """Metadata record for one company's sustainability report.

    In this notebook, instances are serialized with ``model_dump()`` and
    written to the ``sustainabilityReports`` Firestore collection.
    """
    company: str  # company name
    year: int  # report year
    ticker: str  # stock ticker symbol
    # GRI indicator code -> ids of the document chunks that disclose it, as
    # built earlier in this notebook. The bare Dict annotation does not
    # constrain key or value types.
    disclosed_gri: Dict
    # NOTE(review): the example record passes an empty string here - confirm
    # what URL is expected.
    url: str
    cid: str | None = None  # used as the Firestore document id when saving
    created_at: datetime | None = None  # filled with the current time on save when None
    updated_at: datetime | None = None  # filled with the current time on save when None

# Function to create a Sustainability Report document
def create_sr_doc(item: SRDocument):
    """Save a sustainability report record to Firestore and return it.

    Missing bookkeeping fields are filled in on ``item`` before saving:
    ``cid`` gets a fresh UUID4 string, and ``created_at`` / ``updated_at`` get
    the current local time. Values the caller already set are left untouched.

    Args:
        item: The record to store. It is mutated in place.

    Returns:
        The same ``item``, with ``cid``, ``created_at`` and ``updated_at`` set.
    """
    # Generate a CID only if the caller did not provide one.
    if item.cid is None:
        item.cid = str(uuid.uuid4())

    # Read the clock once so a brand-new record has created_at == updated_at.
    now = datetime.now()
    if item.created_at is None:
        item.created_at = now
    if item.updated_at is None:
        item.updated_at = now

    # Firestore field names are stored in camelCase.
    payload = humps.camelize(item.model_dump())
    # camelize also rewrites the keys of nested mappings, which would alter the
    # GRI indicator codes (e.g. "2-22"); put the original mapping back verbatim.
    payload["disclosedGri"] = item.disclosed_gri

    # Reference into the Firestore collection, keyed by the record's CID.
    doc_ref = fsclient.collection("sustainabilityReports").document(item.cid)

    # Save the data to Firestore.
    doc_ref.set(payload)

    return item

In [8]:
# Build the report record for Astra International and store it in Firestore.
item = SRDocument(
    company="PT Astra International Tbk",
    # NOTE(review): the input file name mentions 2023 - confirm the report year.
    year=2022,
    ticker="ASII",
    disclosed_gri=disclosed_gri,
    url="",
)

# Keep the untouched GRI mapping so it can be restored after camelize below.
_disclosed_gri = item.disclosed_gri

# Assign a fresh document id to this new record.
item.cid = str(uuid.uuid4())

# Read the clock once so the new record has created_at == updated_at.
now = datetime.now()
if item.created_at is None:
    item.created_at = now
if item.updated_at is None:
    item.updated_at = now

# camelCase the field names for Firestore, then restore the GRI mapping
# verbatim because camelize would also rewrite its indicator keys.
item_dump = humps.camelize(item.model_dump())
item_dump['disclosedGri'] = _disclosed_gri

# Reference into the Firestore collection, keyed by the record's CID.
doc_ref = fsclient.collection("sustainabilityReports").document(item.cid)

# Save the data to Firestore.
doc_ref.set(item_dump)

item

SRDocument(company='PT Astra International Tbk', year=2022, ticker='ASII', disclosed_gri={'2-22': ['2f318141-1573-4944-ac9f-eee509947852', 'ac18dea9-e941-4e66-94ec-53f14209db9b', 'cd59191c-7ba3-491e-860f-89121296d0c1'], '2-6': ['619cc14e-140c-4948-8948-29070f634fde', 'c944da77-d69f-4203-ae33-a7ec49a1c53e', '8eb7ede6-6662-47b4-899c-92eb5e950194', 'a2a06528-6c1c-43cf-857b-6f0a90dc6cae', '450c694b-2070-477a-bcf2-05c94e1b4e31'], '2-1': ['619cc14e-140c-4948-8948-29070f634fde', 'c944da77-d69f-4203-ae33-a7ec49a1c53e', '450c694b-2070-477a-bcf2-05c94e1b4e31'], '2-7': ['c944da77-d69f-4203-ae33-a7ec49a1c53e', 'b0a1ffbc-da16-4bb1-9b40-6490ae8034ff', '450c694b-2070-477a-bcf2-05c94e1b4e31'], '2-28': ['0486d227-1eb5-479c-9a0a-3991888a8dfa', '6cc0fe5a-c99e-462b-b69e-ed0d972410c9'], '2-24': ['9f38d6d7-f3b4-407c-8fc8-81555014c56c', '349aaa09-52aa-48d1-a825-b0a43fa4cf8e', 'e0c2f69d-dfc8-45d9-aeda-38c30ab5a828', 'b24ab942-56ff-48c3-83e9-58798ddc5958', '008cb1df-0bd6-4a86-bed1-9b85f2cac67a', '245f464d-9395

In [1]:
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
import os
from dotenv import load_dotenv
# Load environment variables; no path is given, so python-dotenv looks for its
# default .env file. HUGGINGFACEHUB_API_TOKEN is read later via os.getenv.
load_dotenv()

True

In [32]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings, HuggingFaceEmbeddings

# Embedding client backed by the hosted HuggingFace Inference API, using the
# ClimateBERT checkpoint. The token comes from the environment.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="climatebert/distilroberta-base-climate-f",
    api_key=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
)
# Alternative kept for reference (uses the HuggingFaceEmbeddings class instead):
# embeddings = HuggingFaceEmbeddings(model_name="climatebert/distilroberta-base-climate-f")

In [7]:
# Embed a short Indonesian query ("pada tahun 2023" = "in the year 2023");
# the shape of the result is inspected in the following cells.
embed = embeddings.embed_query("pada tahun 2023")

In [26]:
import numpy as np

# Convert the raw embed_query result into an array to inspect its dimensions.
embed_array = np.array(embed)
print(np.shape(embed_array))

(1, 143, 768)


In [27]:
# Average over the two leading axes, leaving a single vector along the last axis.
# NOTE(review): presumably axis 1 is the token axis (mean pooling) - confirm
# against the endpoint's output format.
average_embedding = embed_array.mean(axis=(0, 1))
print(average_embedding.shape)

(768,)
