In [1]:
import json
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
import os
import uuid
import glob
import sys
import time
from dotenv import load_dotenv
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
class VectorDB:
    """Wrapper that loads page-level JSON documents into a Pinecone-backed vector store.

    Configuration is read from environment variables:
    ``PINECONE_INDEX``, ``PINECONE_API_KEY``, ``EMBEDDINGS`` (embedding model
    name) and ``OPENAI_API_KEY``.
    """

    # Page contents whose UTF-8 encoding exceeds this many bytes are split
    # into several documents before upload.
    # NOTE(review): presumably chosen to match Pinecone's per-record metadata
    # limit (the page text is stored alongside the vector) — confirm.
    _PARTITION_THRESHOLD_BYTES = 40960

    def __init__(self):
        """Connect to the Pinecone index and build the LangChain vector store.

        Raises:
            KeyError: if the ``EMBEDDINGS`` environment variable is not set.
        """
        self.index_name = os.getenv("PINECONE_INDEX")
        self.index = Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index(self.index_name)
        self.embedding = OpenAIEmbeddings(model=os.environ["EMBEDDINGS"], api_key=os.getenv("OPENAI_API_KEY"))
        self.vector_store = PineconeVectorStore(index=self.index, embedding=self.embedding)

    def partition_text(self, text, max_size=40000):
        """Split ``text`` into consecutive parts of at most ``max_size`` UTF-8 bytes.

        The size is measured on the UTF-8 encoding of the text (what is
        actually transmitted and stored), not on the in-memory size of the
        Python string object. Cuts are only made on character boundaries, and
        concatenating the returned parts always reproduces ``text`` exactly.

        Args:
            text: The string to split.
            max_size: Maximum encoded size of each part, in bytes. Must be at
                least 4 so that any single UTF-8 character fits in a part.

        Returns:
            A list of strings. Text that already fits (including the empty
            string) is returned as a single-element list.

        Raises:
            ValueError: if ``max_size`` is smaller than 4.
        """
        if max_size < 4:
            raise ValueError("max_size must be at least 4 bytes (the longest UTF-8 character)")

        # "surrogatepass" keeps the round trip lossless even if the JSON
        # contained lone surrogate escapes.
        encoded = text.encode("utf-8", "surrogatepass")
        total = len(encoded)
        if total <= max_size:
            return [text]

        parts = []
        start = 0
        while start < total:
            end = min(start + max_size, total)
            # UTF-8 continuation bytes look like 0b10xxxxxx. If the cut would
            # land on one, back up (at most 3 bytes) to the start of that
            # character so it moves intact into the next part.
            while end < total and (encoded[end] & 0xC0) == 0x80:
                end -= 1
            parts.append(encoded[start:end].decode("utf-8", "surrogatepass"))
            start = end
        return parts

    def add_items(self, json_path: str, batch_size: int = 100):
        """Load the documents in ``json_path`` and add them to the vector store in batches.

        Args:
            json_path: Path to a JSON file in the format expected by ``_load_docs``.
            batch_size: Number of documents sent per ``add_documents`` call.

        Returns:
            The list of generated document ids, in upload order.

        Raises:
            ValueError: if ``batch_size`` is not a positive integer.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        docs, ids = self._load_docs(json_path)

        for i in range(0, len(docs), batch_size):
            batch_docs = docs[i:i + batch_size]
            batch_ids = ids[i:i + batch_size]

            # Add the batch to the vector store
            self.vector_store.add_documents(documents=batch_docs, ids=batch_ids)
        return ids

    def _load_docs(self, json_path: str):
        """Read ``json_path`` and build ``Document`` objects with fresh UUID ids.

        The file is expected to contain a list of objects, each with a
        ``"page_content"`` string and a ``"metadata"`` object holding a
        ``"page"`` entry. The file name without its extension is stored as the
        ``"company"`` metadata value.

        Returns:
            A ``(docs, ids)`` tuple of equal-length lists.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        basename = os.path.splitext(os.path.basename(json_path))[0]
        docs, ids = [], []

        for doc in data:
            page_content = doc["page_content"]
            doc_metadata = {
                "page": doc["metadata"]["page"],
                "company": basename
            }

            # Partition oversized pages, judged by encoded (UTF-8) size.
            content_size = len(page_content.encode("utf-8", "surrogatepass"))
            if content_size > self._PARTITION_THRESHOLD_BYTES:
                parts = self.partition_text(page_content)
            else:
                parts = [page_content]

            for part in parts:
                doc_id = str(uuid.uuid4())
                # Give each part its own metadata dict so that later mutation
                # of one document cannot leak into its siblings.
                docs.append(Document(page_content=part, metadata=dict(doc_metadata)))
                ids.append(doc_id)

        return docs, ids

In [3]:
# ADJUSTABLE: glob pattern selecting the JSON files to ingest.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# normalised paths are sorted to make the processing order reproducible.
files = sorted(os.path.normpath(f) for f in glob.glob("data/JSON/*/*.json"))
len(files)

1

In [4]:
# Ingest every discovered JSON file into the vector store, timing each one.
vectordb = VectorDB()

start_total = time.time()
for idx, file_path in enumerate(files, start=1):
    start_time = time.time()
    file_name = os.path.basename(file_path)

    print(f"[{idx}/{len(files)}] Processing: {file_name}")

    # Load the file's documents and add them to the vector store
    vectordb.add_items(file_path)

    # Report how long this file took
    elapsed = time.time() - start_time
    print(f"✅ Selesai dalam {elapsed:.2f} detik\n")

[1/1] Processing: ID_ADRO_AR_2022.json
✅ Selesai dalam 53.68 detik

