In [1]:
from langchain_docling import DoclingLoader
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
from typing import List
import logging

# Configure root logging once (INFO and above, timestamped), then obtain a
# logger named after this module for use by the functions below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration constants

# Text-splitting parameters; handed to the splitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

# Name of the Pinecone index the chunks are written to.
INDEX_NAME = "islamifier"

# Path of the PDF that gets loaded and split.
FILE_PATH = "/content/Tafsir Ibnu Katsir Jilid 1-1.pdf"

# API key looked up through google.colab's userdata helper.
PINECONE_API_KEY = userdata.get("PINECONE_API_KEY")

# Pinecone client setup
def initialize_pinecone(api_key: str) -> Pinecone:
    """Create a Pinecone client for the given API key.

    Args:
        api_key: Key forwarded to the ``Pinecone`` constructor.

    Returns:
        The constructed ``Pinecone`` client.

    Raises:
        Exception: Whatever the constructor raises; it is logged at ERROR
            level and then re-raised unchanged.
    """
    try:
        client = Pinecone(api_key=api_key)
    except Exception as exc:
        logger.error(f"Gagal menginisialisasi Pinecone: {exc}")
        raise
    else:
        return client

# Embedding model setup
def initialize_embeddings(
    model_name: str = "nomic-ai/nomic-embed-text-v2-moe",
    device: str = "cuda",
) -> HuggingFaceEmbeddings:
    """Build the HuggingFace embedding model used to vectorise chunks.

    Args:
        model_name: Hugging Face model identifier. Defaults to the model this
            script was written for.
        device: Device string placed in ``model_kwargs`` (e.g. ``"cuda"`` or
            ``"cpu"``). Defaults to ``"cuda"``; pass ``"cpu"`` on a runtime
            without a GPU.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured to normalise its
        output embeddings.

    Raises:
        Exception: Whatever the constructor raises; it is logged at ERROR
            level and then re-raised unchanged.
    """
    try:
        return HuggingFaceEmbeddings(
            model_name=model_name,
            # NOTE: trust_remote_code lets code shipped in the model
            # repository run locally; only use it with models you trust.
            model_kwargs={"device": device, "trust_remote_code": True},
            encode_kwargs={"normalize_embeddings": True},
        )
    except Exception as e:
        logger.error(f"Gagal menginisialisasi embeddings: {e}")
        raise

# Document loading and chunking
def load_and_split_documents(file_path: str, chunk_size: int, chunk_overlap: int) -> List:
    """Load a file with ``DoclingLoader`` and split it into chunks.

    Args:
        file_path: Path handed to ``DoclingLoader``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``.

    Raises:
        Exception: Any error from loading or splitting; it is logged at ERROR
            level and then re-raised unchanged.
    """
    try:
        raw_docs = DoclingLoader(file_path=file_path).load()
        chunker = RecursiveCharacterTextSplitter(
            length_function=len,
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )
        chunks = chunker.split_documents(raw_docs)
    except Exception as exc:
        logger.error(f"Gagal memuat atau membagi dokumen: {exc}")
        raise
    return chunks

# Persisting chunks to the vector store
def store_documents(pc: Pinecone, documents: List, embeddings: HuggingFaceEmbeddings, index_name: str) -> None:
    """Embed ``documents`` and upsert them into a Pinecone index.

    Args:
        pc: Pinecone client used to obtain the index handle.
        documents: Chunks to store.
        embeddings: Embedding model the vector store uses to vectorise chunks.
        index_name: Name of the target Pinecone index.

    Raises:
        Exception: Any error from resolving the index or adding documents; it
            is logged at ERROR level and then re-raised unchanged.
    """
    try:
        # PineconeVectorStore's first parameter is an *index* handle, not the
        # client itself, so resolve the index from the client before building
        # the store. Passing the client positionally would make the store
        # treat it as the index.
        index = pc.Index(index_name)
        vector_store = PineconeVectorStore(index=index, embedding=embeddings)
        vector_store.add_documents(documents=documents)
        logger.info("Dokumen berhasil disimpan ke vector store!")
    except Exception as e:
        logger.error(f"Gagal menyimpan dokumen ke vector store: {e}")
        raise

# Main execution
def main():
    """Run the full ingestion pipeline: connect, embed, split, store.

    Uses the module-level constants for the API key, file path, index name
    and chunking parameters. Any failure is logged together with its
    traceback and is not re-raised, so this function always returns ``None``.
    """
    try:
        # Set up the external components.
        pc = initialize_pinecone(PINECONE_API_KEY)
        embeddings = initialize_embeddings()

        # Load the source file and split it into chunks.
        docs_splitted = load_and_split_documents(FILE_PATH, CHUNK_SIZE, CHUNK_OVERLAP)
        logger.info(f"Jumlah dokumen setelah split: {len(docs_splitted)}")

        # Write the chunks to the vector store.
        store_documents(pc, docs_splitted, embeddings, INDEX_NAME)

    except Exception as e:
        # logger.exception logs at ERROR level like before but also records
        # the traceback, which would otherwise be lost because the exception
        # is swallowed here.
        logger.exception(f"Terjadi kesalahan dalam proses utama: {e}")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject