In [2]:
import json
from typing import List

# unstructured
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# langchain
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from pinecone import ServerlessSpec

import os

# load .env
from dotenv import load_dotenv

# all initializations ----

# env
load_dotenv()


# api keys
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# embedding
embeddings=OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY
    )

# pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name="universitydb"


In [5]:
def partition_document(file_path: str):
    """Extract elements from PDF using unstructured"""
    print(f"Partitioning document : {file_path}")

    elements = partition_pdf(
        filename=file_path,
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True
    )

    print(f"Extract {len(elements)} elements")
    return elements

In [6]:
def create_chucks_by_title(elements):
    """Create intelligent chunks using title-based strategy"""
    print(" creating smart chunks...")

    chunks = chunk_by_title(
        elements,
        max_characters=3000,
        new_after_n_chars=2400,
        combine_text_under_n_chars=500
    )

    print(f"created {len(chunks)} chunks")
    return chunks

In [7]:
def text_extract(chunks):
    docs=[]
    for chunk in chunks :
        docs.append(
            Document(
                page_content=chunk.text,
                metadata={
                    "source":chunk.metadata.filename,
                    "original_docs":chunk.metadata.orig_elements,
                    "pages":chunk.metadata.page_number
                }
            )
        )
    
    return docs

In [None]:

def bootstrap_index():
    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            vector_type="dense",
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(2)

def ingest_documents(documents):
    index = pc.Index(index_name)
    vector_store=PineconeVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        index_name=index_name
    )
    return vector_store

In [12]:

file_path="./docs/EJ1172284.pdf"
elements= partition_document(file_path)

# chunking by titile
chunks=create_chucks_by_title(elements)

# extract text and meta data 
langchain_document=text_extract(chunks)

Partitioning document : ./docs/EJ1172284.pdf
Extract 168 elements
 creating smart chunks...
created 20 chunks


In [22]:
# vector and retrival 

# if pc.has_index(index_name):
#     vec = vectorise(langchain_document)
#     index = vec[0]
#     vector_store = vec[1]
# else:
#     index = pc.Index(index_name)
#     vector_store = PineconeVectorStore(embedding=embeddings, index=index)


bootstrap_index()

vector_store=get_vector_store()

ingest_documents(vector_store, langchain_document)



query = "tell me about  Discussion and conclusions "
retriever = vector_store.as_retriever(search_kwargs={"k":3})
retrieved_chunks=retriever.invoke(query)

PineconeApiValueError: Unable to prepare type Header for serialization

In [16]:
retrieved_chunks

[]