In [None]:
!pip install pypdfium2
!pip install langchain-community langchain
!pip install qdrant-client fastembed
!pip install langchain-sambanova
#!pip install 'markitdown[pdf, docx]'

In [2]:
from markitdown import MarkItDown

In [19]:
md = MarkItDown(enable_plugins=False)

In [None]:
result = md.convert("/content/policy_framework_legal_counsel.pdf")

In [None]:
print(result.text_content)

In [None]:
doc2 = md.convert("/content/NEP_Final_English_0.pdf")

In [None]:
print(doc2.text_content)

## pypdfium

In [3]:
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyPDFium2Parser

In [8]:
loader = GenericLoader(
    blob_loader=FileSystemBlobLoader(
        path="data",
        glob="*.pdf",
    ),
    blob_parser=PyPDFium2Parser(),
)

In [None]:
docs = loader.load()

In [10]:
len(docs)

85

In [11]:
print(docs[2].metadata)

{'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2020-07-30T20:22:15+05:30', 'title': '', 'author': 'HP', 'subject': '', 'keywords': '', 'moddate': '2020-07-30T20:22:15+05:30', 'source': 'data/NEP_Final_English_0.pdf', 'total_pages': 66, 'page': 2}


## Custom Metadata

In [2]:
import os
from google.colab import userdata

from langchain_sambanova import ChatSambaNovaCloud
from langchain.schema import HumanMessage
import json
from pydantic import BaseModel, Field
from typing import List

In [14]:
os.environ['SAMBANOVA_API_KEY'] = userdata.get("SAMBANOVA_API_KEY")

In [15]:
summarizer_llm = ChatSambaNovaCloud(model="Meta-Llama-3.1-8B-Instruct",max_tokens=4096,temperature=0.0)

In [16]:
class FAQ(BaseModel):
    question: str = Field(description="A relevant question about the chunk content")
    answer: str = Field(description="A concise answer based on the chunk content")

class ChunkMetadata(BaseModel):
    faqs: List[FAQ] = Field(description="2-3 relevant FAQ pairs about the chunk content")
    keywords: List[str] = Field(description="5-7 key topics and keywords from the chunk")

In [17]:
def generate_document_summary(full_document_text: str, title: str) -> str:
    prompt = f"""You are expert document summarizer with domain expertise of the policy document and financial institutions
    Analyze this document and provide a concise 5-6 sentence summary focusing on the main purpose, key topics, and scope.
    Keep it diverse and no redundant information.
    Document Title: {title}
    Document Content: {full_document_text[:4000]}...

    No explaination, just generate the summary.
    Summary:
    """
    response = summarizer_llm.invoke([HumanMessage(content=prompt)])
    return response.content.strip()

In [18]:
def generate_chunk_metadata(chunk_text: str, document_title: str) -> dict:
    structured_llm = summarizer_llm.with_structured_output(ChunkMetadata)

    prompt = f"""Based on this document chunk, generate:
      1. 2-3 relevant FAQ pairs that users might ask about this specific content
      2. 5-7 key topics/keywords that represent the main concepts

      Document: {document_title}
      Chunk Content: {chunk_text}
      Focus on creating FAQs that would help users understand this specific section and keywords that capture the essential topics discussed."""

    try:
        result = structured_llm.invoke([HumanMessage(content=prompt)])
        return {
            "faqs": [{"question": faq.question, "answer": faq.answer} for faq in result.faqs],
            "keywords": result.keywords
        }
    except Exception as e:
        print(f"Error generating chunk metadata: {e}")
        return {"faqs": [], "keywords": []}

In [19]:
def custom_metadta(documents):
    document_summaries = {}

    for doc in documents:
        source = doc.metadata['source']
        title = doc.metadata.get('title', 'Unknown Document')

        if source not in document_summaries:
            full_text = ' '.join([d.page_content for d in documents if d.metadata['source'] == source])
            document_summaries[source] = generate_document_summary(full_text, title)

        doc.metadata['document_summary'] = document_summaries[source]

        chunk_metadata = generate_chunk_metadata(doc.page_content, title)
        doc.metadata['chunk_faqs'] = chunk_metadata['faqs']
        doc.metadata['chunk_keywords'] = chunk_metadata['keywords']
        doc.metadata['chunk_id'] = f"{source.split('/')[-1].split('.')[0]}_chunk_{hash(doc.page_content) % 10000}"
        doc.metadata['chunk_length'] = len(doc.page_content)

    return documents

In [None]:
chunks = custom_metadta(docs)

In [None]:
chunks[2].metadata

In [2]:
import pickle

def save_enriched_documents(documents, filepath="enriched_documents.pkl"):
    """Save enriched documents to pickle file"""
    with open(filepath, 'wb') as f:
        pickle.dump(documents, f)
    print(f"Saved {len(documents)} enriched documents to {filepath}")

def load_enriched_documents(filepath="enriched_documents.pkl"):
    """Load enriched documents from pickle file"""
    with open(filepath, 'rb') as f:
        documents = pickle.load(f)
    print(f"Loaded {len(documents)} enriched documents from {filepath}")
    return documents

In [23]:
save_enriched_documents(chunks, "/content/enriched_documents.pkl")

Saved 85 enriched documents to /content/enriched_documents.pkl


In [4]:
chunks = load_enriched_documents("/content/enriched_documents.pkl")

Loaded 85 enriched documents from /content/enriched_documents.pkl


In [3]:
from qdrant_client import QdrantClient,models
from qdrant_client.models import (
    VectorParams, Distance, SparseVectorParams, Modifier,
    BinaryQuantization, BinaryQuantizationConfig,PointStruct
)
from fastembed import TextEmbedding, SparseTextEmbedding
import uuid,gc

In [4]:
client = QdrantClient(
    url = userdata.get("QDRANT_URL"),
    api_key = userdata.get("QDRANT_API_KEY"),
)

In [6]:
collection_name = "casestudy"

In [None]:
dense_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
sparse_model = SparseTextEmbedding(model_name="Qdrant/BM25")

In [24]:
client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "dense": VectorParams(
            size=512,
            distance=Distance.COSINE,
            on_disk=True
        ),
    },
    sparse_vectors_config={
        "sparse": SparseVectorParams(
            modifier=Modifier.IDF,
        ),
    },
    quantization_config=BinaryQuantization(
        binary=BinaryQuantizationConfig(always_ram=False)
    )
)

True

In [27]:
def index_documents(data, batch_size=32):
    total_docs = len(data)
    successful_docs = 0

    for i, doc in enumerate(data):
        try:
            # Create searchable text
            faq_text = " ".join([f"Q: {faq['question']} A: {faq['answer']}"
                                for faq in doc.metadata.get('chunk_faqs', [])])
            keywords_text = " ".join(doc.metadata.get('chunk_keywords', []))
            searchable_text = f"{doc.page_content} {faq_text} {keywords_text}".strip()

            dense_embedding = next(dense_model.embed([searchable_text]))
            sparse_embedding = next(sparse_model.embed([searchable_text]))

            point = PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense": dense_embedding,
                    "sparse": sparse_embedding.as_object(),
                },
                payload={
                    "content": doc.page_content,
                    "source": doc.metadata['source'],
                    "page": doc.metadata.get('page', 0),
                    "document_summary": doc.metadata.get('document_summary', ''),
                    "chunk_faqs": doc.metadata.get('chunk_faqs', []),
                    "chunk_keywords": doc.metadata.get('chunk_keywords', []),
                    "chunk_id": doc.metadata.get('chunk_id', ''),
                }
            )

            client.upsert(
                collection_name=collection_name,
                points=[point],
                wait=True
            )

            successful_docs += 1

            del dense_embedding, sparse_embedding, point, searchable_text

            if i % 10 == 0:
                gc.collect()

        except Exception as e:
            gc.collect()
            continue

In [None]:
index_documents(chunks)

## Inference

In [7]:
query = "National Education Policy 2020"
k = 5

In [8]:
dense_vectors = next(dense_model.embed([query]))
sparse_vectors = next(sparse_model.embed([query]))

In [9]:
prefetch = [
    models.Prefetch(
        query=dense_vectors,
        using="dense",
        limit=10,
    ),
    models.Prefetch(
        query=models.SparseVector(**sparse_vectors.as_object()),
        using="sparse",
        limit=10,
    )]

In [10]:
results = client.query_points(
        collection_name,
        prefetch=prefetch,
        query=dense_vectors,
        using="dense",
        with_payload=True,
        limit=k,
)

In [11]:
len(results.points)

5

In [12]:
context = []

In [13]:
for result in results.points:
      context.append({
          'id': result.id,
          'content': result.payload.get('content', ''),
          'source': result.payload.get('source', ''),
          'page': result.payload.get('page', 0),
          'document_summary': result.payload.get('document_summary', ''),
          'chunk_faqs': result.payload.get('chunk_faqs', []),
          'chunk_keywords': result.payload.get('chunk_keywords', []),
          'chunk_id': result.payload.get('chunk_id', ''),
      })

In [14]:
context[0]['source']

'data/NEP_Final_English_0.pdf'

## Filter conditions

In [15]:
client.create_payload_index(
    collection_name=collection_name,
    field_name="source",
    field_schema=models.PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=90, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
client.create_payload_index(
    collection_name=collection_name,
    field_name="chunk_keywords",
    field_schema=models.PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=92, status=<UpdateStatus.COMPLETED: 'completed'>)

In [17]:
filter_condition = models.Filter(
    must=[
        models.FieldCondition(
            key="chunk_keywords",
            match=models.MatchValue(value="Human rights")
        )
    ]
)

In [18]:
results = client.query_points(
    collection_name,
    prefetch=prefetch,
    query=dense_vectors,
    using="dense",
    query_filter=filter_condition,
    with_payload=True,
    limit=k,
)

In [19]:
results

QueryResponse(points=[ScoredPoint(id='89062e49-db2e-4d1c-bef4-2e904a5c6077', version=6, score=0.8891621, payload={'content': 'National Education Policy 2020\n6\n• a rootedness and pride in India, and its rich, diverse, ancient and modern culture and\nknowledge systems and traditions;\n• education is a public service; access to quality education must be considered a basic right of\nevery child;\n• substantial investment in a strong, vibrant public education system as well as the\nencouragement and facilitation of true philanthropic private and community participation.\nThe Vision of this Policy\nThis National Education Policy envisions an education system rooted in Indian ethos that contributes\ndirectly to transforming India, that is Bharat, sustainably into an equitable and vibrant knowledge\nsociety, by providing high-quality education to all, and thereby making India a global knowledge\nsuperpower. The Policy envisages that the curriculum and pedagogy of our institutions must\ndevel