In [119]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [120]:
from langchain.document_loaders import PDFPlumberLoader

# loader = PDFPlumberLoader("../docs/Multimodal Retrieval.pdf")
loader = PDFPlumberLoader("../docs/Multimodal Retrieval.pdf")
docs = loader.load()

In [121]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

chunks = splitter.split_documents(docs)

In [122]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

In [123]:
from openai import OpenAI
import os
key = os.environ["OPENAI_API_KEY"]

client = OpenAI()

In [124]:
import numpy as np
def get_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large"
    )
    embeddings = np.array(response.data[0].embedding,dtype=np.float32)
    return embeddings

In [125]:
import logfire
import faiss
from typing import List
index = faiss.IndexFlatIP(3072)
documents:List[dict] = []
print(f"Split the document into {len(chunks)} chunks")
embedddings_list =[]
for i,chunk in enumerate(chunks):
        response = get_embeddings(chunk.page_content)
        documents.append({
            "text":chunk,
            "filename":"basic-text.pdf",
            "chunk_index":i,
                }
        )
        embedddings_list.append(response)
    
embedddings_matrix = np.vstack(embedddings_list).astype(np.float32)
index.add(embedddings_matrix)



Split the document into 6 chunks


In [126]:

def _retrive(question:str,top_k:int=3)->List[dict]:
    if index.ntotal==0:
      return []
    with logfire.span("retrival of the chunks ",question=question):
      question_embedding =get_embeddings(question)
      query_vector = question_embedding.reshape(1,-1)
      k = min(top_k,index.ntotal)
      distances,indices = index.search(query_vector,k)
      logfire.info("FAISS Search completed")
      top_chunks = []
      for i,idx in enumerate(indices[0]) :
        if idx<len(documents) and idx >=0:
          chunk = documents[idx].copy()
          chunk['similarity_score'] = float(distances[0][i])
          top_chunks.append(chunk)
      logfire.info("retivesd")
      return top_chunks

In [127]:
def _split_text(self,text:str)->List[str]:
    chunks = []
    start = 0
    while start <len(text):
      end = start + self.chunk_size
      chunk = text[start:end]
      if end < len(text):
        last_period = chunk.rfind(".")
        if last_period > self.chunk_size//2:
          chunk = chunk[:last_period+1]
          end = start+last_period+1
      chunks.append(chunk.strip())
      start = end-self.chunk_overlap
    return [c for c in chunks if c]

In [None]:
from typing import Tuple,List
def ask(question:str)->Tuple[str,List[str]]:
    with logfire.span("rag_ask",question=question):
      relevant_chunks = _retrive(question)
      if not relevant_chunks:
        return (

            "dont have relevant chunks"
            "please upload another document",
            []

        )
      # print(relevant_chunks)
    context = "\n\n".join([chunk for chunk in relevant_chunks])

    system_propmt = """you are a helpful assistant that answer question based in the provided contextt,
    Rules:
    1.only use information form the provided context
    2.If the context doenst contain the answer say so
    3.Be consise and accurate
    4.cite which document the information came from"""
    user_prompt =f""" Context:{context} Question:{question}
    please answer the question based on the context above"""
    logfire.info("Generating the answer with openai")
    response =client.chat.completions.create(
        model="gpt-4o-min",
        messages=[
            {"role":"system","content":system_propmt},
            {"role":"user","content":user_prompt}
        ]
    )
    answer = response.choices[0].message.content
    sources = list(set(chunk["filename"] for chunk in relevant_chunks))
    logfire.info("Answer generated")
    return answer,sources

In [131]:
answer, _ = ask("What is Multimodal Retrival?")
answer

[{'text': Document(metadata={'source': '../docs/Multimodal Retrieval.pdf', 'file_path': '../docs/Multimodal Retrieval.pdf', 'page': 0, 'total_pages': 6, 'Author': '', 'CreationDate': 'D:20251130052107Z', 'Creator': 'Nebo', 'ModDate': 'D:20251130052107Z', 'Producer': 'MyScript interactive ink', 'Title': 'Multimodal Retrieval'}, page_content='Custom Training of CLIP.\nEncoder Tent Encooler\nImage\n① LSTM.\n① Remit.\n•\nI'), 'filename': 'basic-text.pdf', 'chunk_index': 0, 'similarity_score': 0.22641685605049133}, {'text': Document(metadata={'source': '../docs/Multimodal Retrieval.pdf', 'file_path': '../docs/Multimodal Retrieval.pdf', 'page': 1, 'total_pages': 6, 'Author': '', 'CreationDate': 'D:20251130052107Z', 'Creator': 'Nebo', 'ModDate': 'D:20251130052107Z', 'Producer': 'MyScript interactive ink', 'Title': 'Multimodal Retrieval'}, page_content='☒ I m a g e P u p o u r i n g\n> R u b e I m a g e .\nI m a g e\n( 2 2 4 × 2 2 4 × 3 )\n( h x w x c )\nB l a ck\nA s p e c t R o l o . ( p a d

TypeError: sequence item 0: expected str instance, dict found