In [156]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [157]:
from langchain.document_loaders import PDFPlumberLoader

# loader = PDFPlumberLoader("../docs/Multimodal Retrieval.pdf")
loader = PDFPlumberLoader("../docs/Capstone Requirements - Phase 1 Review.pdf")
docs = loader.load()

In [158]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

chunks = splitter.split_documents(docs)

In [159]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

In [160]:
from openai import OpenAI
import os
key = os.environ["OPENAI_API_KEY"]

client = OpenAI()

In [161]:
import numpy as np
def get_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large"
    )
    embeddings = np.array(response.data[0].embedding,dtype=np.float32)
    return embeddings

In [162]:
import logfire
import faiss
from typing import List
index = faiss.IndexFlatIP(3072)
documents:List[dict] = []
print(f"Split the document into {len(chunks)} chunks")
embedddings_list =[]
for i,chunk in enumerate(chunks):
        response = get_embeddings(chunk.page_content)
        documents.append({
            "text":chunk,
            "filename":"Project Proposal_ Multimodal Media Retrieval.pdf",
            "chunk_index":i,
                }
        )
        embedddings_list.append(response)
    
embedddings_matrix = np.vstack(embedddings_list).astype(np.float32)
index.add(embedddings_matrix)



Split the document into 5 chunks


In [163]:

def _retrive(question:str,top_k:int=3)->List[dict]:
    if index.ntotal==0:
      return []
    with logfire.span("retrival of the chunks ",question=question):
      question_embedding =get_embeddings(question)
      query_vector = question_embedding.reshape(1,-1)
      k = min(top_k,index.ntotal)
      distances,indices = index.search(query_vector,k)
      logfire.info("FAISS Search completed")
      top_chunks = []
      for i,idx in enumerate(indices[0]) :
        if idx<len(documents) and idx >=0:
          chunk = documents[idx].copy()
          chunk['similarity_score'] = float(distances[0][i])
          top_chunks.append(chunk)
      logfire.info("retivesd")
      return top_chunks

In [164]:
def _retrive(question: str, top_k: int = 5):
    question_embedding = get_embeddings(question)  # shape: (D,)

    # FAISS expects (1, D)
    query_vector = np.array([question_embedding]).astype("float32")

    k = min(top_k, index.ntotal)

    distances, indices = index.search(query_vector, k)

    top_chunks = []
    for idx in indices[0]:
        if idx != -1:
            top_chunks.append(chunks[idx])

    return top_chunks

In [165]:
def _split_text(self,text:str)->List[str]:
    chunks = []
    start = 0
    while start <len(text):
      end = start + self.chunk_size
      chunk = text[start:end]
      if end < len(text):
        last_period = chunk.rfind(".")
        if last_period > self.chunk_size//2:
          chunk = chunk[:last_period+1]
          end = start+last_period+1
      chunks.append(chunk.strip())
      start = end-self.chunk_overlap
    return [c for c in chunks if c]

In [166]:
from typing import Tuple,List
def ask(question:str)->Tuple[str,List[str]]:
    with logfire.span("rag_ask",question=question):
      relevant_chunks = _retrive(question)
      if not relevant_chunks:
        return (

            "dont have relevant chunks"
            "please upload another document",
            []

        )
      print(relevant_chunks)
    context = "\n\n".join([chunk.page_content for chunk in relevant_chunks])

    system_propmt = """you are a helpful assistant that answer question based in the provided contextt,
    Rules:
    1.only use information form the provided context
    2.If the context doenst contain the answer say so
    3.Be consise and accurate
    4.cite which document the information came from"""
    user_prompt =f""" Context:{context} Question:{question}
    please answer the question based on the context above"""
    logfire.info("Generating the answer with openai")
    response =client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role":"system","content":system_propmt},
            {"role":"user","content":user_prompt}
        ]
    )
    answer = response.choices[0].message.content
    sources = list(set(chunk.page_content for chunk in relevant_chunks))
    logfire.info("Answer generated")
    return answer,sources

In [167]:
answer, _ = ask("What are capstone requirements?")
answer

[Document(metadata={'source': '../docs/Capstone Requirements - Phase 1 Review.pdf', 'file_path': '../docs/Capstone Requirements - Phase 1 Review.pdf', 'page': 0, 'total_pages': 1, 'Title': 'Capstone Requirements', 'Producer': 'macOS Version 26.1 (Build 25B78) Quartz PDFContext', 'Author': 'Seshadri Mazumder', 'Creator': 'Notes', 'CreationDate': "D:20251220022827Z00'00'", 'ModDate': "D:20251220022827Z00'00'"}, page_content='Capstone Requirements\nDear Teams,\nThese are the following requirements we have to showcase:\n1. Pretrained CLIP Inference\n1. Flickr8K\n1. Image2Text - Quantitative & Qualitative both\n2. Text2Image - Quantitative & Qualitative both\n3. Text2Text - Quantitative & Qualitative both\n4. Image2Image - Qualitative only\n2. Flickr30K\n1. Image2Text - Quantitative & Qualitative both\n2. Text2Image - Quantitative & Qualitative both\n3. Text2Text - Quantitative & Qualitative both\n4. Image2Image - Qualitative only'), Document(metadata={'source': '../docs/Capstone Requiremen

'The capstone requirements include the following:\n\n1. **Pretrained CLIP Inference**:\n   - For **Flickr8K**:\n     - Image2Text - Quantitative & Qualitative both\n     - Text2Image - Quantitative & Qualitative both\n     - Text2Text - Quantitative & Qualitative both\n     - Image2Image - Qualitative only\n   - For **Flickr30K**:\n     - Image2Text - Quantitative & Qualitative both\n     - Text2Image - Quantitative & Qualitative both\n     - Text2Text - Quantitative & Qualitative both\n     - Image2Image - Qualitative only\n   - Training & Testing/Validation Loss Per Epoch - Important (IMP)\n\n2. **Model Designs**:\n   - Various Models with different designs specifying the image and text encoders.\n\n3. **CLIP Trained from Scratch**:\n   - Requirements similar to those for Pretrained CLIP Inference, specifically for Flickr8K.\n\nAdditional instructions include creating slides, verifying numbers, avoiding code screenshots, and selecting three models to showcase.\n\n(Source: Capstone Re