# Install and Import Libraries

In [None]:
!sudo apt-get update
!sudo apt-get install poppler-utils
!sudo apt-get install tesseract-ocr
!sudo apt-get install libtesseract-dev
!pip install "unstructured[all-docs]" pillow pydantic lxml matplotlib unstructured-pytesseract
!pip install chromadb grandalf langchain langchain-community langchain-groq langchain-pinecone pinecone-notebooks langchain-anthropic langchain-google-genai langchain-openai langchain-huggingface

In [None]:
from langchain_groq import ChatGroq
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline, HuggingFaceEmbeddings

import os

from google.colab import userdata

# Copy every provider credential from Colab's secret store into the process
# environment, one variable per secret and under the same name.
for _secret_name in (
    "HF_TOKEN",
    "GOOGLE_API_KEY",
    "OPENAI_API_KEY",
    "GROQ_API_KEY",
    "ANTHROPIC_API_KEY",
    "PINECONE_API_KEY",
):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Multimodal (MM) Extraction

## Method 1

In [None]:
from unstructured.partition.pdf import partition_pdf

In [None]:
import os
import requests

def download_pdf(url, save_dir, timeout=30):
    """Download the file at ``url`` into ``save_dir`` and return its local path.

    Args:
        url: HTTP(S) address of the file. The local file name is the last path
            segment of the URL, without any query string or fragment.
        save_dir: Directory to save into; it is created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the request fails. Request
        errors are printed, not raised.
    """
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Last path segment of the URL, with "?query" / "#fragment" stripped so
    # they do not end up in the file name
    filename = url.split("#", 1)[0].split("?", 1)[0].split("/")[-1]

    # Construct the full path to save the file
    file_path = os.path.join(save_dir, filename)

    try:
        # The context manager releases the streamed connection even when
        # writing fails; the timeout keeps a stalled server from hanging the cell.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad status codes

            # Write the content to the file in chunks
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to download file: {e}")
        return None

    print(f"✅ File downloaded successfully: {file_path}")
    return file_path


# Example usage
pdf_url = "https://raw.githubusercontent.com/lokeshparab/GenAI-Full-Course/refs/heads/main/data/RAG_FOR_NLP.pdf"
destination_folder = "data"  # Replace with your desired directory path

filename = download_pdf(pdf_url, destination_folder)

# download_pdf reports a failed request by printing and returning None. Stop
# here so the problem does not resurface later as an obscure error when the
# path is handed to partition_pdf.
if filename is None:
    raise RuntimeError(f"Could not download {pdf_url}")


In [None]:
# Partition the downloaded PDF into a list of unstructured elements. The result
# is sorted into per-type lists by the next cell.
raw_pdf_elements = partition_pdf(
    filename=filename,
    strategy="hi_res",  # NOTE(review): presumably the layout-model strategy that needs the poppler/tesseract system packages — confirm
    extract_images_in_pdf=True,
    extract_image_block_types=["Image","Table"],  # element types to export as image files
    extract_image_block_to_payload=False,  # NOTE(review): presumably "write files to disk instead of embedding them in element metadata" — confirm against the unstructured docs
    extract_image_block_output_dir="extracted_data",  # the same directory is read back later through image_dir_path
    infer_table_structure=True,
)

In [None]:
# One list per element category, each holding the str() form of the elements.
# Later cells read these names directly (NarrativeText and Table feed the
# summarisation chains), so the names are kept unchanged.
Header = []
Footer = []
Title = []
NarrativeText = []
Text = []
ListItem = []
Image = []
Table = []

# Dispatch on the exact class of each element. The previous test looked for
# "unstructured.documents.elements.<Name>" inside str(type(element)), which is
# a prefix match: a class such as TableChunk would have been filed under Table.
_ELEMENTS_MODULE = "unstructured.documents.elements"
_buckets_by_type = {
    "Header": Header,
    "Footer": Footer,
    "Title": Title,
    "NarrativeText": NarrativeText,
    "Text": Text,
    "ListItem": ListItem,
    "Image": Image,
    "Table": Table,
}

for element in raw_pdf_elements:
    element_cls = type(element)
    if element_cls.__module__ != _ELEMENTS_MODULE:
        continue
    _bucket = _buckets_by_type.get(element_cls.__name__)
    # Element types without a list of their own are skipped, as before.
    if _bucket is not None:
        _bucket.append(str(element))

In [None]:
Header

In [None]:
Title

In [None]:
NarrativeText

In [None]:
Text

In [None]:
Table

In [None]:
Image

# Model loading

## Chat Model

In [None]:
# Hosted Hugging Face model: HuggingFaceEndpoint is pointed at the TinyLlama
# chat repo and wrapped by ChatHuggingFace.
# Note: the name `llm` is rebound by the local-pipeline cell further down.
llm = HuggingFaceEndpoint(
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # repo_id = "perplexity-ai/r1-1776",
    task="text-generation"
  )
# llm=HuggingFaceEndpoint(repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",task="text-generation")
hf_model = ChatHuggingFace(llm=llm)

# Smoke test. Any failure is printed instead of raised, so this cell never
# stops a run of the notebook.
# NOTE(review): presumably because the hosted endpoint is not always available — confirm.
try:
  hf_model.invoke("Hi I am Lokesh")
except Exception as e:
  print(e)

In [None]:
# Smoke test for the Gemini chat model: build the client and send one greeting.
# gemini_model is also the model used later to summarise the extracted images.
gemini_model = ChatGoogleGenerativeAI(model='gemini-1.5-flash')
gemini_model.invoke("Hi I am Lokesh")

In [None]:
# Smoke test for the Groq-hosted chat model.
groq_model = ChatGroq(model="deepseek-r1-distill-llama-70b")
groq_model.invoke("Hi I am Lokesh")

In [None]:
# Smoke test for the OpenAI chat model. open_ai_model is the model that the
# summarisation chains and the final RAG chain are built on.
open_ai_model = ChatOpenAI(model="gpt-4o")
open_ai_model.invoke("Hi I am Sunny")

In [None]:
# Smoke test for the Anthropic chat model.
# NOTE(review): confirm that the "claude-2" model id is still served by the API.
claude_model = ChatAnthropic(model="claude-2")
claude_model.invoke("Hi I am Sunny")

In [None]:
# Local inference: download TinyLlama and run it through a transformers
# text-generation pipeline, then wrap it as a chat model.
# Note: this rebinds `llm`, which an earlier cell bound to the hosted endpoint.
llm = HuggingFacePipeline.from_model_id(
    model_id='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    task='text-generation',
    pipeline_kwargs={
        # temperature only applies in sampling mode; under the default greedy
        # decoding it is ignored, so sampling is switched on explicitly.
        "do_sample": True,
        "temperature": 0.5,
        "max_new_tokens": 100,
    },
)

model = ChatHuggingFace(llm=llm)
model.invoke("Hi I am Lokesh")

## Embedding Model

In [None]:


# Settings for the OpenAI embedding demo.
embedding_model = "text-embedding-3-large"
dimensions = 64
set_dimension = True
query = "India is a growing country"

# `dimensions` is only passed when the toggle is on; otherwise the client is
# built from the model name alone.
openai_embedding = (
    OpenAIEmbeddings(model=embedding_model, dimensions=dimensions)
    if set_dimension
    else OpenAIEmbeddings(model=embedding_model)
)

# Embed the sample query and show the vector length followed by the vector.
result = openai_embedding.embed_query(query)
print(len(result), result)

In [None]:


# Settings for the Google embedding demo. Options are written as strings, with
# the literal "None" standing for "not set".
embedding_model = "models/gemini-embedding-exp-03-07"
task_type = "retrieval_query"
transport = "None"
query = "India is a growing country"


def func(x):
    """Turn the literal string "None" into a real None; return anything else unchanged."""
    return None if x == "None" else x


task_type = func(task_type)
transport = func(transport)

google_embedding = GoogleGenerativeAIEmbeddings(
    model=embedding_model, task_type=task_type, transport=transport
)

# Embed the sample query and show the vector length followed by the vector.
result = google_embedding.embed_query(query)
print(len(result), result)


In [None]:
# Hugging Face
# Embedding demo with the "all-MiniLM-L6-v2" model. huggingface_embeddings is
# the embedding function later handed to the Chroma vector store.
model_name = "all-MiniLM-L6-v2" 
query = "India is a growing country" 
huggingface_embeddings=HuggingFaceEmbeddings(model_name=model_name)

# Embed the sample query and show the vector length followed by the vector.
result = huggingface_embeddings.embed_query(query)
print(len(result),result)

# Prepare for Vector DB

## Summary for the Text

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Prompt that asks for a retrieval-oriented summary of one element.
# {element} is its only template variable.
prompt_text="""You are an assistant tasked with summarizing text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text elements. \
    Give a concise summary of the table or text that is well optimized for retrieval.text: {element} """

prompt_template = ChatPromptTemplate.from_template(prompt_text)

# The leading dict maps the raw chain input onto {element}, so the chain can be
# called with a plain string: prompt -> OpenAI chat model -> plain string out.
# Note: the table-summary cell further down rebinds the name summarize_chain.
summarize_chain = {"element": lambda x: x} | prompt_template | open_ai_model | StrOutputParser()

# Print the chain's graph as ASCII art.
summarize_chain.get_graph().print_ascii()

In [None]:
# Summarise every NarrativeText element with the chain above, with
# max_concurrency set to 5. text_summary is later passed to the retriever
# builder together with NarrativeText.
text_summary=summarize_chain.batch(NarrativeText,{"max_concurrency": 5})
text_summary

In [None]:
text_summary[3]

In [None]:
import re

def clean_text(text:str):
  """Return ``text`` with every ``<think>...</think>`` span removed.

  The match is non-greedy and spans newlines; whitespace directly after a
  closing tag is removed along with the span.
  """
  think_block = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)
  return think_block.sub("", text)


## Summary for Table

In [None]:
# Prompt that asks for a retrieval-oriented summary of one table.
# {element} is its only template variable.
prompt_text = """You are an AI Assistant tasked with summarizing tables for retrieval. \
    These summaries will be embedded and used to retrieve the raw table elements. \
    Give a concise summary of the table that is well optimized for retrieval. Table:{element} """

prompt = ChatPromptTemplate.from_template(prompt_text)
# Same shape as the text chain, and bound to the same name: from this cell on,
# summarize_chain uses the table prompt.
summarize_chain = {"element": lambda x: x} | prompt | open_ai_model | StrOutputParser()


# Print the chain's graph as ASCII art.
summarize_chain.get_graph().print_ascii()

In [None]:
# Summarise every extracted table with the chain above, with max_concurrency
# set to 5. table_summaries is later passed to the retriever builder with Table.
table_summaries = summarize_chain.batch(Table, {"max_concurrency": 5})
table_summaries

## Summary for Images

In [None]:
import base64, os
from langchain_core.messages import AIMessage, HumanMessage

def encode_image(image_path):
  """Read the file at ``image_path`` and return its bytes as a base64 text string."""
  with open(image_path, "rb") as fh:
    raw_bytes = fh.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')

def _guess_image_mime(image_b64):
  """Return the MIME type implied by the image's magic bytes; "image/jpeg" when unknown."""
  try:
    # 16 base64 characters decode to 12 bytes, enough for every signature below
    header = base64.b64decode(image_b64[:16])
  except (ValueError, TypeError):
    return "image/jpeg"

  if header.startswith(b"\x89PNG\r\n\x1a\n"):
    return "image/png"
  if header.startswith(b"GIF8"):
    return "image/gif"
  if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
    return "image/webp"
  return "image/jpeg"

def image_summarize(imag_base64,prompt):
  """Ask the Gemini chat model to describe one image.

  Args:
    imag_base64: The image file's bytes, base64-encoded as text.
    prompt: Instruction text sent alongside the image.

  Returns:
    The ``content`` of the model's reply.
  """
  # The data URL used to say image/jpeg for every file, although the caller
  # also passes .png files; label the payload with the type it actually has.
  mime_type = _guess_image_mime(imag_base64)

  msg = gemini_model.invoke(
      [
          HumanMessage(
              content=[
                  {
                      "type":"text",
                      "text":prompt
                  },
                  {
                      "type":"image_url",
                      "image_url":{
                          "url": f"data:{mime_type};base64,{imag_base64}"
                      }
                  }
              ]
          )
      ]
  )

  return msg.content

def generate_imag_summaries(path):
  """Encode and summarise every image file found directly inside ``path``.

  Files are processed in sorted name order. Only names ending in ".jpg",
  ".jpeg" or ".png" (case-sensitive) are considered.

  Returns:
    A pair ``(image_base64_list, image_summarises)`` of equal length, where
    position i of both lists refers to the same file.
  """
  prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Give a concise summary of the image that is well optimized for retrieval."""
  allowed_suffixes = (".jpg", ".jpeg", ".png")

  image_base64_list = []
  image_summarises = []

  for entry in sorted(os.listdir(path)):
    if not entry.endswith(allowed_suffixes):
      continue
    encoded = encode_image(os.path.join(path, entry))
    image_base64_list.append(encoded)
    image_summarises.append(image_summarize(encoded, prompt))

  return image_base64_list, image_summarises

In [None]:

# Directory the partitioning step was told to write extracted image blocks to.
image_dir_path = "extracted_data"
# Base64 payloads and their summaries, index-aligned.
image_base64_list, image_summarises = generate_imag_summaries(image_dir_path)

image_summarises

# Store data in vectordb

In [None]:
from langchain_core.documents import Document
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
import uuid

In [None]:
# SECURITY: an API key must never be written into the notebook. The Groq key is
# read from Colab's secret store, like every other credential in this notebook.
# NOTE(review): the key that used to be hardcoded on this line is part of the
# file's history and must be treated as leaked — revoke it with the provider.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')


In [None]:
def create_multi_vector_retriver(
    vectorstore,
    text_summaries ,texts,
    table_summaries,tables,
    image_summarises, images
):
  """Build a MultiVectorRetriever that searches summaries and returns raw content.

  Each summary is embedded into ``vectorstore``; the matching raw item is kept
  in an in-memory docstore under the same generated ``doc_id``.

  Args:
    vectorstore: Vector store that receives the summary documents.
    text_summaries, texts: Summaries and the raw texts they describe.
    table_summaries, tables: Summaries and the raw tables they describe.
    image_summarises, images: Summaries and the raw images they describe.
      Each summary list must be index-aligned with its content list.

  Returns:
    The populated retriever.
  """

  retriever=MultiVectorRetriever(
      vectorstore=vectorstore,
      docstore=InMemoryStore(),
      id_key="doc_id",
  )

  def add_docs(retriever,doc_summaries,doc_contents):
    """Index ``doc_summaries`` and store ``doc_contents`` under shared fresh ids."""
    # Nothing was extracted for this modality (e.g. a PDF without tables).
    # Skip it rather than hand the vector store an empty batch, which some
    # stores reject.
    if not doc_summaries:
      return

    doc_ids = [ str(uuid.uuid4()) for _ in range(len(doc_summaries))]

    summary_docs = [
        Document(
            page_content=doc_summary,
            metadata={"doc_id":doc_id}
        )
        for doc_summary,doc_id in zip(doc_summaries,doc_ids)
    ]

    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(
        list(zip(doc_ids,doc_contents))
    )

  # Use the arguments, not notebook globals. The image contents in particular
  # must be `images`: storing the summaries again meant no image could ever be
  # returned by the retriever.
  add_docs(retriever,text_summaries,texts)
  add_docs(retriever,table_summaries,tables)
  add_docs(retriever,image_summarises,images)

  return retriever

# embedding_model=load_model("embedding")

# Chroma collection that holds the summary embeddings, embedded with the
# Hugging Face model created earlier.
vectorstore=Chroma(collection_name="MMRAG",embedding_function=huggingface_embeddings)
# Arguments come in (summaries, raw content) pairs: texts, tables, then images.
retriever_multi_vector=create_multi_vector_retriver(
    vectorstore,
    text_summary,NarrativeText,
    table_summaries,Table,
    image_summarises, image_base64_list
)



In [None]:
retriever_multi_vector

In [None]:
# Query the retriever through the Runnable interface. get_relevant_documents is
# the deprecated retriever entry point; invoke is its replacement.
retriever_multi_vector.invoke(
    "Why We combine a pre-trained retriever (Query Encoder + Document Index) with a pre-trained seq2seq model (Generator) and fine-tune end-to-end?"
)

In [None]:
retriever_multi_vector.invoke(
    "Why We combine a pre-trained retriever (Query Encoder + Document Index) with a pre-trained seq2seq model (Generator) and fine-tune end-to-end?"
)

# Image data processing

In [None]:
import io, re
from IPython.display import HTML, display
from PIL import Image

In [None]:
def is_image_data(b64data)->bool:
  """Return True when ``b64data`` decodes to bytes that start like an image file.

  Recognised formats are JPEG, PNG, GIF and WebP, identified by their leading
  magic bytes. Input that cannot be base64-decoded yields False.
  """
  try:
    header = base64.b64decode(b64data)[:12] # Decode and keep the first 12 bytes
  except (ValueError, TypeError):
    # ValueError covers malformed base64 (binascii.Error is a subclass);
    # TypeError covers None and other non-string input.
    return False

  prefix_signatures = (
      b"\xFF\xD8\xFF",                      # jpg
      b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",  # png
      b"\x47\x49\x46\x38",                  # gif
  )
  if header.startswith(prefix_signatures):
    return True

  # "RIFF" alone is a generic container header shared with WAV and AVI; a WebP
  # file additionally carries "WEBP" at bytes 8-11.
  return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"

def looks_like_base64(sb):
  """Return True when ``sb`` consists only of base64 alphabet characters plus up to two "=" of padding.

  This is a cheap pre-filter on the character set; it does not prove that the
  string decodes to anything meaningful.
  """
  # The character class must be A-Z and a-z as two ranges. A single "A-z"
  # range also admits the ASCII characters between "Z" and "a": [ \ ] ^ _ `
  return re.match(r"[A-Za-z0-9+/]+[=]{0,2}$",sb) is not None

def resize_base64_image(base64_string,size=(128,128)):
  """Resize a base64-encoded image and return the result as base64 text.

  Args:
    base64_string: The image file's bytes, base64-encoded.
    size: Target ``(width, height)``. The image is resized to exactly this
      size; the aspect ratio is not preserved.

  Returns:
    The resized image, saved in the source image's format and base64-encoded.
  """
  # Imported locally under an alias: the notebook also binds the bare name
  # `Image` to a list of extracted image elements, so whichever cell ran last
  # decides what the global `Image` is.
  from PIL import Image as PILImage

  img_data = base64.b64decode(base64_string)
  img = PILImage.open(io.BytesIO(img_data))
  resized_img = img.resize(size,PILImage.LANCZOS)

  buffered = io.BytesIO()
  # The resized copy carries no format of its own, so reuse the source's;
  # fall back to PNG if the source did not report one.
  resized_img.save(buffered, format=img.format or "PNG")
  return base64.b64encode(buffered.getvalue()).decode('utf-8')

def split_image_text_types(docs):
  """
  Separate retrieved items into base64 images and texts.

  Document objects are unwrapped to their page_content first. An item counts as
  an image when it both looks like base64 and decodes to image data; such items
  are resized to 1300x600 before being collected. Everything else is a text.

  Returns a dict with the keys "images" and "texts".
  """
  images_out, texts_out = [], []

  for item in docs:
    content = item.page_content if isinstance(item, Document) else item

    if not (looks_like_base64(content) and is_image_data(content)):
      texts_out.append(content)
      continue

    images_out.append(resize_base64_image(content,size=(1300,600)))

  return {"images": images_out, "texts": texts_out}

def img_prompt_func(data_dict):
  """
  Build the multimodal prompt for the answering model.

  Args:
    data_dict: Mapping with "question" (the user's question) and "context",
      itself a mapping with "images" (base64 strings) and "texts" (strings).

  Returns:
    A one-element list holding a HumanMessage whose content is one image part
    per retrieved image, followed by a single text part with the instructions,
    the question and the retrieved texts joined by newlines.
  """
  context = data_dict["context"]
  formatted_text = "\n".join(context["texts"])

  # No debug dump of data_dict here: it holds every retrieved image as base64
  # and would flood the cell output on each call.
  messages = [
      {
          "type":"image_url",
          "image_url":{
              "url": f"data:image/jpeg;base64,{img}"
          }
      }
      for img in context["images"]
  ]

  text_message = {
      "type":"text",
      "text": (
          "You are a helpful assistant.\n"
          "You will be given a mixed info(s) .\n"
          "Use this information to provide relevant information to the user question. \n"
          f"User-provided question: {data_dict['question']}\n\n"
          "Text and / or tables:\n"
          f"{formatted_text}"
      ),
  }

  messages.append(text_message)

  return [HumanMessage(content=messages)]



In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

def muti_model_rag_chain(retriever):
  """Assemble the multimodal RAG chain around ``retriever``.

  The chain input is used twice: passed through unchanged as "question", and
  sent to the retriever, whose results are split into images and texts as
  "context". That mapping is turned into a prompt by img_prompt_func, answered
  by open_ai_model and reduced to a plain string.
  """
  gather_inputs = {
      "context": retriever | RunnableLambda(split_image_text_types),
      "question": RunnablePassthrough(),
  }
  build_prompt = RunnableLambda(img_prompt_func)

  return gather_inputs | build_prompt | open_ai_model | StrOutputParser()

# Build the RAG chain on top of the multi-vector retriever created earlier.
chain_mm_rag = muti_model_rag_chain(retriever_multi_vector)

# Print the chain's graph as ASCII art.
chain_mm_rag.get_graph().print_ascii()

In [None]:
# Ask about a figure by quoting its caption. The backslash joins the two source
# lines with nothing in between, so the space before it is what keeps
# "performance" and "in" from fusing into one word in the query.
chain_mm_rag.invoke(
    """Explain any images / figures in the paper with Left: NQ performance as more documents are retrieved. Center: Retrieval recall performance \
in NQ. Right: MS-MARCO Bleu-1 and Rouge-L as more documents are retrieved.
  """
)