In [None]:
! pip install -U langchain openai chromadb langchain-experimental # (newest versions required for multi-modal)




In [None]:
!pip install langchain-google-genai



In [None]:
! pip install "unstructured[all-docs]" pillow pydantic lxml pillow matplotlib chromadb tiktoken



In [None]:
!apt-get install poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.4 [186 kB]
Fetched 186 kB in 0s (444 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.4_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.4) ...
Setting up poppler-utils (22.02.0-2ubuntu0.4) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
!sudo apt install tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (6,067 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [54]:
import os
from langchain_text_splitters import CharacterTextSplitter
from unstructured.partition.pdf import partition_pdf

# Extract elements from PDF
def extract_pdf_elements(path, fname):
    """
    Partition a PDF into elements (images, tables and chunked text).

    path: Directory that contains the PDF
    fname: Name of the PDF file inside `path`

    Returns the result of `partition_pdf` for that file. Text is chunked with
    the "by_title" strategy using the size limits set below.
    """
    pdf_location = os.path.join(path, fname)

    # Options forwarded unchanged to Unstructured's PDF partitioner
    partition_options = dict(
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
    )
    return partition_pdf(filename=pdf_location, **partition_options)

# Categorize elements by type
def categorize_elements(raw_pdf_elements):
    """
    Sort partitioned PDF elements into text chunks and tables.

    raw_pdf_elements: Iterable of unstructured.documents.elements objects

    Returns (texts, tables): two lists holding `str(element)` for each matching
    element, in input order. Elements whose type name matches neither marker
    are dropped.
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"

    texts, tables = [], []
    for item in raw_pdf_elements:
        # Matching is done on the printed type name, not with isinstance
        type_repr = str(type(item))
        if table_marker in type_repr:
            bucket = tables
        elif text_marker in type_repr:
            bucket = texts
        else:
            continue
        bucket.append(str(item))
    return texts, tables


# Location of the input PDF: the notebook's current working directory
fpath =  os.getcwd()
fname = "space_F3_1_3.pdf" # Replace with the name of the PDF to process

# Partition the PDF into raw elements
raw_pdf_elements = extract_pdf_elements(fpath, fname)

# Split the elements into text chunks and tables (both lists of str)
texts, tables = categorize_elements(raw_pdf_elements)

# Optional: enforce a specific token size for texts.
# All text chunks are joined with a single space and re-split by the tiktoken
# based splitter, so texts_4k_token need not have the same length as texts.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)
joined_texts = " ".join(texts)
texts_4k_token = text_splitter.split_text(joined_texts)

In [55]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI


# Generate summaries of text elements
def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Summarize text and table elements for retrieval.

    texts: List of str
    tables: List of str
    summarize_texts: Bool; when True the texts are summarized by the model,
        otherwise the texts themselves are returned in place of summaries

    Returns (text_summaries, table_summaries). Each list is empty when the
    corresponding input is empty.

    The Google API key is read from the GOOGLE_API_KEY environment variable
    (empty string when unset) instead of being written into the notebook.
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain: each input string is bound to the {element} slot
    model = ChatGoogleGenerativeAI(
        temperature=0,
        model="gemini-pro",
        google_api_key=os.environ.get("GOOGLE_API_KEY", ""),
    )
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Initialize empty summaries
    text_summaries = []
    table_summaries = []

    # Apply to text if texts are provided and summarization is requested
    if texts and summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    elif texts:
        # No summarization requested: the raw texts stand in for the summaries
        text_summaries = texts

    # Apply to tables if tables are provided
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

    return text_summaries, table_summaries


# Get text, table summaries.
# The token-sized chunks (texts_4k_token) are what gets summarized, so
# text_summaries has one entry per chunk; summarize_texts=True requests
# model-generated summaries rather than passing the chunks through.
text_summaries, table_summaries = generate_text_summaries(
    texts_4k_token, tables, summarize_texts=True
)

In [56]:
import base64
from langchain_core.messages import HumanMessage


def encode_image(image_path):
    """Read the file at image_path and return its contents as a base64 str."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def image_summarize(img_base64, prompt):
    """
    Ask the vision model to summarize one image.

    img_base64: Base64-encoded image, sent as a JPEG data URL
    prompt: Instruction text sent alongside the image

    Returns the `content` of the model's reply.

    The Google API key is read from the GOOGLE_API_KEY environment variable
    (empty string when unset) instead of being written into the notebook.
    """
    chat = ChatGoogleGenerativeAI(
        model="gemini-pro-vision",
        max_tokens=1024,
        google_api_key=os.environ.get("GOOGLE_API_KEY", ""),
    )

    # A single human message carrying both the instruction and the image
    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content


def generate_img_summaries(path="figures"):
    """
    Generate summaries and base64 encoded strings for images.

    path: Directory containing the .jpg files to summarize. Defaults to
        "figures", the directory this notebook previously hardcoded.

    Returns (img_base64_list, image_summaries): two parallel lists, ordered by
    sorted file name. Files that do not end in ".jpg" are ignored.

    Raises whatever os.listdir raises if `path` does not exist.
    """

    # Store base64 encoded images
    img_base64_list = []

    # Store image summaries
    image_summaries = []

    # Prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Give a concise summary of the image that is well optimized for retrieval."""

    # Apply to images; sorting keeps the output order stable across runs
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(".jpg"):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))

    return img_base64_list, image_summaries


# Image summaries: encode and summarize every .jpg in the 'figures' directory.
# The two returned lists are parallel (same order, same length).
img_base64_list, image_summaries = generate_img_summaries()

In [57]:
# Number of image summaries produced (one per .jpg that was processed)
len(image_summaries)

24

In [58]:
import shutil

# Remove the directory of extracted figures now that the images are held in
# memory as base64 strings. Guarded so that re-running this cell after the
# directory is already gone does not raise FileNotFoundError.
if os.path.isdir('figures'):
    shutil.rmtree('figures')

In [None]:
!pip install langchain-openai


Collecting langchain-openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.7


In [59]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images):
    """
    Build a retriever that searches over summaries but returns raw content.

    vectorstore: Vector store that receives the summary documents
    text_summaries / texts: Summaries and the raw texts they stand for
    table_summaries / tables: Summaries and the raw tables they stand for
    image_summaries / images: Summaries and the raw (base64) images

    Each summary is linked to its raw item by position: summary i carries the
    generated id of content i in its metadata, and the raw item is stored in an
    in-memory docstore under that id. A group whose summaries list is empty is
    skipped entirely.
    """
    id_key = "doc_id_test_3"

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        id_key=id_key,
    )

    def _index_group(summaries, contents):
        """Store one group: summaries in the vectorstore, contents in the docstore."""
        # One fresh id per raw content item
        content_ids = []
        for _ in contents:
            content_ids.append(str(uuid.uuid4()))

        # Summary at position p points at the id generated for content p
        summary_docs = []
        for position, summary in enumerate(summaries):
            summary_docs.append(
                Document(page_content=summary, metadata={id_key: content_ids[position]})
            )

        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(content_ids, contents)))

    groups = (
        (text_summaries, texts),
        (table_summaries, tables),
        (image_summaries, images),
    )
    for summaries, contents in groups:
        if summaries:
            _index_group(summaries, contents)

    return retriever


# The vectorstore to use to index the summaries.
# The OpenAI key is read from the OPENAI_API_KEY environment variable (empty
# string when unset) instead of being written into the notebook.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY", ""))
vectorstore = Chroma(
    collection_name="doc_id_test_2", embedding_function=embeddings
)

# Create retriever.
# text_summaries holds one summary per entry of texts_4k_token, and the
# retriever links summary i to raw document i by position. The raw documents
# must therefore be texts_4k_token; passing the un-split `texts` list pairs
# summaries with unrelated chunks whenever the two lists differ in length.
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts_4k_token,
    table_summaries,
    tables,
    image_summaries,
    img_base64_list,
)

In [60]:
# Conversation so far, as a list of {"query": ..., "response": ...} dicts.
# Re-running this cell resets the conversation.
chat_history = []

In [61]:
import io
import re

from IPython.display import HTML, display
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from PIL import Image


def plt_img_base64(img_base64):
    """Display a base64 encoded string as an image in the notebook output."""
    # The string is embedded as the data-URL source of an HTML <img> tag
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))


def looks_like_base64(sb):
    """
    Return True when `sb` consists only of base64 alphabet characters,
    optionally followed by up to two '=' padding characters.
    """
    base64_shape = "^[A-Za-z0-9+/]+[=]{0,2}$"
    if re.match(base64_shape, sb):
        return True
    return False


def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data.

    b64data: Base64-encoded payload (str or bytes)

    Returns True when the decoded data starts with a JPEG, PNG or GIF
    signature, or is a RIFF container whose form type is "WEBP". Returns False
    for anything else, including input that cannot be base64-decoded.
    """
    # Formats identified purely by their leading bytes
    prefix_signatures = (
        b"\xff\xd8\xff",  # jpg
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",  # png
        b"\x47\x49\x46\x38",  # gif
    )
    try:
        # 12 bytes are enough to reach the RIFF form type at offsets 8-11
        header = base64.b64decode(b64data)[:12]
    except (ValueError, TypeError):
        # binascii.Error (a ValueError) for malformed base64, TypeError for
        # inputs that are not str/bytes-like
        return False

    if header.startswith(prefix_signatures):
        return True

    # "RIFF" alone also matches WAV and AVI files; a WebP image additionally
    # carries the "WEBP" form type right after the 4-byte size field.
    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"


def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize a base64-encoded image and return it base64-encoded again.

    base64_string: Base64-encoded image data
    size: Target (width, height) passed to PIL's resize

    The result is saved in the same format PIL detected for the input.
    """
    source_img = Image.open(io.BytesIO(base64.b64decode(base64_string)))

    # Resample with LANCZOS and write into an in-memory buffer
    out_buffer = io.BytesIO()
    source_img.resize(size, Image.LANCZOS).save(out_buffer, format=source_img.format)

    return base64.b64encode(out_buffer.getvalue()).decode("utf-8")


def split_image_text_types(docs):
    """
    Separate retrieved items into base64-encoded images and texts.

    docs: Iterable of retrieved items; Document instances are unwrapped to
        their page_content, anything else is used as-is

    Returns {"images": [...], "texts": [...]}. Items recognized as images are
    resized to (1300, 600) before being collected.
    """
    sorted_items = {"images": [], "texts": []}
    for entry in docs:
        content = entry.page_content if isinstance(entry, Document) else entry

        # Anything that is not both base64-shaped and image-signed is text
        if not (looks_like_base64(content) and is_image_data(content)):
            sorted_items["texts"].append(content)
            continue

        sorted_items["images"].append(resize_base64_image(content, size=(1300, 600)))
    return sorted_items


def img_prompt_func(data_dict):
    """
    Build the single multi-modal human message sent to the model.

    data_dict: Mapping with
        "question": the user's question
        "context": mapping with "texts" (list of str), "images" (list of
            base64 str) and "chat_history" (list of {"query", "response"})

    Returns a one-element list holding a HumanMessage whose content is the
    image parts (if any) followed by one text part with the instructions,
    the question, the earlier conversation and the retrieved texts.
    """
    context = data_dict["context"]

    joined_texts = "\n".join(context["texts"])

    # Earlier turns rendered as "Q: ... / A: ..." pairs, one pair per turn
    history_lines = []
    for turn in context["chat_history"]:
        history_lines.append(f"Q: {turn['query']}\nA: {turn['response']}")
    history_block = "\n".join(history_lines)

    # Image parts come first; a missing/empty image list contributes nothing
    content_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        for encoded in (context["images"] or ())
    ]

    # Instruction text for the model, followed by question, history and context
    instructions = (
        "You are an intelligent chatbot that has the ability to provide the perfect answer to user provided question based on the context given and the prevoius chat history .\n"
        "You will be given a mixed of text, tables, and images(photographs, graphs, charts)."
        "All the above text, tables, images will be retrieved from a vectorstore based on user-input keywords."
        "Please use your extensive knowledge and analytical skills to provide a comprehensive summary that includes:\n"
        "- A detailed description of the visual elements in the image.\n"
        "- Any insights and statistics, if they are charts and graphs.\n"
        "- Connections between the image and the related text.\n\n"
        f"User-provided question: {data_dict['question']}\n\n"
        "Previous chat Conversation context:\n"
        f"{history_block}\n\n"
        "Current Text and / or tables context:\n"
        f"{joined_texts}"
    )
    content_parts.append({"type": "text", "text": instructions})

    return [HumanMessage(content=content_parts)]


def multi_modal_rag_chain(retriever):
    """
    Build the multi-modal RAG chain.

    retriever: Retriever whose results are split into images and texts

    Returns a runnable that takes the question, retrieves context for it,
    builds the prompt with img_prompt_func, calls the model and parses the
    reply into a string.

    The module-level `chat_history` list is looked up every time the chain
    runs, so turns appended between invocations appear in later prompts.

    The Google API key is read from the GOOGLE_API_KEY environment variable
    (empty string when unset) instead of being written into the notebook.
    """

    # Multi-modal LLM. Original author's note: this works if an image is
    # retrieved; otherwise use gemini-pro or gpt4v.
    model = ChatGoogleGenerativeAI(
        temperature=0,
        model="gemini-pro-vision",
        google_api_key=os.environ.get("GOOGLE_API_KEY", ""),
    )

    def combined_context(data_dict):
        """Attach the shared chat history to the split retrieval results."""
        context = {
            "texts": data_dict.get("texts", []),
            "images": data_dict.get("images", []),
            "chat_history": chat_history
        }
        return context

    # RAG pipeline: the same input feeds both the retrieval branch ("context")
    # and, unchanged, the "question" slot.
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types) | RunnableLambda(combined_context),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )

    return chain


# Create the RAG chain on top of the multi-vector retriever built earlier
chain_multimodal_rag = multi_modal_rag_chain(retriever_multi_vector_img)

In [62]:
# Turn 1: opening question, asked while chat_history is still empty
query = 'Who is the first human in space orbiting the earth once?'
response = chain_multimodal_rag.invoke(query)

In [63]:
# Show the model's answer to the first question
response

" The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions.\n\nThe image shows a black and white photograph of Gagarin in his Vostok 1 spacesuit. Gagarin is smiling and looking directly at the camera. The photograph was taken shortly before Gagarin's launch into space.\n\nThe image is a significant historical document, as it captures the moment when Gagarin became the first human to orbit the Earth. The image is also a reminder of the courage and determination of the early space explorers."

In [64]:
# Record the turn so later prompts include it; re-running this cell appends a duplicate
chat_history.append({"query": query, "response": response})


In [65]:
# Inspect the conversation recorded so far
chat_history

[{'query': 'Who is the first human in space orbiting the earth once?',
  'response': " The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions.\n\nThe image shows a black and white photograph of Gagarin in his Vostok 1 spacesuit. Gagarin is smiling and looking directly at the camera. The photograph was taken shortly before Gagarin's launch into space.\n\nThe image is a significant historical document, as it captures the moment when Gagarin became the first human to orbit the Earth. The image is also a reminder of the courage and determination o

In [66]:
# Turn 2: follow-up whose "he" is only resolvable from the recorded chat history.
# Note that retrieval itself runs on this raw query string.
query = 'name of the space craft he travelled in?'
response = chain_multimodal_rag.invoke(query)
response

" The Vostok 1 spacecraft was a single-person spacecraft that was designed to carry a human being into space. The spacecraft was launched by a R-7 rocket, and it consisted of a spherical capsule that was attached to a cylindrical service module. The capsule was made of aluminum alloy, and it had a diameter of 2.3 meters. The service module contained the spacecraft's engines, fuel, and life support systems.\n\nThe Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions."

In [67]:
# Record turn 2 in the shared history
chat_history.append({"query": query, "response": response})


In [68]:
# Inspect the history after two turns
chat_history

[{'query': 'Who is the first human in space orbiting the earth once?',
  'response': " The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions.\n\nThe image shows a black and white photograph of Gagarin in his Vostok 1 spacesuit. Gagarin is smiling and looking directly at the camera. The photograph was taken shortly before Gagarin's launch into space.\n\nThe image is a significant historical document, as it captures the moment when Gagarin became the first human to orbit the Earth. The image is also a reminder of the courage and determination o

In [71]:
# Turn 3: another follow-up that depends on the earlier turns for "he"
query = 'In which year he completed one orbit of earth?'
response = chain_multimodal_rag.invoke(query)
response

" The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions."

In [72]:
# Record turn 3 in the shared history
chat_history.append({"query": query, "response": response})


In [73]:
# Inspect the history after three turns
chat_history

[{'query': 'Who is the first human in space orbiting the earth once?',
  'response': " The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions.\n\nThe image shows a black and white photograph of Gagarin in his Vostok 1 spacesuit. Gagarin is smiling and looking directly at the camera. The photograph was taken shortly before Gagarin's launch into space.\n\nThe image is a significant historical document, as it captures the moment when Gagarin became the first human to orbit the Earth. The image is also a reminder of the courage and determination o

In [75]:
# Turn 4: "that year" refers to the answer of the previous turn, so this
# question can only be answered with the chat history in the prompt
query = 'how many years have gone past from that year to 2021?'
response = chain_multimodal_rag.invoke(query)
response

" The first human to orbit the Earth was Yuri Alekseyevich Gagarin, a Soviet cosmonaut who completed one orbit of Earth on April 12, 1961. Gagarin was launched into space in the Vostok 1 spacecraft, which was developed by the Soviet Union. The Vostok 1 spacecraft was launched from the Baikonur Cosmodrome in Kazakhstan, and it completed one orbit of Earth in 108 minutes. Gagarin's flight was a major milestone in the history of space exploration, and it paved the way for future human spaceflight missions.\n\nThe image shows a black and white photograph of Gagarin in his Vostok 1 spacesuit. Gagarin is smiling and looking directly at the camera. The photograph was taken shortly before Gagarin's launch into space.\n\nThe image is a significant historical document, as it captures the moment when Gagarin became the first human to orbit the Earth. The image is also a reminder of the courage and determination of the early space explorers.\n\n60 years have passed from that year to 2021."