## 1. Libraries

In [1]:
import torch
from transformers import LayoutLMv2Processor, AutoModelForTokenClassification, LayoutLMv2ForTokenClassification, AutoTokenizer
import pdfplumber
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
from io import BytesIO
import pytesseract
import numpy as np
import cv2
import base64
import glob
import os
from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
import nltk
import openai
import uuid
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document as LangchainDocument

In [2]:
# Ensure PyTorch uses GPU if available on MacBook with MPS or fallback to CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


## 2.Setting Paths and Extracting PDF Content with Detailed Configuration

- This code sets up paths for NLTK data and image extraction, then processes a PDF using partition_pdf.
- It extracts text and images from the PDF with detailed settings, saving images to a specified directory and handling text chunking for efficient processing.

In [4]:
path = "your_path/Multi_modal_vector_search/"

In [5]:
# Set the NLTK data path
nltk_data_path = "../nltk_data/nltk_data"
nltk.data.path.append(nltk_data_path)

# Define the path to the directory containing extracted images
IMG_DIR = "your_path/Multi_modal_vector_search/images/"
os.makedirs(IMG_DIR, exist_ok=True)

# Re-run partition_pdf with detailed settings
raw_pdf_elements = partition_pdf(
    filename=path + "LLaMA2.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=IMG_DIR,
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 2. Examining the PDF elements
- Categorizing PDF Elements and Counting Tables and Text:
    - This code analyzes the PDF elements by counting their occurrences and categorizing them into "table" and "text" types using a custom Element class.
    - It extracts the number of tables and text elements separately to understand the content structure.

In [7]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 111,
 "<class 'unstructured.documents.elements.Table'>": 44,
 "<class 'unstructured.documents.elements.TableChunk'>": 6}

In [8]:
# Define a custom Element class for structured categorization
class Element(BaseModel):
    type: str
    text: Any

# Categorize PDF elements by type (table or text)
categorized_elements = []
for element in raw_pdf_elements:
    # If the element is of type 'Table', categorize as table
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    # If the element is of type 'CompositeElement', categorize as text
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Extract and count table elements
table_elements = [e for e in categorized_elements if e.type == "table"]
print("Number of table elements:", len(table_elements))

# Extract and count text elements
text_elements = [e for e in categorized_elements if e.type == "text"]
print("Number of text elements:", len(text_elements))

Number of table elements: 50
Number of text elements: 111


### Quick check to see the table elements too for validation

In [20]:
table_elements[2]

Element(type='table', text='Time Power Carbon Emitted (GPU hours) Consumption (W) (tCOzeq) 7B 184320 400 31.22 L 13B 368640 400 62.44 TAMA 2 348 1038336 350 153.90 70B 1720320 400 291.42 Total 3311616 539.00')

## 3. Multi-vector retriever

In [28]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from function_calling1 import _parse_google_docstring  # Use the local file

In [29]:
from langchain_openai import ChatOpenAI

In [30]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \ 
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
OPENAI_API_KEY = "you_api_key"
openai.api_key = OPENAI_API_KEY

# Summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o",api_key=OPENAI_API_KEY, max_tokens=1500 )
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [31]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [32]:
table_summaries[2]

'The table summarizes the power consumption and carbon emissions for different models based on GPU hours. The models listed are 7B, 13B, TAMA 2, and 70B. The power consumption for each model is 400W, except for TAMA 2, which consumes 350W. The carbon emissions (in tCO2eq) for each model are as follows: 7B emits 31.22 tCO2eq, 13B emits 62.44 tCO2eq, TAMA 2 emits 153.90 tCO2eq, and 70B emits 291.42 tCO2eq. The total power consumption across all models is 3,311,616 GPU hours, resulting in a total carbon emission of 539.00 tCO2eq.'

In [33]:
# Creating the concatenated text
output_text = ""
for i, summary in enumerate(table_summaries, 1):
    output_text += f"Table {i} details:\n{summary}\n\n\n"

# Save the concatenated text to a file
with open('table_summaries.txt', 'w') as file:
    file.write(output_text)

In [34]:
# Creating the concatenated text
output_text = ""
for i, summary in enumerate(table_summaries, 1):
    output_text += f"Table {i} details:\n{summary}\n\n\n"

# Save the concatenated text to a file
with open('table_summaries.txt', 'w') as file:
    file.write(output_text_2)

In [35]:
# Apply to texts
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

In [36]:
text_summaries[2]

'Large Language Models (LLMs) have demonstrated significant potential as AI assistants, excelling in complex reasoning tasks across various domains. They interact with users through chat interfaces, leading to widespread adoption. LLMs are trained using auto-regressive transformers on large datasets, followed by alignment with human preferences through techniques like Reinforcement Learning with Human Feedback (RLHF). However, the high computational demands restrict their development to a few entities.\n\nPublicly released LLMs like BLOOM, LLaMa-1, and Falcon match the performance of closed models like GPT-3 and Chinchilla but are not as fine-tuned for usability and safety as closed models such as ChatGPT, BARD, and Claude. Fine-tuning these models involves significant computational and human annotation costs, limiting transparency and reproducibility.\n\nThis work introduces Llama 2, a family of pretrained and fine-tuned LLMs, including Llama 2 and Llama 2-Cuar, with up to 70 billion 

## 3.2. Extracting images and summarizing them

- This section extracts images from PDF files and uses pytesseract for Optical Character Recognition (OCR) to obtain text from the images.
- The extracted text is then combined with image details to create a summary using GPT-4, which generates a detailed 5-point summary based on the content and context of the image.

In [37]:
# Define the path for the output directory
import fitz 
# Define the path to the directory containing extracted images
IMG_DIR = "your_path/Multi_modal_vector_search/images/"
os.makedirs(IMG_DIR, exist_ok=True)

In [38]:
# Open the PDF
pdf_document = fitz.open("your_path/Multi_modal_vector_search/LLaMA2.pdf")

In [39]:
# Function to save image as PNG
def save_image_as_png(image_bytes, page_number, image_index):
    image_filename = f"image_{page_number}_{image_index}.png"
    image_path = os.path.join(IMG_DIR, image_filename)
    with open(image_path, "wb") as image_file:
        image_file.write(image_bytes)

# Extract images
for page_number in range(len(pdf_document)):
    page = pdf_document[page_number]
    image_list = page.get_images(full=True)
    xrefs = [img[0] for img in image_list]
    
    for image_index, xref in enumerate(xrefs):
        base_image = pdf_document.extract_image(xref)
        image_bytes = base_image["image"]
        save_image_as_png(image_bytes, page_number, image_index)

    # Additional extraction for figures that are not direct images
    for block in page.get_text("dict")["blocks"]:
        if "lines" in block:  # Check if the block contains text lines
            for line in block["lines"]:
                for span in line["spans"]:
                    if "font" in span and "image" in span["font"]:  # Check if span contains an image
                        image_bytes = span["image"]
                        save_image_as_png(image_bytes, page_number, image_index)
                        image_index += 1

print("Image extraction complete.")

Image extraction complete.


## 3.3 Text summary of each image

- For each extracted image, the text obtained through OCR is summarized using GPT-4.
- The summary provides insights into the type of image (e.g., graph, table), relevant labels, and key metrics, enabling a more comprehensive understanding of the visual content.

In [40]:
# Function to resize and compress image
def resize_and_compress_image(image_path, max_size=(500, 500)):
    with Image.open(image_path) as img:
        img.thumbnail(max_size)
        buffer = BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

def generate_image_summary(image_path):
    image = Image.open(image_path)
    ocr_text = pytesseract.image_to_string(image)
    resized_image_bytes = resize_and_compress_image(image_path)
    encoded_image = base64.b64encode(resized_image_bytes).decode('utf-8')
    
    prompt = (
        "You are an assistant tasked with summarizing images. "
        "The image you're provided with is from a research paper. "
        "It can have graphs of various kinds, like heat-maps, histograms, scatter plots etc. "
        "Identify the type of graph, go through the X and Y coordinates and the labels and try to understand what is being plotted. "
        "From the metrics in the graph (i.e., the numerical values), try to identify some max and min values. "
        "Use the above data AND your understanding to present a 5-point summary IN DETAIL.\n\n"
        f"Extracted Text: {ocr_text}"
        f"Image: {encoded_image}"
    )
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant tasked with summarizing images. Provide a detailed description of the image."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.1,
        max_tokens=500,
    )
    summary = response.choices[0].message.content
    return summary


## 3.4 Generating and Storing Image Summaries

- This code generates summaries for all extracted images by calling the OpenAI GPT-4 API, saving each summary in a corresponding text file.
- It reads back the summaries, stores them in a list, and cleans up any residual logging information. This step ensures accurate and organized summaries, ready for further analysis or display.

In [None]:
from openai import OpenAI
OPENAI_API_TOKEN = "your-api-key"
client = OpenAI(api_key=OPENAI_API_TOKEN)
# Generate summaries for all images and store in respective text files
for img_path in os.listdir(IMG_DIR):
    if img_path.endswith(".png"):
        full_img_path = os.path.join(IMG_DIR, img_path)
        summary = generate_image_summary(full_img_path)
        
        # Store the summary in a text file
        base_name = os.path.splitext(img_path)[0]
        summary_path = os.path.join(IMG_DIR, f"{base_name}_summary.txt")
        with open(summary_path, "w") as summary_file:
            summary_file.write(summary)

In [45]:
# Read the summaries back from the text files
file_paths = glob.glob(os.path.expanduser(os.path.join(IMG_DIR, "*_summary.txt")))
# Store the content of each summary file in a list
img_summaries = []
for file_path in file_paths:
    with open(file_path, "r") as file:
        img_summaries.append(file.read())

# Clean up residual logging if needed
cleaned_img_summary = [
    s.split("clip_model_load: total allocated memory: 201.27 MB\n\n", 1)[1].strip()
    if "clip_model_load: total allocated memory" in s else s
    for s in img_summaries
]

print("Image summaries generated and cleaned.")

Image summaries generated and cleaned.


In [46]:
cleaned_img_summary[4]

'### Detailed Summary of the Image:\n\n1. **Type of Graph**: The image appears to be a complex visual representation from a research paper, likely containing multiple types of graphs such as scatter plots, histograms, and heat maps. The presence of various data points and color gradients suggests a detailed analysis of a specific dataset.\n\n2. **X and Y Coordinates and Labels**: The X and Y coordinates are not explicitly mentioned in the extracted text, but typical research graphs would have these axes labeled with relevant metrics such as time, frequency, or other measurable variables. The labels would provide context to the data being analyzed, such as "Time (s)" on the X-axis and "Frequency (Hz)" on the Y-axis for a scatter plot.\n\n3. **Understanding the Plotted Data**: The data plotted likely represents a detailed analysis of a specific phenomenon or experiment. For instance, a heat map might show the intensity of a particular variable across different conditions, while a scatter

## 4. Adding to vectorstore
- This section focuses on generating embeddings for different types of content (text, tables, and images) and storing them in a FAISS vector store.
- This enables efficient retrieval of multi-modal data based on similarity search.

### 4.1 Initializing OpenAI Embeddings and FAISS Index

In [48]:
# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
# Initialize FAISS index
dimension = 1536  # Dimension of the embeddings from text-embedding-ada-002
index = faiss.IndexFlatL2(dimension)

#### `Description:`
The embeddings are initialized using the text-embedding-ada-002 model from OpenAI, setting the dimension to 1536. A FAISS index is also created to store these embeddings, enabling efficient similarity search for multi-modal data.



### 4.2 Creating the Document Store and FAISS Vector Store

In [49]:
# Create the docstore
docstore = InMemoryDocstore({})

# Create the FAISS vector store
vectorstore = FAISS(
    embedding_function=embeddings.embed_query,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


#### `Description:` 
An in-memory document store is created using InMemoryDocstore, while the FAISS vector store is initialized to handle embedding storage, retrieval, and document management. The embeddings function is set to the embed_query function, which converts queries into vector embeddings for similarity search.

### 4.3 Adding Documents, Tables, and Images to Vector Store

In [50]:
# Helper function to add documents to the vectorstore
def add_documents_to_vectorstore(docs, doc_ids):
    documents = [
        LangchainDocument(page_content=doc, metadata={"doc_id": doc_ids[i]})
        for i, doc in enumerate(docs)
    ]
    vectorstore.add_documents(documents)


In [51]:
# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
add_documents_to_vectorstore(text_summaries, doc_ids)

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
add_documents_to_vectorstore(table_summaries, table_ids)

# Add images
img_ids = [str(uuid.uuid4()) for _ in cleaned_img_summary]
add_documents_to_vectorstore(cleaned_img_summary, img_ids)

print("Embeddings for text, tables, and images have been generated and stored in FAISS.")

Embeddings for text, tables, and images have been generated and stored in FAISS.


## 5. Creating the RAG Pipeline

- This section sets up a Retrieval-Augmented Generation (RAG) pipeline, which uses retrieved information from the vector store to generate specific answers to user queries using GPT-4.
- The pipeline integrates multiple components, including a prompt, retriever, and language model, to handle natural language queries and return accurate response

### 5.1 Defining the RAG Prompt and Model

In [52]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

In [59]:
# Define the RAG pipeline
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
Give specific answers.
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(temperature=0, model="gpt-4o",api_key=OPENAI_API_KEY, max_tokens=1500 )

#### `Description:` 
A prompt template is created for generating answers based only on the retrieved context, focusing on specificity. The ChatGPT-4 model is initialized with the template to generate responses. The model’s temperature is set to 0 for deterministic output.



### 5.2 Initializing the Retriever

In [60]:
# Define the retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

### `Description:` 
The retriever is set up to fetch the top 5 most similar results from the FAISS vector store using similarity search. This ensures that the most relevant content (text, tables, images) is provided to the RAG pipeline.

### 5.3 Building and Executing the RAG Pipeline

In [None]:
# Create the RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [61]:
chain.invoke("What is the value of CO2 emissions during pretraining for Llama-2 7B chat model")

'The CO2 emissions during pretraining for the Llama-2 models are estimated at 539 tCO2eq.'

In [64]:
chain.invoke("What is the value of GPU hours for pretraining for Llama-2 7B chat model? Give the specific number")

'The value of GPU hours for pretraining the Llama-2 7B chat model is 3,311,616 GPU hours.'

In [65]:
chain.invoke("Which model performed the best for 'Common sense reasoning' on the Overall performance on grouped benchmarks")

"The model that performed the best for 'Common sense reasoning' on the overall performance on grouped benchmarks is LAMA2 70B, with a score of 37.5."

### Optional - Not tested completely

In [None]:
%%writefile app.py

import streamlit as st
import openai
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Set OpenAI API key
OPENAI_API_KEY = "your-api-key"
openai.api_key = OPENAI_API_KEY

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# Initialize FAISS index
dimension = 1536  # Dimension of the embeddings from text-embedding-ada-002
index = faiss.IndexFlatL2(dimension)

# Create the docstore
docstore = InMemoryDocstore({})

# Create the FAISS vector store
vectorstore = FAISS(
    embedding_function=embeddings.embed_query,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}
)

# Define the retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Define the RAG pipeline
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Initialize the model (using GPT-4 in this case)
model = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)

# Create the RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Streamlit app
st.title("QA Bot using GPT-4 and FAISS")

# Input field for the user question
question = st.text_input("Enter your question:")

if question:
    # Ensure question is a string
    question = str(question)
    
    # Perform a search in the FAISS vector store
    try:
        search_results = retriever.get_relevant_documents(question)
    except Exception as e:
        st.error(f"Error in retrieving documents: {str(e)}")
    
    if search_results:
        # Combine the content of the top results to provide context for the LLM
        context = " ".join([result.page_content for result in search_results])
        
        # Ensure context is a string
        context = str(context)
        
        # Run the RAG pipeline
        try:
            response = chain.invoke({"context": context, "question": question})
            st.write("Answer:")
            st.write(response)
        except Exception as e:
            st.error(f"Error in RAG pipeline invocation: {str(e)}")
            st.write(f"Context: {context}")
            st.write(f"Question: {question}")
    else:
        st.error("No search results found.")