In [30]:
import gdown

# Google Drive download link for the lecture-notes PDF
# (swap the value after "id=" for your own file ID to fetch a different file).
url = 'https://drive.google.com/uc?id=1yVbhJWh4L1unDbDT4APOusTXlwic7aE9'

# Local filename the download is stored under.
output = 'PA - Consolidated lecture notes.pdf'

# Fetch the file with progress reporting enabled; the call's return value
# is displayed as the cell result.
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1yVbhJWh4L1unDbDT4APOusTXlwic7aE9
To: d:\scaler\GenAI\Multimodal-RAG\notebook\PA - Consolidated lecture notes.pdf
100%|██████████| 4.28M/4.28M [00:01<00:00, 4.24MB/s]


'PA - Consolidated lecture notes.pdf'

In [31]:
import fitz  # PyMuPDF wrapper for simplicity
import os

def extract_images_from_pdf(pdf_path, output_folder):
    """Save every image embedded in a PDF as a separate file.

    Each image is written to ``output_folder`` as
    ``page_<page>_img_<index>.<ext>`` (page and index are 1-based, ``ext`` is
    the format reported by PyMuPDF for that image), and a progress line is
    printed per image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the image files; created
            (including parents) when it does not exist yet.

    Returns:
        None.
    """
    # The context manager closes the document even when extraction fails midway.
    with fitz.open(pdf_path) as pdf_document:
        # exist_ok avoids the race of a separate os.path.exists() check.
        os.makedirs(output_folder, exist_ok=True)

        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)

            # full=True returns one tuple per image; its first item is the xref
            # that extract_image() needs.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]  # raw bytes of the image
                image_ext = base_image["ext"]  # file format, e.g. "png"
                image_filename = f"page_{page_number + 1}_img_{img_index}.{image_ext}"
                image_filepath = os.path.join(output_folder, image_filename)

                with open(image_filepath, "wb") as image_file:
                    image_file.write(image_bytes)

                print(f"Extracted {image_filename}")

    print("Image extraction complete.")

# Example usage: write every embedded image of the lecture-notes PDF
# into the "extracted_images" folder.
pdf_path = "PA - Consolidated lecture notes.pdf"
output_folder = "extracted_images"
extract_images_from_pdf(pdf_path, output_folder)


Extracted page_4_img_1.png
Extracted page_5_img_1.png
Extracted page_6_img_1.png
Extracted page_6_img_2.png
Extracted page_10_img_1.png
Extracted page_11_img_1.png
Extracted page_14_img_1.png
Extracted page_16_img_1.png
Extracted page_17_img_1.png
Extracted page_19_img_1.png
Extracted page_19_img_2.png
Extracted page_20_img_1.png
Extracted page_24_img_1.png
Extracted page_25_img_1.png
Extracted page_25_img_2.png
Extracted page_25_img_3.png
Extracted page_27_img_1.png
Extracted page_29_img_1.png
Extracted page_30_img_1.png
Extracted page_32_img_1.png
Image extraction complete.


In [32]:
import base64

# Sample image (one of the files written to "extracted_images" by the extraction cell)
# used to try out the base64 encoder and the vision model.
IMAGE_PATH = "extracted_images/page_30_img_1.png"

def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, mode="rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Base64 text of the sample image; later cells print it and pass it to describe_image().
base64_image = encode_image(IMAGE_PATH)

In [33]:
# Show only a short preview: the complete base64 payload is very long and
# would flood (and bloat) the notebook output.
print(f"{base64_image[:80]}... ({len(base64_image)} characters)")

iVBORw0KGgoAAAANSUhEUgAAAlgAAAJtCAIAAABls5mVAAAACXBIWXMAAA7EAAAOxAGVKw4bAAITpUlEQVR4nOy9CaxVxbb3ixqxCccmKBoUDKJGUIOAAdTYBruAwlEUQ6NBgYhNFOFTmociT47yFCGAGEB9NHpE0A9FnjR+IhAQuAJyaS/toZEjoAdRLmz2dq1V78f8nzlu7bn2WmyavWl2/bOzsvZcNatGjaoa/zGqatas5AICAgICAiowKh1rAQICAgICAo4lAhEGBAQEBFRoBCIMCAgICKjQCEQYEBAQEFChEYgwICAgIKBCIxBhQEBAQECFRiDCgICAgIAKjUCEAQEBAQEVGoEIAwICAgIqNAIRBgQEBARUaAQiDAgICAio0AhEGBAQEBBQoRGIMCAgICCgQiMQYUBAQEBAhUYgwoCAgICACo1AhAEBAQEBFRqBCAMCAgICKjQCEQYEBAQEVGgEIgwICAgIqNAIRBgQEBAQUKERiDAgICAgoEIjEGFAQEBAQIVGIMKAgICAgAqNQIQBAQEBARUagQgDAgICAio0AhEGBAQEBFRoBCIMCAgICKjQCEQYEBAQEFChEYgwICAgIKBCIxBhxUImxrEWJCAgIOB4QSDCioVAhAEBAQEJBCKsQMjkwLGWKyAgIOBYIhDhSQ6f8NLpdCDCgICAgAQCEZ7wKJHJRHupP1NFRUWFhYX7I/BvOkYgwoCAgAAhEGHZAso5tkX84SHtIRDhEQKvIsTWAQEnB8qbCIlL7PvevXv1JTF9RxDDFz75qShCOQt5JFAFzSaqLmVqIiliz549u3bt2rBhw+rVqzdu3Lhjxw7M9O7du/l18uTJtWrVqlSpUsOGDceNG+di4gykmAsoYd++fc5rRIXUlkAuBVeytRd0GBBwIqJciXD79u2LFi1at27dkiVLMNmYG83X+UakoKCAlD/++OPatWuXLl3K5++//16eQh4JqM7mzZtXrlyJ2FRwy5Ytmo0s

In [34]:
from openai import OpenAI
import os
# Shared OpenAI client used by describe_image(); the API key is read from the
# "openai_apikey" environment variable (os.getenv yields None when it is unset).
client = OpenAI(api_key=os.getenv("openai_apikey"))


def describe_image(base64_image, mime_type="image/png", max_tokens=300):
    """
    Uses OpenAI's GPT-4o model to transcribe and summarise an image.

    The image is sent inline as a base64 data URL together with a system and a
    user prompt asking for a structure-preserving transcription of all text,
    followed by a brief summary.

    Args:
        base64_image: Image bytes encoded as a base64 string.
        mime_type: MIME type placed in the data URL. Defaults to "image/png";
            pass e.g. "image/jpeg" when the encoded image is not a PNG.
        max_tokens: Upper bound on the length of the model reply. Defaults to
            300; replies for text-heavy images can be cut off at this limit,
            so raise it when complete transcriptions are required.

    Returns:
        The message content of the first choice in the model response.
    """
    system_prompt = "Your job is to extract all the information from the images, includng the text. Extract all the text from the image without changing the order or structure of the information. recheck if all the text has been extracted correctly and return in the same presentation and structure as present in the original image. "
    user_prompt = "extract ALL the text from the image in the same structure as present in the image. and then after it summarise everything in brief, do not miss anything "

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            },
        ],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

# Smoke test: send the sample image to the model and print the transcription + summary it returns.
print(describe_image(base64_image))

How India pays?

Average Ticket Size of payment transactions analysed for December 2021

₹4122  
Credit cards

₹1804  
Credit cards

₹421  
Prepaid cards

₹2650  
UPI P2P

₹786  
UPI P2M

₹375  
Mobile Wallets

Value
- PPI M-wallet 18%
- PPI card 4%
- Debit card 14%
- Credit card 8%
- UPI P2M 56%

Volume
- PPI M-wallet 7%
- PPI card 2%
- Debit card 23%
- Credit card 28%
- UPI P2M 41%

**Summary:**
In December 2021, credit cards had the highest average ticket size at ₹4122, followed by UPI P2P at ₹2650. UPI P2M dominated the value and volume of transactions with 56% and 41% respectively. Other payment methods like prepaid cards, debit cards, and mobile wallets had lower average ticket sizes, with varying shares in value and volume.


In [35]:
import fitz  # PyMuPDF wrapper for simplicity
import os
def extract_images_and_text_from_pdf(pdf_path, output_folder):
    """Build one text corpus from a PDF's page text and model descriptions of its images.

    For every page the plain text is appended under a ``Page <n>:`` heading.
    Every embedded image is saved to ``output_folder`` as
    ``page_<page>_img_<index>.<ext>``, base64-encoded with ``encode_image`` and
    described with ``describe_image``; the description is appended under an
    ``[Image: <filename>]`` marker so the image can be traced back from the
    text later. A progress line is printed per image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the image files; created
            (including parents) when it does not exist yet.

    Returns:
        The combined text of all pages and image descriptions as one string.
    """
    # Collect the pieces in a list and join once instead of growing a string with +=.
    text_parts = []

    # The context manager closes the document even when processing fails midway.
    with fitz.open(pdf_path) as pdf_document:
        # exist_ok avoids the race of a separate os.path.exists() check.
        os.makedirs(output_folder, exist_ok=True)

        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            text_parts.append(f"\n\nPage {page_number + 1}:\n{page.get_text()}")

            # full=True returns one tuple per image; its first item is the xref
            # that extract_image() needs.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = f"page_{page_number + 1}_img_{img_index}.{image_ext}"
                image_filepath = os.path.join(output_folder, image_filename)

                # Save the image to the output folder
                with open(image_filepath, "wb") as image_file:
                    image_file.write(image_bytes)

                # Encode the saved file and let the model transcribe/summarise it
                base64_image = encode_image(image_filepath)
                image_description = describe_image(base64_image)

                # The "[Image: ...]" marker ties the description to the saved file
                text_parts.append(f"\n\n[Image: {image_filename}]\n{image_description}")

                print(f"Processed {image_filename} on page {page_number + 1}")

    print("Processing complete.")

    return "".join(text_parts)

# Example usage: process the lecture notes, storing the images in "extracted_images_new"
pdf_path = "PA - Consolidated lecture notes.pdf"
output_folder = "extracted_images_new"
combined_text = extract_images_and_text_from_pdf(pdf_path, output_folder)

# Persist the combined text; the following cells load this file again.
# The text contains non-ASCII characters (e.g. "₹", "●"), so the encoding is
# set explicitly instead of relying on the platform default.
with open("combined_text.txt", "w", encoding="utf-8") as text_file:
    text_file.write(combined_text)

Processed page_4_img_1.png on page 4
Processed page_5_img_1.png on page 5
Processed page_6_img_1.png on page 6
Processed page_6_img_2.png on page 6
Processed page_10_img_1.png on page 10
Processed page_11_img_1.png on page 11
Processed page_14_img_1.png on page 14
Processed page_16_img_1.png on page 16
Processed page_17_img_1.png on page 17
Processed page_19_img_1.png on page 19
Processed page_19_img_2.png on page 19
Processed page_20_img_1.png on page 20
Processed page_24_img_1.png on page 24
Processed page_25_img_1.png on page 25
Processed page_25_img_2.png on page 25
Processed page_25_img_3.png on page 25
Processed page_27_img_1.png on page 27
Processed page_29_img_1.png on page 29
Processed page_30_img_1.png on page 30
Processed page_32_img_1.png on page 32
Processing complete.


In [36]:
from langchain_community.document_loaders import TextLoader

# Read the combined text as UTF-8 rather than with the platform default encoding
# (the text contains non-ASCII characters). If UTF-8 decoding fails,
# autodetect_encoding lets the loader try to detect the encoding before giving up.
loaders = TextLoader("combined_text.txt", encoding="utf-8", autodetect_encoding=True)

In [37]:
import pprint
# Load the text file into LangChain Document objects and pretty-print them.
# NOTE(review): this prints the complete document text; consider showing only a slice.
pprint.pprint(loaders.load())


[Document(metadata={'source': 'combined_text.txt'}, page_content='\n\nPage 1:\nProduct Sense -\nHow to tackle product strategy and business acumen rounds in\ninterviews?\nLecture Objective:\nHow to address business acumen questions round :\n●\nAnalyzing a metric change. What’s a metric?\n●\nDefining metrics to measure performance / success of a new feature / product.\nImportance of product strategy & business acumen rounds -\n●\nApart from building models, developing dashboards and reporting frameworks -\nOne of the main responsibilities of a data scientist is to extract insights from\ndata and work with product managers and engineering teams to deliver\nactionable plans to improve the product.\n●\nProduct sense is about understanding all possibilities, not finding one correct\nanswer.\nExample Questions: Product Acumen / Business Acumen\n●\nWhy did Youtube’s traffic drop by 5%?\n●\nHow would you measure the success of the “Save Post” feature on Facebook?\n●\nWhat metrics would you def

In [38]:
# Raw text of the first loaded Document (the file is read again by this load() call).
loaders.load()[0].page_content

'\n\nPage 1:\nProduct Sense -\nHow to tackle product strategy and business acumen rounds in\ninterviews?\nLecture Objective:\nHow to address business acumen questions round :\n●\nAnalyzing a metric change. What’s a metric?\n●\nDefining metrics to measure performance / success of a new feature / product.\nImportance of product strategy & business acumen rounds -\n●\nApart from building models, developing dashboards and reporting frameworks -\nOne of the main responsibilities of a data scientist is to extract insights from\ndata and work with product managers and engineering teams to deliver\nactionable plans to improve the product.\n●\nProduct sense is about understanding all possibilities, not finding one correct\nanswer.\nExample Questions: Product Acumen / Business Acumen\n●\nWhy did Youtube’s traffic drop by 5%?\n●\nHow would you measure the success of the “Save Post” feature on Facebook?\n●\nWhat metrics would you define to measure the health of the product search in\nAmazon?\n●\nW

In [39]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: break on blank lines first, then on single newlines,
# targeting chunks of 500 characters with a 60-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=500,
    chunk_overlap=60,
)

In [40]:
# Load the combined text again and cut it into chunks with the splitter configured above.
splits = text_splitter.split_documents(loaders.load())

In [41]:
# Number of chunks produced by the splitter.
len(splits)

131

In [42]:
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken
# OpenAI embedding model used to vectorise the chunks; the API key is read from
# the "openai_apikey" environment variable.
embedding = OpenAIEmbeddings(api_key=os.getenv("openai_apikey"))

In [43]:

from langchain_community.vectorstores import FAISS

# Embed every chunk and build a FAISS vector index over the embeddings.
db = FAISS.from_documents(splits, embedding)
# Number of vectors stored in the index.
print(db.index.ntotal)

131


In [44]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


# Chat model that answers the questions in the QA chain (gpt-4o-mini, temperature 0);
# the API key is read from the "openai_apikey" environment variable.
llm = ChatOpenAI(api_key=os.getenv("openai_apikey"), model_name='gpt-4o-mini', temperature=0)

In [45]:
from langchain.prompts import PromptTemplate

# Build the prompt for the QA chain. The template has two placeholders:
# {context} and {question}; it tells the model to admit when the answer is not
# in the context instead of inventing one.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer and dont find it in the given context, just say that you don't know , don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [46]:
# Build the retrieval QA chain (it is executed in the cells below): chunks are
# retrieved from the FAISS index and passed to the LLM through QA_CHAIN_PROMPT.
# return_source_documents=True makes the result include "source_documents",
# which later cells scan for image markers.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [47]:
# Calling the chain object directly is deprecated in LangChain; invoke() is the
# supported entry point and yields the same result dictionary.
result = qa_chain.invoke({"query": "average size of payment transactions analysed for december 2021"})
print(result['result'])

The average ticket sizes of payment transactions analyzed for December 2021 are as follows:

- Credit cards: ₹4122 and ₹1804
- Prepaid cards: ₹421
- UPI P2P: ₹2650
- UPI P2M: ₹786
- Mobile wallets: ₹375


In [None]:
import re
import os
import cv2
import matplotlib.pyplot as plt


def extract_image_references(text):
    """Return the filenames named by ``[Image: <name>]`` markers in ``text``, in order of appearance."""
    marker = re.compile(r"\[Image:\s*(.*?)\]")
    return [found.group(1) for found in marker.finditer(text)]

# Function to display an image using Matplotlib
def display_image(image_path):
    """Show the image stored at ``image_path`` with Matplotlib.

    Prints a message instead of raising when the file does not exist or when
    OpenCV cannot decode it.

    Args:
        image_path: Path of the image file to display.

    Returns:
        None.
    """
    if not os.path.exists(image_path):
        print(f"Image file {image_path} not found.")
        return

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable or unsupported file by returning
        # None instead of raising; cvtColor would then fail with an obscure error.
        print(f"Image file {image_path} could not be read.")
        return

    # OpenCV loads pixels in BGR order, Matplotlib expects RGB.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image_rgb)
    plt.axis('off')  # Hide axis
    plt.show()

# Example usage: look for "[Image: ...]" markers in the first source document
# returned for the last query.
image_references = extract_image_references(result["source_documents"][0].page_content)
print("Extracted image references:", image_references)

# Folder the image files were written to by extract_images_and_text_from_pdf()
image_folder = "extracted_images_new"

# Display each referenced image
for image_file in image_references:
    image_path = os.path.join(image_folder, image_file)
    print(f"Displaying {image_file}...")
    display_image(image_path)


In [None]:
# Calling the chain object directly is deprecated in LangChain; invoke() is the
# supported entry point and yields the same result dictionary.
result = qa_chain.invoke({"query": "explain product metric pyramid"})
print(result['result'])

# Join the text of all retrieved chunks so image markers from any of them are found.
ret_text = "".join(doc.page_content for doc in result["source_documents"])

image_references = extract_image_references(ret_text)
print("Extracted image references:", image_references)

# image_folder is the module-level folder name assigned in the preceding cell.
for image_file in image_references:
    image_path = os.path.join(image_folder, image_file)
    print(f"Displaying {image_file}...")
    display_image(image_path)