<a href="https://colab.research.google.com/github/malaya-01/Books/blob/main/opencv_with_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Colab Setup: Install necessary packages
!pip install pymupdf Pillow pytesseract python-dotenv langchain langchain-google-genai langchain-community faiss-cpu
!apt-get install tesseract-ocr

import fitz  # PyMuPDF
import io
import os
from PIL import Image, ImageFont, ImageDraw
import pytesseract
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_retrieval_chain

# Load environment variables
load_dotenv()

from langchain_core.messages import HumanMessage, AIMessage

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,861 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 121925 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-



In [None]:


# Initialize Tesseract: tell pytesseract which executable to run. The path
# below is expected to exist in Colab once tesseract-ocr has been installed.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

from google.colab import files

# Function to extract text from PDFs
def extract_text_from_pdfs(pdf_files):
    """Return the text of every PDF in *pdf_files* as one string.

    Args:
        pdf_files: Iterable of PDF locations accepted by ``fitz.open``.

    Returns:
        The text of each document (its pages concatenated in order), with
        consecutive documents separated by a blank line. An empty iterable
        produces an empty string.
    """
    per_document = []
    for path in pdf_files:
        with fitz.open(path) as document:
            per_document.append(''.join(page.get_text() for page in document))
    return '\n\n'.join(per_document)

# Function to convert text to image and extract using Tesseract
def text_to_image_and_extract(combined_text):
    """Render *combined_text* onto an image and read it back with Tesseract.

    Each line of the text is drawn in black with Pillow's default font on a
    white, 1000-pixel-wide canvas, one line every 20 pixels. The canvas is
    encoded as JPEG into memory and that JPEG is handed to pytesseract.

    Args:
        combined_text: The text to render; split into rows on ``'\\n'``.

    Returns:
        A ``(ocr_text, buffer)`` tuple: the string produced by
        ``pytesseract.image_to_string`` and the ``io.BytesIO`` holding the
        JPEG bytes.
    """
    line_height = 20
    margin = 10

    rows = combined_text.split('\n')
    # One extra row of height leaves room for the top margin.
    canvas = Image.new('RGB', (1000, line_height * (len(rows) + 1)), color=(255, 255, 255))
    pen = ImageDraw.Draw(canvas)
    default_font = ImageFont.load_default()
    for index, row in enumerate(rows):
        pen.text((margin, margin + index * line_height), row, font=default_font, fill=(0, 0, 0))
    del pen

    buffer = io.BytesIO()
    canvas.save(buffer, format="JPEG")
    buffer.seek(0)
    ocr_text = pytesseract.image_to_string(Image.open(buffer))
    return ocr_text, buffer

# Function to handle PDF/text file extraction and processing
def handle_pdf_text_extraction(pdf_files):
    """Extract text from PDFs, optionally save the result, and return it.

    The text of all PDFs is collected with ``extract_text_from_pdfs`` and
    passed through ``text_to_image_and_extract``. The user is then asked
    which output to write:

    * ``pdf`` - the rendered image is placed on a single page of
      ``output.pdf``;
    * ``txt`` - the OCR text is written to ``output.txt`` (UTF-8).

    The answer is matched case-insensitively and ignoring surrounding
    whitespace. Any other answer writes nothing and prints a notice.

    Args:
        pdf_files: Iterable of PDF locations to read.

    Returns:
        The OCR text, regardless of which output format was chosen.
    """
    combined_text = extract_text_from_pdfs(pdf_files)
    extracted_text, buffer = text_to_image_and_extract(combined_text)

    # Normalise the answer so inputs such as "PDF" or " txt " are accepted.
    output_format = input("Which format? ['pdf' or 'txt']: ").strip().lower()
    if output_format == 'pdf':
        img = fitz.Pixmap(buffer)
        # The context manager closes the new document once it has been saved.
        with fitz.open() as pdf_output:
            # Size the page from the image itself rather than re-deriving the
            # renderer's layout arithmetic here.
            page = pdf_output.new_page(width=img.width, height=img.height)
            page.insert_image(page.rect, pixmap=img)
            pdf_output.save('output.pdf')
        print("Saved extracted text to output.pdf")
    elif output_format == 'txt':
        with open('output.txt', 'w', encoding='utf-8') as f:
            f.write(extracted_text)
        print("Saved extracted text to output.txt")
    else:
        # Previously an unknown answer was ignored without any feedback.
        print(f"Unrecognized format {output_format!r}; nothing was saved.")

    return extracted_text

# Function to get documents from a local file
def get_documents_from_file(file_path):
    """Load a UTF-8 text file and split it into chunks.

    Args:
        file_path: Path of the text file to load with ``TextLoader``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded documents, using ``chunk_size=400`` and
        ``chunk_overlap=20``.
    """
    source_documents = TextLoader(file_path, encoding='utf-8').load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
    return chunker.split_documents(source_documents)

# Function to create vector database
def create_vector_db(docs):
    """Embed *docs* with Google Generative AI embeddings and index them in FAISS.

    The API key is taken from the ``GOOGLE_API_KEY`` environment variable
    when it is already set (for example via ``load_dotenv()``). Otherwise the
    user is prompted for it with ``getpass`` so the secret is not echoed into
    the notebook output. The key is stored back into ``GOOGLE_API_KEY`` so
    that later Google clients created without an explicit key can find it.

    Args:
        docs: Documents to embed and index.

    Returns:
        The FAISS vector store built from *docs*.

    Raises:
        ValueError: If no API key is available from the environment or the
            prompt.
    """
    import getpass  # local import: only needed for the interactive prompt

    api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
    if not api_key:
        # getpass hides the typed value, unlike input(), which would leave
        # the key visible in saved cell output.
        api_key = getpass.getpass("Please enter your Google API key: ").strip()
    if not api_key:
        raise ValueError("A Google API key is required to build the vector store.")

    os.environ["GOOGLE_API_KEY"] = api_key
    embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001', google_api_key=api_key)
    vector_store = FAISS.from_documents(docs, embedding=embeddings)
    return vector_store

# Function to create chat chain
def create_chain(vector_store):
    """Build the retrieval chain used to answer questions.

    A Gemini chat model (``gemini-1.5-flash``, temperature 0.4) is combined
    with a prompt that takes ``context``, ``chat_history`` and ``input``, and
    with a retriever over *vector_store* that is configured with ``k=1``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', temperature=0.4)

    qa_prompt = ChatPromptTemplate.from_template(
        """
        Answer the user's question:
        context: {context}
        chat_history: {chat_history}
        Question: {input}
        """
    )

    # Chain that fills the prompt with documents and parses the reply to str.
    answer_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=qa_prompt,
        output_parser=StrOutputParser(),
    )

    top_hit_retriever = vector_store.as_retriever(search_kwargs={"k": 1})
    return create_retrieval_chain(top_hit_retriever, answer_chain)

# Function to process chat
def process_chat(chain, question, chat_history):
    """Ask *chain* a question and return its answer.

    Args:
        chain: Object with an ``invoke`` method that accepts a dict and
            returns a mapping containing an ``'answer'`` entry.
        question: The user's question, passed under the ``"input"`` key.
        chat_history: Prior conversation, passed under ``"chat_history"``.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    payload = {"input": question, "chat_history": chat_history}
    result = chain.invoke(payload)
    return result['answer']

# Main function
if __name__ == '__main__':
    # Part 1: Handle PDF/Text extraction
    print("Please upload the required PDF files.")
    uploaded = files.upload()
    pdf_files = list(uploaded)
    if not pdf_files:
        # Without any file there is nothing to extract or chat about.
        raise SystemExit("No files were uploaded; nothing to process.")

    extracted_text = handle_pdf_text_extraction(pdf_files)

    # Save extracted text to a temporary file for document processing
    temp_file = 'temp.txt'
    with open(temp_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    # Part 2: Document processing and chat handling
    docs = get_documents_from_file(temp_file)
    if not docs:
        # Stop before asking for an API key when there is nothing to index.
        raise SystemExit("No text could be extracted from the uploaded files.")
    vector_store = create_vector_db(docs)
    chain = create_chain(vector_store)

    # Alternating HumanMessage / AIMessage entries, oldest first.
    chat_history = []

    while True:
        user_input = input("You: ").strip()
        # Type "exit" (any case, surrounding spaces ignored) to leave the chat.
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Do not spend a model call on an empty question.
            continue
        response = process_chat(chain, user_input, chat_history)
        chat_history.append(HumanMessage(content=user_input))
        chat_history.append(AIMessage(content=response))

        print("Assistant: ", response)


Please upload the required PDF files.


Saving solar (1).pdf to solar (1).pdf
Which format? ['pdf' or 'txt']: txt
Saved extracted text to output.txt
Please enter your Google API key: [REDACTED]
You: what is the solar system
Assistant:  The Solar System is a gravitationally bound system that includes the Sun and all the objects that orbit it, either directly or indirectly. The main objects in the Solar System are the eight planets, which orbit the Sun directly. 

These planets are divided into two groups:

* **Inner terrestrial planets:** Mercury, Venus, Earth, and Mars. These planets are smaller, rocky, and closer to the Sun.
* **Outer gas giants:** Jupiter, Saturn, Uranus, and Neptune. These planets are much larger, made mostly of gas, and further from the Sun. 

You: tell me more about the solar system
Assistant:  The Solar System is a fascinating place!  In addition to the eight planets, it also contains:

* **Dwarf planets:** Like Pluto, these are smaller than the main planets and haven't cle