In [2]:
!pip install easyocr pdf2image sentence-transformers faiss-cpu transformers




In [8]:
!pip install PyMuPDF
!pip install easyocr
!pip install numpy
!pip install sentence-transformers
!pip install transformers



Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.4 PyMuPDFb-1.24.3


In [18]:
import easyocr
import numpy as np
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
import json
import fitz  # PyMuPDF

# Step 1: Extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF as a single string.

    The document is opened with PyMuPDF (``fitz``) and the result of
    ``get_text()`` for each page is concatenated in page order. No separator
    is inserted between pages.

    Args:
        pdf_file_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        str: The concatenated page texts; an empty string when the document
        reports zero pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_file_path) as doc:
        page_texts = [
            doc.load_page(index).get_text()
            for index in range(doc.page_count)
        ]
    return "".join(page_texts)

# Step 2: Convert PDF pages to images and extract text using EasyOCR
def extract_text_from_pdf_with_easyocr(pdf_file_path, dpi=300, languages=("en",)):
    """OCR every page of a PDF with EasyOCR and return the recognised text.

    The PDF is rasterised to a list of page images with
    ``pdf2image.convert_from_path``, each image is converted to a NumPy array
    and passed to an EasyOCR reader, and the recognised fragments are stitched
    together.

    Output format (unchanged from the original implementation): every
    recognised fragment is prefixed with a single space, and the per-page
    strings are then joined with a single space. A page with no detections
    contributes an empty string.

    Args:
        pdf_file_path: Location of the PDF, passed unchanged to
            ``convert_from_path``.
        dpi: Resolution used when rendering the pages. Defaults to 300, the
            value that was previously hard-coded.
        languages: Language code(s) handed to ``easyocr.Reader``. Either an
            iterable of codes or a single code string. Defaults to English
            only, as before.

    Returns:
        str: The recognised text of all pages.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(languages, str):
        languages = [languages]

    # All pages are rendered up front; convert_from_path returns them as a list.
    pages = convert_from_path(pdf_file_path, dpi=dpi)

    # A new reader is built on every call, exactly as the original did.
    reader = easyocr.Reader(list(languages))

    extracted_text = []
    for page in pages:
        # EasyOCR is given a NumPy array rather than the PIL image.
        image = np.array(page)
        results = reader.readtext(image)
        # Index 1 of each result holds the recognised string.
        extracted_text.append("".join(" " + result[1] for result in results))

    return " ".join(extracted_text)

# Step 3: Embed the text using a local transformer model
def embed_text(text, chunk_size=1000, model_name='all-MiniLM-L6-v2'):
    """Split text into fixed-width chunks and embed each chunk.

    The text is cut into consecutive slices of ``chunk_size`` characters (the
    last slice may be shorter). Slicing is purely positional, so a chunk
    boundary can fall in the middle of a word. The chunks are then encoded
    with a ``SentenceTransformer`` model that is loaded on every call.

    Args:
        text: The string to embed.
        chunk_size: Number of characters per chunk. Defaults to 1000, the
            value that was previously hard-coded.
        model_name: Model identifier passed to ``SentenceTransformer``.
            Defaults to ``'all-MiniLM-L6-v2'``, as before.

    Returns:
        Whatever ``model.encode`` returns for the list of chunks, one
        embedding per chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text, so reject it alongside zero.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    model = SentenceTransformer(model_name)

    chunks = [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

    return model.encode(chunks)

# Step 4: Use a local language model for the QA task
def answer_questions(queries, text_embeddings):
    """Answer each query with an extractive question-answering pipeline.

    Important: despite its name, ``text_embeddings`` is used as the ``context``
    argument of the QA pipeline and therefore must be the document's plain
    text (a ``str``), not the vectors produced by ``embed_text``. The
    parameter name is kept for backward compatibility.

    Args:
        queries: Iterable of question strings.
        text_embeddings: The plain-text context in which answers are looked up.

    Returns:
        list: The ``'answer'`` field of the pipeline result for each query, in
        the same order as ``queries``.

    Raises:
        TypeError: If ``text_embeddings`` is not a string.
    """
    # Fail fast, before any model is loaded, with a message naming the cause.
    if not isinstance(text_embeddings, str):
        raise TypeError(
            "text_embeddings must be the plain-text context (str), got "
            + type(text_embeddings).__name__
        )

    queries = list(queries)
    if not queries:
        # Nothing to answer, so skip loading the model.
        return []

    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    return [
        qa_pipeline(question=query, context=text_embeddings)['answer']
        for query in queries
    ]

# Define the path to your PDF file
# NOTE(review): absolute path under /content — presumably a Colab upload
# location; adjust when running elsewhere.
pdf_file_path = '/content/2012.07634.pdf'

# Extract text from the PDF using PyMuPDF
pdf_text = extract_text_from_pdf(pdf_file_path)

# Alternatively, you can extract text using EasyOCR
# pdf_text = extract_text_from_pdf_with_easyocr(pdf_file_path)

# Embed the text using SentenceTransformer
# Note: text_embeddings is not read again in this cell; the QA call below is
# given pdf_text (the raw text) instead.
text_embeddings = embed_text(pdf_text)

# Define the queries
queries = [
    "how many blocks does the FFNN model consists of?",
    "The github repoisitory is licensed under?"

]

# Answer the questions; the second argument is the extracted text, which
# answer_questions passes to the QA pipeline as its context.
responses = answer_questions(queries, pdf_text)

# Print each question followed by its answer
for query, response in zip(queries, responses):
    print(f"Q: {query}\nA: {response}\n")


Q: how many blocks does the FFNN model consists of?
A: three

Q: The github repoisitory is licensed under?
A: Apache 2.0

