<a href="https://colab.research.google.com/github/manrajc13/Byte_chase/blob/main/FlaskApp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PDF PARSING**



In [1]:
!pip install huggingface-hub
!pip install transformers
!pip install torch
!pip install requests
!pip install pypdf paddlepaddle Pymupdf paddleocr langchain_community

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting Pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting paddleocr
  Downloading paddleocr-2.9.1-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.

In [2]:
import os
import cv2
import numpy as np
import pymupdf as fitz  # PyMuPDF for rendering PDF pages as images
from paddleocr import PaddleOCR
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

In [3]:
# Initializing PaddleOCR once at module level; ocr_image() below reuses this single engine.
# use_angle_cls=True pairs with the cls=True argument passed to ocr.ocr(); lang='en' selects English.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Function to preprocess (binarize) a page image before OCR, for PDF pages rendered as images
def preprocess_image(image_path):
    """Load an image as grayscale and binarize it with adaptive thresholding.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded image produced by ``cv2.adaptiveThreshold``
        (Gaussian method, binary threshold, block size 11, constant 2).

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None rather than raising; stop here
    # with a clear message instead of a cryptic error inside adaptiveThreshold.
    if image is None:
        raise ValueError(f"Could not read image for OCR preprocessing: {image_path}")

    return cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

# Function to extract text from an image using PaddleOCR
def ocr_image(image_path):
    """Run OCR on an image file and return the recognized text.

    The image is first binarized with ``preprocess_image`` and written next to
    the original as ``<name>_processed<ext>``; PaddleOCR then reads that file.

    Args:
        image_path: Path of the image to OCR.

    Returns:
        str: The recognized text lines joined by single spaces, or ``""`` when
        nothing was detected.
    """
    # Preprocess the image
    processed_image = preprocess_image(image_path)

    # Build the output name from the real extension. A plain str.replace(".png", ...)
    # would also rewrite ".png" inside directory names and, for non-PNG inputs,
    # leave the path unchanged so the original file would be overwritten.
    root, ext = os.path.splitext(image_path)
    processed_image_path = f"{root}_processed{ext or '.png'}"
    cv2.imwrite(processed_image_path, processed_image)

    # Perform OCR
    ocr_results = ocr.ocr(processed_image_path, cls=True)

    # The result may be empty/None, or hold None for a page without detections.
    if not ocr_results or not ocr_results[0]:
        return ""

    # Each detected line is indexed as line[1][0] to obtain its text.
    return " ".join(line[1][0] for line in ocr_results[0])

# Main function for file processing
def file_processing(file, output_dir):
    """Extract the text of a PDF, using OCR for pages without extractable text.

    Args:
        file: Path of the PDF file.
        output_dir: Directory where rendered page images are written
            (``page_<n>.png``); created if it does not exist.

    Returns:
        str: The text of every page in document order, each page followed by
        a newline.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Use PyPDFLoader to load text content. load() is used instead of
    # load_and_split() because the latter re-chunks the documents, so its list
    # positions no longer correspond to PDF page numbers.
    loader = PyPDFLoader(file)
    # PyPDFLoader is expected to tag each Document with a 0-based "page"
    # metadata entry; fall back to the list position if it is absent.
    text_by_page = {
        page_doc.metadata.get("page", position): page_doc.page_content
        for position, page_doc in enumerate(loader.load())
    }

    page_texts = []

    # Open the PDF with PyMuPDF; the context manager closes it even on errors.
    with fitz.open(file) as doc:
        for idx, page in enumerate(doc):
            page_content = text_by_page.get(idx, "")

            # No text layer on this page: render it to an image and OCR it.
            if not page_content.strip():
                output_image_path = os.path.join(output_dir, f"page_{idx + 1}.png")
                pix = page.get_pixmap()
                pix.save(output_image_path)
                page_content = ocr_image(output_image_path)

            page_texts.append(page_content)

    return "".join(page_text + "\n" for page_text in page_texts)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:08<00:00, 451.44it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:17<00:00, 562.94it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:15<00:00, 139.37it/s]

[2025/01/10 13:06:27] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




In [147]:
!pip install groq

Collecting groq
  Downloading groq-0.14.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.14.0-py3-none-any.whl (109 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/109.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m102.4/109.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.14.0


In [198]:
from groq import Groq
from google.colab import userdata
# Read the Groq API key from Colab's user secrets so it is not hardcoded in the notebook.
groq_api_key = userdata.get('groq_api_key')
client = Groq(
    api_key=groq_api_key,
) # shared Groq client used by generate_cheat_sheet() and generate_mcq()

In [162]:
def get_messages(text):
  """Build the chat messages used to request a cheat sheet.

  Args:
  - text: the source text to be turned into a cheat sheet

  Returns:
  - List of two message dicts: the system instructions followed by the user
    request that embeds `text`
  """
  cheat_sheet_instructions = """
 You are a skilled assistant specializing in generating unified, complete, and structured cheat sheets from multiple sources. Your task is to take the provided content and create a comprehensive, detailed, and brief cheat sheet following the criteria and formatting guidelines below:
 Additionally, please do not use any pefixes such as greetings, disclaimers, or setup text and suffixes such as closing statements, redundant information, or reminders.
  Focus Areas:
    1) Fundamental Concepts and Overview:
      * Summarize the core ideas and principles of the topic.
      * Provide a clear, concise introduction to set the context.

    2) Key Mathematical Concepts:
      * Include any relevant mathematical ideas or theories
      * If no mathematical concepts are present, skip this section and don't even mention it.

    3) Code snippets and Explanation:
      * If the topic involves programming, include essential code examples.
      * Provide a brief explanation of what the code does and its significance.
      * You may use your own domain knowledge to generate code of the topics given.
      * Skip this section if no code is relevant.

    4) Critical Defintions:
      * Include any terms or concepts that are vital to understanding this topic
      * Ensure definitons are concise and easy to understand.

    5) Important Formulas and Equations:
      * List key formulas and equations.
      * Use proper mathematical notation for clarity.
      * Id no such thing is present, skip this section and don't even mention it.

    6) Algorithms and Key Steps:
      * Describe any relevant algorithms.
      * Focus on outlining their key steps.

  Formatting Guidelines:
    1) Heading:
      * Heading of the cheatsheet should be bold and large.

    2) Additional Text:
      * Apart from the formatted cheat sheet you must not write any other text such as "Here is the cheat sheet" or that "Note: The cheat sheet is within the 500-word limit and has a clear, organized structure with concise explanations and examples."

    3) Organized Sections:
      * Use clear headings
      * Each section should have a logical flow.

    4) Mathematical Notation:
      * Use proper Markdown syntax for formulas (e.g., $E = mc^2$ for inline equations).

    5) Conciseness and Clarity:
      * Keep explanations brief but thorough.
      * Avoid unnecessary repitition.

  Output Requirements:
    1) Write the cheat sheet in markdown format.
    2) Ensure the final output should be about 800 words.
  """
  summary_request = "Please summarize the following text:\n{}".format(text)

  # System message first, then the user message, as the chat API expects.
  return [
      {"role": "system", "content": cheat_sheet_instructions},
      {"role": "user", "content": summary_request},
  ]


def generate_cheat_sheet(text):
  """Generate a cheat sheet for `text` with the Groq chat model.

  Args:
  - text: the source text to summarize

  Returns:
  - The content of the first choice in the model's response
  """
  completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=get_messages(text),
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

**MCQ Generation**

In [175]:
!pip install keybert # installing keybert library for keyword extraction from cheat sheet

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


In [179]:
# importing libraries for sent tokenize and keyword extraction
from keybert import KeyBERT
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [180]:
# to perform sentence tokenize (dividing generated cheat sheet into passages) and perform keyword extraction

'''
  Extracted keywords are used to prompt the model to generate topic-specific questions, so that it does not deviate from the concepts covered in the cheat sheet

'''

def _get_keybert_model():
  """Return a shared KeyBERT instance, creating it on first use."""
  model = getattr(_get_keybert_model, "_model", None)
  if model is None:
    model = KeyBERT()
    _get_keybert_model._model = model
  return model


def extract_keywords_keybert(text, top_n = 20):
  """
  Extracts keywords from a text using KeyBERT

  Args:
  - text: str, the input text
  - top_n: int, the number of keywords to extract

  Returns:
  - List of keywords (empty list for empty or whitespace-only text)
  """
  # Nothing to extract from blank input; skip building the model altogether.
  if not text or not text.strip():
    return []

  # Reuse one KeyBERT instance: this function is called once per passage, and
  # constructing a new model for every call repeats the setup work each time.
  kw_model = _get_keybert_model()
  keywords = kw_model.extract_keywords(text, keyphrase_ngram_range = (1, 2), stop_words = 'english', top_n = top_n) # here (1, 2) means single words and pair of words

  # Each result is indexed at [0] for the keyword; the rest of the entry is dropped.
  return [keyword[0] for keyword in keywords]


def get_passages(text):
  """
  Splits a text into passages of roughly one tenth of its total word count

  Sentences are accumulated into the current passage while its word count
  stays below the limit; the sentence that would reach the limit starts a new
  passage.

  Args:
  - text: str, the input text

  Returns:
  - List of non-empty passages (empty list for empty text)
  """
  # split text into sentences
  sentences = sent_tokenize(text)

  total_words = sum(len(sentence.split()) for sentence in sentences)
  word_limit = total_words // 10  # a passage holds fewer words than 1/10th of the text

  # combine sentences into passages
  passages = []
  current_sentences = []
  current_word_count = 0

  def flush():
    # Never emit an empty passage. Previously, when the first sentence already
    # met the limit (or the text had fewer than 10 words), "" was appended.
    passage = " ".join(current_sentences).strip()
    if passage:
      passages.append(passage)

  for sentence in sentences:
    sentence_word_count = len(sentence.split())
    if current_word_count + sentence_word_count < word_limit:
      current_sentences.append(sentence)
      current_word_count += sentence_word_count
    else:
      flush()
      current_sentences = [sentence]
      current_word_count = sentence_word_count
  flush()

  return passages


In [197]:
# generating questions
def get_message2(keywords, passage):
  """Build the chat messages used to request one multiple choice question.

  Args:
  - keywords: keywords the question should be about (interpolated into the prompt)
  - passage: context passage the model may refer to

  Returns:
  - List of two message dicts: the system prompt (task rules, output format and
    one worked example) followed by the user prompt with `keywords` and `passage`
  """
  system_prompt = """
    You are a helpful assistant. You have deeo knowledge of computer science topics and are able to generate multiple choice questions based on a list of keywords and a given passage.
    Use your own domain knowledge of the subject on the list of keywords to generate only one single multiple choice question. The question should:
    - Be informative, testing foundational and deep knowledge
    - Include clear and concise questions with four options (a, b, c, d).
    - Ensure the questions are challenging and diverse in their difficulty.
  """

  system_prompt += """
      Additionally, the questions generated should adhere to the following format which is a python dictionary
      {"ques": "Question generated", "a": "option a", "b": "option b", "c": "option c", "d": "option d", "ans": "correct option"}
      Apart from this you need not write anything before or after it.
  """
  # The worked example must itself be a well-formed dictionary (note the comma
  # after option "c"); the model's reply is parsed as a dictionary afterwards,
  # so a malformed example invites malformed output.
  system_prompt += """
    For example, you may refer to the following:
    {"ques":"Why does Dijkstra's algorithm fail to work correctly with negative edge weights?", "a":"Dijkstra's algorithm is designed to find the shortest path in unweighted graphs only.",
    "b":"Negative edge weights can cause the algorithm to update distances incorrectly, leading to incorrect shortest paths.", "c":"The algorithm's priority queue cannot handle negative values.",
    "d":"Negative edge weights result in an infinite loop due to cyclic paths", "ans":"b"}
  """

  user_prompt = f"""
  Please generate the a multiple choice question (MCQ) with the help of the keywords {keywords} and you may refer to the following passage for context \n {passage}

  """
  messages = [
      {"role":"system", "content":system_prompt},
      {"role":"user", "content":user_prompt}
  ]
  return messages


def generate_mcq(keywords, passage):
  """Request a single multiple choice question from the Groq chat model.

  Args:
  - keywords: keywords the question should focus on
  - passage: context passage for the question

  Returns:
  - The content of the first choice in the model's response
  """
  # using llama 3.3 70B model's inference to generate mcq
  completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=get_message2(keywords, passage),
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

def get_all_ques(input_text):
  """Generate one question per passage of `input_text`.

  Args:
  - input_text: text that is split into passages with get_passages

  Returns:
  - List with the generate_mcq result for each passage, in passage order
  """
  # For every passage: extract its keywords first, then ask for a question.
  return [
      generate_mcq(extract_keywords_keybert(passage), passage)
      for passage in get_passages(input_text)
  ]

def Eval(text):  # function to convert string into a valid dictionary
  """Extract and parse the dictionary contained in a model reply.

  The span from the first "{" to the last "}" is parsed, so greetings or
  remarks before and after the dictionary are ignored.

  Args:
  - text: str, the raw model output

  Returns:
  - dict parsed from the reply

  Raises:
  - ValueError: if no dictionary is found or it cannot be parsed
  """
  import ast
  import json

  start = text.find("{")
  end = text.rfind("}")
  if start == -1 or end < start:
    raise ValueError("No dictionary found in model output")
  candidate = text[start:end + 1]

  # SECURITY: the text comes from an LLM and must never go through eval(),
  # which would execute arbitrary expressions. literal_eval only accepts
  # Python literals; json.loads covers JSON-style replies (true/false/null).
  try:
    parsed = ast.literal_eval(candidate)
  except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
    try:
      parsed = json.loads(candidate)
    except ValueError as exc:
      raise ValueError(f"Could not parse dictionary from model output: {exc}") from exc

  if not isinstance(parsed, dict):
    raise ValueError("Model output did not contain a dictionary")
  return parsed


def get_list_of_ques(questions):
  """Convert raw question strings into dictionaries.

  Args:
  - questions: iterable of raw model outputs, one per question

  Returns:
  - List with the Eval result of every entry, truncated to the first 10
  """
  parsed_questions = [Eval(raw_question) for raw_question in questions]
  return parsed_questions[:10]

Flask App

In [188]:
!pip install flask flask-ngrok requests pyngrok flask_cors



In [189]:
# importing necessary libraries
from flask import Flask, request, jsonify
from pyngrok import ngrok
import requests

In [190]:
# Read the ngrok auth token from Colab's user secrets so it is not hardcoded in the notebook.
auth_token = userdata.get('ngrok_authToken')
ngrok.set_auth_token(auth_token) # register the token with pyngrok for the tunnel opened in the final cell

In [196]:
from flask_cors import CORS

app = Flask(__name__) # instantiating flask app

# Enable CORS on the app. No origins are specified here, so nothing in this
# call restricts which sites may call the API. NOTE(review): confirm that is intended.
CORS(app)
# Ensure the static directory exists (created in the current working directory)
os.makedirs('static', exist_ok=True)



@app.route('/', methods=['POST']) # post request
def process_file():
  """Download a PDF, then return its cheat sheet and questions as JSON.

  Expects a JSON body of the form {"fileurl": "<url of the pdf>"}.

  Returns:
  - 200 with {"cheat_sheet": ..., "questions": [...]} on success
  - 400 with {"error": ...} when no file URL is provided
  - 500 with {"error": ...} when downloading or processing fails
  """
  import tempfile

  file_name = None
  try:
      # silent=True yields None instead of raising on a missing/invalid JSON
      # body, so such requests get the 400 below rather than a 500.
      data = request.get_json(silent=True)
      if not isinstance(data, dict):
          data = {}
      file_url = data.get("fileurl")
      if not file_url:
          return jsonify({"error": "No file URL provided"}), 400

      # Download the file.
      # NOTE(security): file_url is supplied by the caller, so the server fetches
      # an arbitrary URL (SSRF risk); restrict allowed hosts if this is exposed.
      response = requests.get(file_url, timeout=60)
      response.raise_for_status()  # do not treat an HTTP error page as a PDF

      # A unique temporary file keeps concurrent requests from overwriting
      # each other's download.
      with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as file:
          file.write(response.content)
          file_name = file.name

      # Process the file
      output_dir = r"/content/sample_data/OutputPDF"
      pdf_text = file_processing(file_name,output_dir) # pdf parsed-text

      # Generate cheat sheet
      cheat_sheet = generate_cheat_sheet(pdf_text)
      questions = get_all_ques(cheat_sheet) # get all questions
      ques = get_list_of_ques(questions) # get list of dictionaries
      return jsonify({"cheat_sheet": cheat_sheet, "questions": ques}),200 # return json object to frontend
  except Exception as e:
      return jsonify({"error": str(e)}), 500
  finally:
      # Remove the downloaded PDF whether or not processing succeeded.
      if file_name and os.path.exists(file_name):
          os.remove(file_name)




if __name__ == '__main__':

  # Expose local port 5000 through an ngrok tunnel and show its public address
  public_url = ngrok.connect(5000)
  print("Public URL: {}".format(public_url))

  # Start the Flask server on the same port the tunnel forwards to
  app.run(port=5000)


Public URL: NgrokTunnel: "https://04ca-35-185-21-162.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
