<a href="https://colab.research.google.com/github/manrajc13/Byte_chase/blob/main/FlaskApp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the libraries needed for PDF parsing and for creating the Flask app.

In [None]:
!pip install pypdf paddlepaddle Pymupdf paddleocr langchain_community flask flask-ngrok requests pyngrok flask_cors

Collecting flask_cors
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: flask_cors
Successfully installed flask_cors-5.0.0


**Loading the Unsloth Llama 3.1-8B-Instruct model (4-bit quantized) for cheat sheet and question generation**

In [None]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
# Sequence-length limit handed to the loader below.
max_seq_length = 4096
# NOTE(review): None presumably lets Unsloth choose the dtype for the current GPU — confirm in the Unsloth docs.
dtype = None
load_in_4bit = True # Load the weights with 4-bit quantization to reduce memory usage

# Create the model and its tokenizer; both are reused by the generation functions in later cells.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # instruction-tuned Llama 3.1 8B checkpoint (bnb 4-bit, per its name)
    max_seq_length = max_seq_length, # NOTE(review): presumably the maximum context length in tokens — confirm
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.1: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [None]:
# Put the loaded model into Unsloth's inference mode (per the method name) before any generate() call.
FastLanguageModel.for_inference(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSN

Import the libraries needed for PDF parsing and OCR.

In [None]:
import os
import cv2
import numpy as np
import pymupdf as fitz  # PyMuPDF for rendering PDF pages as images
from paddleocr import PaddleOCR
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

PDF Parsing

In [None]:
# Shared PaddleOCR engine used by ocr_image(): English language models with
# the text-angle classifier enabled (use_angle_cls=True).
# Creating it downloads the detection/recognition/classifier models when they
# are not cached yet (see the download log in this cell's output).
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Function to preprocess images incase of any present in the pdf
def preprocess_image(image_path):
    """Load an image as grayscale and binarize it with adaptive thresholding.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded image produced by ``cv2.adaptiveThreshold``.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising; fail here with a clear message instead of letting
        # adaptiveThreshold raise an opaque OpenCV assertion error.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Gaussian-weighted adaptive threshold: max value 255, block size 11, constant 2.
    return cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

# Function to extract text from an image using PaddleOCR
def ocr_image(image_path):
    """Run OCR on an image file and return the recognized text.

    The image is preprocessed with ``preprocess_image``, written next to the
    original as ``<name>_processed<ext>``, and that file is passed to the
    shared PaddleOCR engine.

    Args:
        image_path: Path of the image to read.

    Returns:
        str: The recognized text lines joined by single spaces, or an empty
        string when no text is detected.

    Raises:
        OSError: If the preprocessed image cannot be written to disk.
    """
    processed_image = preprocess_image(image_path)

    # Derive the output name from the real extension. A plain
    # str.replace(".png", ...) leaves non-PNG paths unchanged (overwriting the
    # input image) and rewrites every ".png" occurring anywhere in the path.
    stem, extension = os.path.splitext(image_path)
    processed_image_path = f"{stem}_processed{extension or '.png'}"
    if not cv2.imwrite(processed_image_path, processed_image):
        raise OSError(f"Could not write preprocessed image: {processed_image_path}")

    ocr_results = ocr.ocr(processed_image_path, cls=True)
    # The engine may return None / [None] when nothing is detected.
    if not ocr_results or not ocr_results[0]:
        return ""

    # Each entry is indexed as line[1][0] for the recognized text.
    return " ".join(line[1][0] for line in ocr_results[0])

# Main function for file processing
def file_processing(file, output_dir):
    """Extract the text of a PDF, using OCR for pages that have no text layer.

    Args:
        file: Path to the PDF file.
        output_dir: Directory where rendered page images are written for OCR
            (created if it does not exist).

    Returns:
        str: The text of every page in document order, each page followed by
        a newline.
    """
    os.makedirs(output_dir, exist_ok=True)

    # load() returns one Document per PDF page, so page_docs[idx] lines up with
    # the PyMuPDF page index used below. load_and_split() runs a text splitter
    # over the pages, which splits long pages and drops pages with no text,
    # shifting the indices so that scanned pages are never detected correctly.
    page_docs = PyPDFLoader(file).load()

    page_texts = []
    # The context manager closes the PyMuPDF document even if OCR fails midway.
    with fitz.open(file) as doc:
        for idx, page in enumerate(doc):
            page_content = page_docs[idx].page_content if idx < len(page_docs) else ""

            if not page_content.strip():
                # No extractable text: render the page to an image and OCR it.
                output_image_path = os.path.join(output_dir, f"page_{idx + 1}.png")
                page.get_pixmap().save(output_image_path)
                page_content = ocr_image(output_image_path)

            page_texts.append(page_content)

    return "".join(page_text + "\n" for page_text in page_texts)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:03<00:00, 1114.43it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:20<00:00, 498.38it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:06<00:00, 320.47it/s]

[2025/01/08 06:14:09] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




**Cheat Sheet Generation**

In [None]:
def generate_cheat_sheet(text, max_new_tokens = 1024):
  """Generate a Markdown cheat sheet for the given text with the loaded LLM.

  Args:
  - text: str, the source text (e.g. the parsed PDF content)
  - max_new_tokens: int, upper bound on the number of generated tokens

  Returns:
  - str, the decoded model reply (only the newly generated tokens)
  """
  messages = get_messages(text)
  inputs = tokenizer.apply_chat_template(
    messages, # system and user prompts
    add_generation_prompt=True,  # end the prompt with the assistant header so the model starts its reply directly
    return_tensors="pt",  # Return as PyTorch tensors
    truncation=True,  # Truncate if the tokenized input exceeds the tokenizer's maximum length
    padding=True
  ).to("cuda")

  outputs = model.generate(inputs, max_new_tokens = max_new_tokens, use_cache = True)

  # generate() returns prompt + completion; keep only the completion. Because
  # the assistant header is part of the prompt, no hand-tuned offset for
  # model-generated header tokens is needed.
  prompt_length = len(inputs[0])
  response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)
  return response

def get_messages(text):
  """Build the chat messages that ask the model to write a cheat sheet.

  Args:
  - text: str, the source text to be summarized

  Returns:
  - List of two chat messages: the system instructions, then the user request
  """
  cheat_sheet_instructions = """
  Create a comprehensive cheat sheet of about 800 words from the following technical text.
  Focus on:
  - Fundamental concepts
  - Key mathematical concepts if any otherwise don't mention it
  - Code snippets if any
  - Critical definitions
  - Important formulas and equations
  - Algorithms and their key steps
  - Fundamental principles

  Formatting Guidelines:
  - Use clear, concise sections
  - Include mathematical notation
  - Provide brief explanations
  - Highlight practical applications
  - Generate in Markdown
  """

  summarize_request = f"""
  Please summarize the following text \n {text}
  """

  # System message first, then the user message carrying the text itself.
  return [
    dict(role="system", content=cheat_sheet_instructions),
    dict(role="user", content=summarize_request),
  ]

**MCQ Generation**

In [None]:
!pip install keybert # installing keybert library for keyword extraction from cheat sheet

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


In [None]:
# importing libraries for sent tokenize and keyword extraction
from keybert import KeyBERT
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Helpers for sentence tokenization, passage building, and keyword extraction

_KEYBERT_MODEL = None


def _get_keybert_model():
  """Return the shared KeyBERT instance, creating it on first use."""
  global _KEYBERT_MODEL
  if _KEYBERT_MODEL is None:
    _KEYBERT_MODEL = KeyBERT()
  return _KEYBERT_MODEL


def extract_keywords_keybert(text, top_n = 20):
  """
  Extracts keywords from a text using KeyBERT

  The KeyBERT model is created once and reused: this function is called once
  per passage, and building a new model on every call repeats the model
  loading for no benefit.

  Args:
  - text: str, the input text
  - top_n: int, the number of keywords to extract

  Returns:
  - List of keywords (unigrams and bigrams), empty for blank input
  """
  if not text or not text.strip():
    return []

  keywords = _get_keybert_model().extract_keywords(
    text, keyphrase_ngram_range = (1, 2), stop_words = 'english', top_n = top_n
  )
  # extract_keywords returns (keyword, score) pairs; keep only the keyword.
  return [keyword[0] for keyword in keywords]


def get_passages(text, num_passages = 10):
  """
  Splits a text into passages made of whole sentences

  Sentences are accumulated into the current passage while its word count
  stays below total_words // num_passages; the sentence that would reach the
  budget starts a new passage.

  Args:
  - text: str, the input text
  - num_passages: int, divisor used to derive the per-passage word budget
    (values below 1 are treated as 1)

  Returns:
  - List of non-empty passages, in reading order
  """
  # split text into sentences
  sentences = sent_tokenize(text)

  total_words = sum(len(sentence.split()) for sentence in sentences)
  word_budget = total_words // max(1, num_passages)

  # combine sentences into passages
  passages = []
  current_sentences = []
  current_words = 0
  for sentence in sentences:
    sentence_words = len(sentence.split())
    if current_words + sentence_words < word_budget:
      current_sentences.append(sentence)
      current_words += sentence_words
      continue

    # Budget reached: close the current passage. Skip it when it is still
    # empty (first sentence already over budget) so that no empty passage is
    # handed to keyword extraction and question generation.
    if current_sentences:
      passages.append(" ".join(current_sentences).strip())
    current_sentences = [sentence]
    current_words = sentence_words

  if current_sentences:
    passages.append(" ".join(current_sentences).strip())

  return passages


def get_cleaned_text(text):
  """
  Normalizes a text to ASCII with single spaces

  Args:
  - text: str, the input text

  Returns:
  - str, the text with every run of non-ASCII characters replaced by a space
    and every run of whitespace collapsed to one space
  """
  non_ascii_run = re.compile(r'[^\x00-\x7F]+')
  whitespace_run = re.compile(r'\s+')
  ascii_only = non_ascii_run.sub(' ', text)
  return whitespace_run.sub(' ', ascii_only)

In [None]:
# Questions generation
def get_message2(keywords, passage):
  """Build the chat messages that ask the model for one multiple choice question.

  The reply is later parsed as a dictionary, so the format description and
  the example in the system prompt are themselves valid dictionaries
  (double-quoted, comma-separated). The previous example used single quotes
  around text containing apostrophes and missed a comma, which showed the
  model an unparseable format.

  Args:
  - keywords: list of keywords extracted from the passage
  - passage: str, the passage that gives context for the question

  Returns:
  - List of two chat messages: the system instructions, then the user request
  """
  system_prompt = """
    You are a helpful assistant. You have deep knowledge of computer science topics and are able to generate multiple choice questions based on a list of keywords and a given passage.
    Use your own domain knowledge of the subject on the list of keywords to generate only one single multiple choice question. The question should:
    - Be informative, testing foundational and deep knowledge
    - Include clear and concise questions with four options (a, b, c, d).
    - Ensure the questions are challenging and diverse in their difficulty.
  """

  system_prompt += """
      Additionally, the question generated should adhere to the following format which is a python dictionary.
      Use double quotes around every key and value so that apostrophes inside the text do not break the dictionary.
      {"ques": "Question generated", "a": "option a", "b": "option b", "c": "option c", "d": "option d", "ans": "letter of the correct option"}
      Apart from this you need not write anything before or after it.
  """
  system_prompt += """
    For example, you may refer to the following:
    {"ques": "Why does Dijkstra's algorithm fail to work correctly with negative edge weights?",
     "a": "Dijkstra's algorithm is designed to find the shortest path in unweighted graphs only.",
     "b": "Negative edge weights can cause the algorithm to update distances incorrectly, leading to incorrect shortest paths.",
     "c": "The algorithm's priority queue cannot handle negative values.",
     "d": "Negative edge weights result in an infinite loop due to cyclic paths.",
     "ans": "b"}
  """

  user_prompt = f"""
  Please generate a multiple choice question (MCQ) with the help of the keywords {keywords} and you may refer to the following passage for context \n {passage}

  """
  messages = [
      {"role":"system", "content":system_prompt},
      {"role":"user", "content":user_prompt}
  ]
  return messages


def generate_mcq(keywords, passage, max_new_tokens = 1024):
  """Generate one multiple choice question for a passage with the loaded LLM.

  Args:
  - keywords: list of keywords extracted from the passage
  - passage: str, the passage that gives context for the question
  - max_new_tokens: int, upper bound on the number of generated tokens

  Returns:
  - str, the decoded model reply (only the newly generated tokens)
  """
  messages = get_message2(keywords, passage)
  inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # end the prompt with the assistant header so the model starts its reply directly
    return_tensors = "pt",
  ).to("cuda")
  outputs = model.generate(inputs, max_new_tokens = max_new_tokens, use_cache = True)

  # generate() returns prompt + completion; keep only the completion. Because
  # the assistant header is part of the prompt, no hand-tuned offset for
  # model-generated header tokens is needed.
  prompt_length = len(inputs[0])
  response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)
  return response

def get_all_ques(input_text):
  """Generate one raw MCQ reply per passage of the input text.

  Args:
  - input_text: str, the text to build questions from

  Returns:
  - List of model replies (str), one per passage, in passage order
  """
  # For every passage: extract its keywords, then ask the model for a question.
  return [
    generate_mcq(extract_keywords_keybert(chunk), chunk)
    for chunk in get_passages(input_text)
  ]

def Eval(text):
  """Extract and parse the dictionary contained in a model reply.

  The span from the first "{" to the last "}" is parsed, so text before or
  after the dictionary is ignored.

  SECURITY: the reply is produced from user-supplied PDF content, so it is
  untrusted. It is parsed with json.loads / ast.literal_eval, which only
  accept literals, instead of eval(), which would execute arbitrary
  expressions embedded in the reply.

  Args:
  - text: str, the raw model reply

  Returns:
  - The parsed object (a dict for well-formed replies)

  Raises:
  - ValueError: if the reply contains no "{...}" span or the span is not a literal
  - SyntaxError: if the span is not syntactically valid
  """
  import ast
  import json

  start = text.find("{")
  end = text.rfind("}")
  if start == -1 or end < start:
    raise ValueError("No dictionary found in the model response")

  candidate = text[start:end + 1]
  try:
    # Double-quoted replies are valid JSON (also covers true/false/null).
    return json.loads(candidate)
  except ValueError:
    # Fall back to Python literal syntax for single-quoted dictionaries.
    return ast.literal_eval(candidate)


def get_list_of_ques(questions):
  """Parse every raw model reply into its question dictionary.

  Args:
  - questions: list of raw model replies (str)

  Returns:
  - List with the result of Eval() for each reply, in the same order
  """
  return [Eval(raw_reply) for raw_reply in questions]


**Creating Flask App and Using ngrok to connect to frontend**

In [None]:
# importing the necessary libraries
from flask import Flask, request, jsonify
from pyngrok import ngrok
import requests

In [None]:
# Read the ngrok auth token from Colab's user data (key 'ngrok_authToken')
# so that the token itself is not hard-coded in the notebook.

from google.colab import userdata
auth_token = userdata.get('ngrok_authToken')

In [None]:
# Hand the token to pyngrok before a tunnel is opened with ngrok.connect().
ngrok.set_auth_token(auth_token)

In [None]:
from flask_cors import CORS

# Flask application object; the route below is registered on it.
app = Flask(__name__)

# Enable CORS on the app so the frontend can call it from another origin.
# NOTE(review): no options are passed, so flask-cors defaults apply (presumably all routes and origins) — confirm.
CORS(app)
# Ensure the static directory exists
os.makedirs('static', exist_ok=True)



@app.route('/', methods=['POST']) # post request
def process_file():
  """Download a PDF from the posted URL and return its cheat sheet and MCQs.

  Expects a JSON body of the form {"fileurl": "<url of the PDF>"}.

  Returns:
  - 200 with {"cheat_sheet": str, "questions": list of question dictionaries}
  - 400 with {"error": ...} when the body is not JSON or has no "fileurl"
  - 500 with {"error": ...} when downloading, parsing or generation fails
  """
  try:
      # silent=True returns None instead of raising on a missing or invalid
      # JSON body, so a malformed request is reported as 400 rather than 500.
      data = request.get_json(silent=True)
      file_url = data.get("fileurl") if isinstance(data, dict) else None
      if not file_url:
          return jsonify({"error": "No file URL provided"}), 400

      # Download the file
      # NOTE(review): the URL is client-supplied and fetched as-is; consider
      # restricting the allowed schemes/hosts before exposing this publicly.
      response = requests.get(file_url, timeout=60)  # never hang forever on a stalled server
      response.raise_for_status()  # do not save an HTTP error page as the PDF
      file_name = "downloaded_file.pdf"
      with open(file_name, "wb") as file:
          file.write(response.content)

      # Process the file
      output_dir = r"/content/sample_data/OutputPDF"
      pdf_text = file_processing(file_name,output_dir) # pdf parsed-text

      # Generate cheat sheet
      cheat_sheet = generate_cheat_sheet(pdf_text)
      questions = get_all_ques(cheat_sheet) # get all questions
      ques = get_list_of_ques(questions) # get list of dictionaries
      return jsonify({"cheat_sheet": cheat_sheet, "questions": ques}),200 # return json object to frontend
  except Exception as e:
      # Any failure in the pipeline is reported to the frontend as a 500.
      return jsonify({"error": str(e)}), 500




# Entry point: publish the local Flask port through ngrok, then start the server.
if __name__ == '__main__':

  # Open an ngrok tunnel to local port 5000 and print it (the public URL appears in the cell output)
  public_url = ngrok.connect(5000)
  print(f"Public URL: {public_url}")

  # Run the Flask app on the same port the tunnel forwards to; this call
  # keeps the cell running until the server is stopped.
  app.run(port=5000)


Public URL: NgrokTunnel: "https://7dda-34-91-68-172.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
