# Libraries

In [None]:
!pip install PyPDF2                 # to manipulate PDF documents by splitting, merging, cropping
!pip install pycryptodome           # for encryption, decryption, hashing
!pip install PyMuPDF                # PDF viewer -  extract text, and manipulate PDF content
!pip install pdfminer.six           # for extracting text, images, and metadata from PDF files
!pip install pdf2image              # converts PDF files into a sequence of images
!apt-get install -y tesseract-ocr
!pip install pytesseract            # Python wrapper for Google's Tesseract-OCR Engine
!pip install gTTS                   # convert text into speech using Google's voice synthesis
!pip install poppler-utils          # includes tools for converting PDFs to different formats (PDF to PNG)
!pip install transformers           # simplifying the implementation of NLP tasks by offering pre-trained models and tools
!pip install transformers sentencepiece sacremoses

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pycryptodome
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.20.0
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# creation, reading, and extraction of ZIP archive
from zipfile import ZipFile

# for extracting text, images, and metadata from PDF files
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

from io import StringIO

# module for encoding and decoding binary data using base64 encoding.
import base64

#------- OCR ------------
import pdf2image
import pytesseract
import fitz   # module in PyMuPDF
import os     # to interact with the os, including functions for file manipulation and directory operations
import glob   # for pattern matching files and directories

from pdf2image import convert_from_path
from pytesseract import Output, TesseractError
# from tkinter import Tk, Frame, Button, filedialog
from PyPDF2 import PdfReader
from PIL import Image         # Pillow library, working with images, including opening, manipulating, and saving images in various formats
from google.colab import files
from gtts import gTTS
from IPython.display import Audio
from transformers import pipeline
from transformers import *

import tensorflow as tf
from tensorflow.keras import datasets,layers,models
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import cv2
from keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16

from sklearn.model_selection import train_test_split

import random

from google.colab import drive
drive.mount('/content/drive')

# Human-readable labels for the document classifier. classify_document_images
# indexes this list with the argmax of the model's prediction, so the order
# presumably has to match the label order used when the model was trained
# — TODO confirm against the training notebook.
CLASSES_LIST=["aadhar","passport","driver license","pan","voter"]
# Side length (in pixels) that images are resized to before classification.
IMG_SIZE=224

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Mounted at /content/drive


# OCR engine - Tesseract for png file

In [None]:
def png_to_text(folder_path="/content/output_images"):
    """Run Tesseract OCR on every PNG file in a folder.

    Args:
        folder_path: Directory scanned for ``*.png`` files. Defaults to the
            folder this function has always read from.

    Returns:
        dict mapping each PNG path to the text Tesseract extracted from it.
        Entries are inserted in natural file-name order (``page_2`` before
        ``page_10``), so joining the values keeps the pages in reading order.
        An empty or missing folder yields an empty dict.
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(path):
        # re.split with a capturing group alternates text / digit runs, so the
        # resulting lists always compare str-to-str and int-to-int.
        name = os.path.basename(path)
        return [int(part) if part.isdigit() else part.lower()
                for part in re.split(r"(\d+)", name)]

    # glob returns matches in arbitrary order; sort so multi-page documents
    # are processed page 1, 2, 3, ... rather than in directory order.
    png_files = sorted(glob.glob(f"{folder_path}/*.png"), key=natural_key)

    extracted_text = {}
    for file_name in png_files:
        # Context manager releases the file handle as soon as OCR is done.
        with Image.open(file_name) as img:
            extracted_text[file_name] = pytesseract.image_to_string(img)

    return extracted_text

# Convert PDF to IMAGE


In [None]:
def convert_pdf_to_images(pdf_path, output_folder, menu_option):
    """Render each page of a PDF to ``page_<n>.png`` and optionally OCR the result.

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the PNG files (created if missing).
        menu_option: Menu choice of the caller; ``3`` (document classification)
            skips text extraction.

    Returns:
        None when ``menu_option == 3``; otherwise the value returned by
        ``png_to_text()``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Drop page images left behind by an earlier (possibly longer) PDF so they
    # are not OCR'd or classified as if they belonged to this document.
    for stale_page in glob.glob(os.path.join(output_folder, "page_*.png")):
        os.remove(stale_page)

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            # alpha=False keeps the pixmap at 3 channels, matching the "RGB"
            # mode passed to Image.frombytes below.
            image_matrix = page.get_pixmap(alpha=False)
            img = Image.frombytes(
                "RGB", (image_matrix.width, image_matrix.height), image_matrix.samples
            )
            image_path = f"{output_folder}/page_{page_number + 1}.png"
            img.save(image_path, "PNG")
    finally:
        # Close the document even if rendering a page fails.
        pdf_document.close()

    print(f"PDF pages converted to images in the folder: {output_folder}")

    # if classification of document ; do not need text extraction
    if menu_option == 3:
        return None

    # NOTE(review): png_to_text() is called without arguments, so it scans its
    # own default folder rather than `output_folder`; the two only coincide
    # when `output_folder` resolves to that directory.
    text = png_to_text()
    return text

In [None]:
def convert_image_to_text(image_path, output_folder, menu_option):
    """Save an uploaded image as PNG in ``output_folder`` and optionally OCR it.

    Args:
        image_path: Path of the source image (any format Pillow can open).
        output_folder: Directory that receives the PNG copy (created if missing).
        menu_option: Menu choice of the caller; ``3`` (document classification)
            skips text extraction.

    Returns:
        None when ``menu_option == 3``; otherwise the value returned by
        ``png_to_text()``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Build "<output_folder>/<original name without extension>.png".
    image_filename = os.path.splitext(os.path.basename(image_path))[0]
    output_image_path = os.path.join(output_folder, f"{image_filename}.png")

    # The context manager releases the source file handle once the copy is written.
    with Image.open(image_path) as source_image:
        # PNG cannot store these colour modes (CMYK is common in JPEGs), so
        # convert them to RGB instead of letting save() raise.
        if source_image.mode in ("CMYK", "YCbCr", "LAB", "HSV"):
            png_ready = source_image.convert("RGB")
        else:
            png_ready = source_image
        png_ready.save(output_image_path, "PNG")

    print(f"Image converted and saved as: {output_image_path}")

    if menu_option == 3:
        return None  # No need for text extraction if classification of document

    # NOTE(review): png_to_text() is called without arguments, so it scans its
    # own default folder rather than `output_folder`.
    return png_to_text()


# Uploading files

In [None]:
def upload_pdf_and_convert(menu_option):
    """Prompt for a file upload in Colab and convert it to PNG page images.

    A PDF is rendered page by page; a PNG/JPEG image is copied as PNG. If
    both kinds are uploaded at once, the PDF takes precedence. When several
    files of the same kind are uploaded, the last one wins.

    Args:
        menu_option: Menu choice of the caller, forwarded to the converters
            (``3`` skips text extraction).

    Returns:
        Whatever the converter returns (extracted text, or None for
        ``menu_option == 3``). None is also returned, after printing a hint,
        when no supported file was uploaded.
    """
    uploaded = files.upload()
    pdf_path = None
    image_path = None

    for name, data in uploaded.items():
        # Compare extensions case-insensitively so "SCAN.PDF" is accepted
        # the same way "photo.JPG" already is.
        lowered = name.lower()
        is_pdf = lowered.endswith('.pdf')
        is_image = lowered.endswith(('.png', '.jpg', '.jpeg'))
        if not (is_pdf or is_image):
            continue

        with open(name, 'wb') as f:
            f.write(data)
        print(f"Uploaded {name}")

        if is_pdf:
            pdf_path = name
        else:
            image_path = name

    output_folder = 'output_images'
    if pdf_path:
        return convert_pdf_to_images(pdf_path, output_folder, menu_option)
    if image_path:
        return convert_image_to_text(image_path, output_folder, menu_option)

    print("\n-> Please upload a valid file.")
    return None


# Text to speech

In [None]:
def text_to_audio():
    """Upload a document, OCR it, and convert the text to an MP3 with gTTS.

    Returns:
        An ``IPython.display.Audio`` object for ``output.mp3`` (autoplay on),
        or None when no file was uploaded or no text could be extracted.
    """
    print("Please upload a PDF file.")
    menu_option = 1
    extracted_text = upload_pdf_and_convert(menu_option)

    # upload_pdf_and_convert returns None when no supported file was uploaded.
    if not extracted_text:
        print("\n-> No text could be extracted; nothing to read aloud.")
        return None

    # Clean the extracted text by removing newline characters.
    cleaned_text = [text.replace('\n', ' ') for text in extracted_text.values()]
    print("Extracted text : ", cleaned_text)

    # Combine the cleaned text into a single string.
    combined_text = ' '.join(cleaned_text).strip()

    # gTTS refuses empty input, so stop early when OCR found only whitespace.
    if not combined_text:
        print("\n-> No text could be extracted; nothing to read aloud.")
        return None

    # Using gTTS library to convert the combined text to speech.
    tts = gTTS(text=combined_text, lang='en')

    # Save the generated speech as an MP3 file.
    tts.save('output.mp3')

    print("\n\n Audio has been created!\n\n")

    # Return the created audio file and autoplay it.
    return Audio('output.mp3', autoplay=True)

# Read Aloud Function


In [None]:
def click_read_aloud():
    """Menu handler for "Read Aloud": build the audio and show the player.

    The Audio object returned by ``text_to_audio()`` is only rendered by the
    notebook when it is the last expression of a cell, so it is displayed
    explicitly here; otherwise the player would never appear.
    """
    from IPython.display import display  # local: only this handler needs it

    print("\nRead Aloud option chosen\n")
    audio = text_to_audio()
    if audio is not None:
        display(audio)


# Document Summary Function

In [None]:
def summarize_text(text):
    """Summarize a piece of text with the default Hugging Face summarization pipeline.

    Args:
        text: Text to summarize.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace
        only (e.g. a blank scanned page).
    """
    # Nothing to summarize: skip loading the model.
    if not text or not text.strip():
        return ""

    # Build the pipeline once and reuse it; loading the model for every page
    # of a document is by far the slowest part of this function.
    summarizer = getattr(summarize_text, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization")
        summarize_text._summarizer = summarizer

    # truncation=True clips inputs longer than the model's maximum length
    # instead of failing on long pages.
    summary = summarizer(
        text, max_length=150, min_length=30, do_sample=False, truncation=True
    )
    return summary[0]['summary_text']

def summarize_extracted_text(extracted_text):
    """Summarize every entry of an OCR result.

    Args:
        extracted_text: dict mapping a file name to its extracted text.

    Returns:
        A new dict with the same keys, each mapped to the result of
        ``summarize_text`` for that entry's text.
    """
    return {
        source_file: summarize_text(page_text)
        for source_file, page_text in extracted_text.items()
    }

In [None]:
def click_document_summary():
    """Menu handler for "Document Summary": upload, OCR, summarize, and print.

    Prints the extracted text of every page image followed by one summary
    per page image. Returns None.
    """
    print("\nDocument Summary option chosen\n")
    print("-> Please upload a PDF file.")
    menu_option = 2
    extracted_text = upload_pdf_and_convert(menu_option)

    # upload_pdf_and_convert returns None when no supported file was uploaded.
    if not extracted_text:
        print("\n-> No text could be extracted; nothing to summarize.")
        return

    summarized_text = summarize_extracted_text(extracted_text)

    # Iterate over the dict *values* (the page text); iterating the dict
    # itself yields the file names. Newlines become spaces so that words on
    # adjacent lines are not glued together.
    cleaned_text = [text.replace('\n', ' ') for text in extracted_text.values()]
    print("\nExtracted text : ", cleaned_text)

    for file_name, summary in summarized_text.items():
        print(f"Summary of {file_name}:\n{summary}\n{'-'*50}")

# Document Classification Function

In [None]:
def classify_document_images(image_folder, model):
    """Classify every file in a folder and print the predicted document type.

    Args:
        image_folder: Directory whose regular files are classified;
            sub-directories are skipped.
        model: Object with a ``predict`` method; the argmax of its output is
            used as an index into ``CLASSES_LIST``.

    Returns:
        None. One line is printed per file.
    """
    for item in os.listdir(image_folder):
        candidate_path = os.path.join(image_folder, item)

        # Only regular files are classified.
        if not os.path.isfile(candidate_path):
            continue

        # Resize to the configured input size, add a batch axis, and
        # rescale pixel values by 1/255.
        loaded = image.load_img(candidate_path, target_size=(IMG_SIZE, IMG_SIZE))
        batch = np.expand_dims(image.img_to_array(loaded), axis=0)
        batch /= 255.0

        # The highest-scoring output index selects the label.
        class_index = np.argmax(model.predict(batch))
        predicted_class_name = CLASSES_LIST[class_index]

        print(f"Item: {item}, Predicted Class: {predicted_class_name}")

In [None]:
def click_document_classification():
    """Menu handler for "Document Classification".

    Uploads a document (menu option 3, so no OCR is run), loads the model
    from ``finalmodel.h5``, and prints a predicted class for every file in
    ``/content/output_images``.
    """
    print("\nDocument Classification option chosen")

    # Option 3 tells the converters to skip text extraction.
    upload_pdf_and_convert(3)

    # The model is loaded after the upload, then applied to the page images.
    classify_document_images('/content/output_images', load_model('finalmodel.h5'))

# Grammar Check

In [None]:
#from language_tool_python import LanguageTool
#from happytransformer import HappyTextToText, TTSettings
from transformers import pipeline

def click_check_grammar():
    """Menu handler for "Grammar Check": upload, OCR, correct, and print.

    Prints the grammar-corrected version of the uploaded document's text.
    Returns None.
    """
    print("\nGrammar Check option chosen\n")
    print("-> Please upload a PDF file.")
    menu_option = 2
    extracted_text = upload_pdf_and_convert(menu_option)

    # upload_pdf_and_convert returns None when no supported file was uploaded.
    if not extracted_text:
        print("\n-> No text could be extracted; nothing to check.")
        return

    # Keep a line break between pages so their text stays separated.
    text_full = "\n".join(extracted_text.values())
    if not text_full.strip():
        print("\n-> No text could be extracted; nothing to check.")
        return

    corrected_text = grammar_checker(text_full)

    # grammar_checker returns the raw pipeline output; for a text2text
    # pipeline that is a list of {'generated_text': ...} dicts. Unwrap it so
    # the user sees plain text instead of a Python repr.
    if isinstance(corrected_text, list):
        corrected_text = " ".join(
            item.get("generated_text", "") if isinstance(item, dict) else str(item)
            for item in corrected_text
        )

    print("\nCorrected text : ", corrected_text)

def grammar_checker(text):
    """Run ``text`` through the grammar-synthesis text2text pipeline.

    Args:
        text: Text to correct.

    Returns:
        The raw output of the Hugging Face pipeline call, unmodified.
    """
    # A new pipeline is created for every call and applied straight away.
    return pipeline(
        'text2text-generation',
        'pszemraj/flan-t5-large-grammar-synthesis',
    )(text)

# Question answering (Q&A)

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.1-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.63-py3-none-any.whl (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.8/122.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.0->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting packaging<24.0,>=23.2 (from langcha

In [None]:
!pip install --upgrade langchain
!pip install faiss-cpu



In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.2-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code API keys in a notebook — they end up in version
# control and in shared copies. The key that was previously committed here
# must be treated as leaked and revoked in the OpenAI dashboard.
# Use the key from the environment when present; otherwise prompt for it
# without echoing it to the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
# from typing_extensions import Concatenate
# # read text from pdf
# menu_option = 5
# extracted_text = upload_pdf_and_convert(menu_option)
# raw_text=""
# for content in extracted_text.values():
#     raw_text += content if content else ""

Saving Application-3.pdf to Application-3 (1).pdf
Uploaded Application-3 (1).pdf
PDF pages converted to images in the folder: output_images


In [None]:
# Split the text with CharacterTextSplitter so that each chunk stays within the token limit.
# text_splitter = CharacterTextSplitter(
#     separator = "\n",
#     chunk_size = 800,
#     chunk_overlap  = 200,
#     length_function = len,
# )
# texts = text_splitter.split_text(raw_text)

In [None]:
# embeddings = OpenAIEmbeddings()
# document_search = FAISS.from_texts(texts, embeddings)
# document_search
# from langchain.chains.question_answering import load_qa_chain
# from langchain.llms import OpenAI
# chain = load_qa_chain(OpenAI(), chain_type="stuff")
# query = input("enter your question, else type exit") #"explain the objective of the project"
# # while(True):
# #   query = input("enter your question, else type exit")
# #   if(query=="exit"):
# #     print("Thanks for exiting!")
# #     break
#   #answer = ask_question(text_full, question)
# docs = document_search.similarity_search(query)
# chain.run(input_documents=docs, question=query)

enter your question, else type exitexplain the objective of the project


" The main objective of the project is to expand the capabilities of the Document Assistant and make it into a more versatile and robust tool. This will be achieved by refining existing features and introducing new functionalities, such as interactive query answering, grammar check, paraphrasing, and document template matching algorithm. The focus will be on using advanced technologies, like natural language processing and pattern recognition, to improve the tool's performance in document processing and management. "

In [None]:
def qna():
    """Interactive question answering over an uploaded document.

    Uploads a document, OCRs it, splits the text into overlapping chunks,
    indexes the chunks in a FAISS vector store using OpenAI embeddings, and
    then answers questions typed by the user until they enter ``exit``.
    Returns None.
    """
    from langchain.chains.question_answering import load_qa_chain
    from langchain.llms import OpenAI

    # Read text from the uploaded document.
    menu_option = 5
    extracted_text = upload_pdf_and_convert(menu_option)

    # upload_pdf_and_convert returns None when no supported file was uploaded.
    if not extracted_text:
        print("\n-> No text could be extracted; cannot answer questions.")
        return

    raw_text = "".join(content for content in extracted_text.values() if content)

    # Split the text into chunks so that each one stays within the token limit;
    # the overlap keeps sentences that straddle a boundary searchable.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)

    # Without chunks there is nothing to index or search.
    if not texts:
        print("\n-> No text could be extracted; cannot answer questions.")
        return

    embeddings = OpenAIEmbeddings()
    document_search = FAISS.from_texts(texts, embeddings)
    chain = load_qa_chain(OpenAI(), chain_type="stuff")

    while True:
        query = input("Enter your question, or type 'exit' to quit: ").strip()
        if not query:
            # Skip blank input instead of sending an empty question to the API.
            continue
        if query.lower() == "exit":
            print("Thanks for exiting!")
            break
        docs = document_search.similarity_search(query)
        result = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
        # With return_only_outputs=True the dict holds just the chain's answer.
        output_value = next(iter(result.values()))
        print("Answer:", output_value)


# Paraphrasing


The model is BART (Bidirectional and Auto-Regressive Transformers), which has been shown to perform well on tasks like document summarization and paraphrasing.

We're using the BART model (facebook/bart-large-cnn) which is well-suited for handling longer text inputs.
We load the BART tokenizer and model.
The input text is tokenized with "paraphrase: " prepended to it to indicate to the model that it should paraphrase the input.
We generate paraphrased text using the BART model with generate() function, specifying num_beams for beam search and max_length for controlling the length of the output.
Finally, we decode the generated paraphrased text to get the final output.

In [None]:
# from transformers import BartForConditionalGeneration, BartTokenizer

# def paraphrase_text(input_text):
#     # Load pre-trained BART model and tokenizer
#     model_name = "facebook/bart-large-cnn"
#     tokenizer = BartTokenizer.from_pretrained(model_name)
#     model = BartForConditionalGeneration.from_pretrained(model_name)

#     # Tokenize the input text
#     input_tokenized = tokenizer.encode("paraphrase: " + input_text, return_tensors="pt", max_length=1024, truncation=True)

#     # Generate paraphrased text
#     paraphrased_ids = model.generate(input_tokenized, num_beams=4, max_length=150, early_stopping=True)

#     # Decode the generated paraphrased text
#     paraphrased_text = tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)

#     return paraphrased_text

# # Example usage
# input_text = """
#     The Document Assistant is envisioned as an all-encompassing tool designed to streamline and enhance the interaction with digital documents. In the age where data is king, our tool aims to simplify complex document processing tasks, making them more efficient and user-friendly
# """
# paraphrased_text = paraphrase_text(input_text)
# print("Original text:", input_text)
# print("Paraphrased text:", paraphrased_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Original text: 
    The Document Assistant is envisioned as an all-encompassing tool designed to streamline and enhance the interaction with digital documents. In the age where data is king, our tool aims to simplify complex document processing tasks, making them more efficient and user-friendly

Paraphrased text: The Document Assistant aims to streamline and enhance the interaction with digital documents. In the age where data is king, our tool aims to simplify complex document processing tasks, making them more efficient and user-friendly. The Document Assistant is envisioned as an all-encompassing tool.


In [None]:
# menu_option = 5
# extracted_text = upload_pdf_and_convert(menu_option)
# from typing_extensions import Concatenate
# raw_text=""
# for text in extracted_text.values():
#   raw_text += text

# paraphrased_text = paraphrase_text(raw_text)
# print("Original text:")
# print(raw_text)
# print("\nParaphrased text:")
# print(paraphrased_text)
# lines = paraphrased_text.split('.')

# # Print each line separately
# for line in lines:
#   print(line)

Saving Major project proposal- Group4 .pdf to Major project proposal- Group4  (1).pdf
Uploaded Major project proposal- Group4  (1).pdf
PDF pages converted to images in the folder: output_images


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/tokenizer.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_l

Original text:
matching to detect document fraud. These intatives will integrate advancements in natural
language processing and pattern recognition to elavate the Document Assistant's functionality.

Group-4 Ms. Ritka
Kumari

B.Tech (CSE-Al) Assistant Professor
IGDTUW 7th semester ‘Al & DS Department

Signature Signature
Document Assistant: A Comprehensive Solution for Document
Processing and Management

 

Introduction
‘The Document Assistants envisioned as an all-encompassing tool designed to streamline and
‘enhance the interaction with digital documents. In the age where data is king, aur tool aims to
‘simplify complex document processing tasks, making them more efficient and user-friendly,

Background
‘Our project intially introduced a suite of document handling features including text-to-speech,
‘summarization, and basic document classification. Moving forward, we pian to expand on these
{unctionalities, enhancing them with sophisticated machine learning techniques to meet more


If you need to maintain the length of the input document while paraphrasing, you can use a method called "chunking". This involves splitting the document into smaller chunks, paraphrasing each chunk individually, and then concatenating the paraphrased chunks to form the final paraphrased document.


In [None]:
def paraphrase_text_final():
    """Upload a document, paraphrase its OCR text chunk by chunk, and print it.

    The text is split into chunks of at most 1024 characters, each chunk is
    passed through ``facebook/bart-large-cnn`` with a ``"paraphrase: "``
    prefix, and the generated pieces are joined and printed one sentence
    per line. Returns None.

    NOTE(review): ``facebook/bart-large-cnn`` is published as a summarization
    checkpoint and each chunk's output is capped at 150 tokens, so the result
    is likely closer to a summary than a length-preserving paraphrase —
    confirm this is the intended model.
    """
    from transformers import BartForConditionalGeneration, BartTokenizer

    def chunk_text(text, max_chunk_len=1024):
        """Split the text into chunks of maximum length max_chunk_len."""
        chunks = []
        current_chunk = ""
        for word in text.split():
            if len(current_chunk) + len(word) + 1 <= max_chunk_len:
                current_chunk += word + " "
            else:
                # Only flush a non-empty chunk: a first word longer than the
                # limit must not produce an empty chunk for the model.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = word + " "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def paraphrase_text_f(input_text):
        """Paraphrase ``input_text`` chunk by chunk and return the joined result."""
        # Load pre-trained BART model and tokenizer
        model_name = "facebook/bart-large-cnn"
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

        paraphrased_chunks = []
        for chunk in chunk_text(input_text):
            input_tokenized = tokenizer.encode(
                "paraphrase: " + chunk,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
            )
            paraphrased_ids = model.generate(
                input_tokenized, num_beams=4, max_length=150, early_stopping=True
            )
            paraphrased_chunks.append(
                tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
            )

        # Concatenate paraphrased chunks
        return " ".join(paraphrased_chunks)

    menu_option = 5
    extracted_text = upload_pdf_and_convert(menu_option)

    # upload_pdf_and_convert returns None when no supported file was uploaded.
    if not extracted_text:
        print("\n-> No text could be extracted; nothing to paraphrase.")
        return

    raw_text = "".join(extracted_text.values())
    if not raw_text.strip():
        print("\n-> No text could be extracted; nothing to paraphrase.")
        return

    paraphrased_text = paraphrase_text_f(raw_text)

    # Print each sentence on its own line, skipping the empty fragments that
    # splitting on '.' leaves behind (e.g. after the final full stop).
    for line in paraphrased_text.split('.'):
        line = line.strip()
        if line:
            print(line)
paraphrase_text_final()


Saving new doc.pdf to new doc.pdf
Uploaded new doc.pdf
PDF pages converted to images in the folder: output_images


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/tokenizer.json


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": fals

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/model.safetensors
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

All model checkpoint weights were used when initializing BartForConditionalGeneration.

All the weights of BartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BartForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}



Values in a Python dictionary can be accessed by placing the key within square brackets next to the dictionary
 If the key already exists, the old value will be overwritten
 ‘Attempting to access a value with a key that does not exist will cause a KeyError’


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download the 'punkt' resource for tokenization
nltk.download('punkt')

def calculate_bleu(reference_text, generated_text):
    """Compute a smoothed sentence-level BLEU score.

    Args:
        reference_text: The reference sentence.
        generated_text: The candidate sentence to score against the reference.

    Returns:
        The value returned by NLTK's ``sentence_bleu`` for the word-tokenized
        texts, using ``SmoothingFunction().method1``.
    """
    return sentence_bleu(
        # sentence_bleu takes a list of references; exactly one is supplied.
        [nltk.word_tokenize(reference_text)],
        nltk.word_tokenize(generated_text),
        smoothing_function=SmoothingFunction().method1,
    )




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Example usage: score one candidate sentence against a single reference.
reference = "The quick brown fox jumps over the lazy dog."
generated = "A fast brown fox leaps over a lazy dog."

# calculate_bleu is defined in an earlier cell; the score is printed with
# four decimal places.
bleu_score = calculate_bleu(reference, generated)
print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.1375


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import random

nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# The paraphrase_text function takes the text extracted from the PDF as input, tokenizes it into words, and looks up synonyms for each word in WordNet, a lexical database of semantic relations between words in more than 200 languages. It then builds the paraphrased text by replacing each word with a randomly chosen synonym (words without synonyms are kept unchanged).

In [None]:

# def paraphrase_text(text):
#     tokens = word_tokenize(text)
#     lemmatizer = WordNetLemmatizer()

#     paraphrased_text = []

#     for token in tokens:
#         synonyms = []
#         for syn in wordnet.synsets(token):
#             for lemma in syn.lemmas():
#                 synonyms.append(lemma.name())
#         if synonyms:
#             paraphrased_text.append(random.choice(synonyms))
#         else:
#             paraphrased_text.append(token)

#     return ' '.join(paraphrased_text)

# # Example usage
# #text = " The Document Assistant is envisioned as an all-encompassing tool designed to streamline and enhance the interaction with digital documents. In the age where data is king, our tool aims to simplify complex document processing tasks, making them more efficient and user-friendly"

# menu_option = 5
# extracted_text = upload_pdf_and_convert(menu_option)
# from typing_extensions import Concatenate
# raw_text=""
# for text in extracted_text.values():
#   raw_text += text

# paraphrased_text = paraphrase_text(raw_text)
# print("Original text:")
# print(raw_text)
# print("\nParaphrased text:")
# lines = paraphrased_text.split('.')

# # Print each line separately
# for line in lines:
#     print(line)

# #print(paraphrased_text)


Saving Major project proposal- Group4 .pdf to Major project proposal- Group4  (2).pdf
Uploaded Major project proposal- Group4  (2).pdf
PDF pages converted to images in the folder: output_images
Original text:
Document Assistant: A Comprehensive Solution for Document
Processing and Management

 

Introduction
‘The Document Assistants envisioned as an all-encompassing tool designed to streamline and
‘enhance the interaction with digital documents. In the age where data is king, aur tool aims to
‘simplify complex document processing tasks, making them more efficient and user-friendly,

Background
‘Our project intially introduced a suite of document handling features including text-to-speech,
‘summarization, and basic document classification. Moving forward, we pian to expand on these
{unctionalities, enhancing them with sophisticated machine learning techniques to meet more
‘complex challenges. Previously, we've successfully used Tesseract for extracting text and
Hugging Face transformers

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer


In [None]:
def paraphrase_text(input_text):
    """Paraphrase ``input_text`` with the ``tuner007/pegasus_paraphrase`` checkpoint.

    The tokenizer and model are loaded on every call. The input is truncated
    to, and generation is capped at, ``model.config.max_position_embeddings``:
    asking this checkpoint for 1024 positions indexes past its position table
    and fails with ``IndexError: index out of range in self``.

    Args:
        input_text: Text to paraphrase. Tokens beyond the model limit are
            dropped by truncation.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Load the Pegasus model and tokenizer
    model_name = "tuner007/pegasus_paraphrase"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # The position table bounds both the encoder input and the decoder output.
    max_positions = model.config.max_position_embeddings

    # Tokenize the input text
    inputs = tokenizer([input_text], max_length=max_positions, truncation=True, return_tensors="pt")

    # Generate paraphrased text (**inputs also forwards the attention mask)
    paraphrased_ids = model.generate(**inputs, max_length=max_positions, num_beams=5, early_stopping=True)
    paraphrased_text = tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)

    return paraphrased_text

In [None]:
def paraphrase_extracted_text(extracted_text):
    """Paraphrase every entry of a ``{file_name: text}`` mapping.

    Returns a new dict with the same keys whose values are the results of
    ``paraphrase_text`` applied to each text.
    """
    return {
        file_name: paraphrase_text(text)
        for file_name, text in extracted_text.items()
    }

In [None]:
def click_paraphrase():
    """Menu handler: upload a PDF, then print its text and a paraphrase per file.

    ``upload_pdf_and_convert`` is expected to return a ``{file_name: text}``
    mapping (it is consumed with ``.items()`` downstream). When it returns
    nothing, the handler returns without paraphrasing.
    """
    print("\nParaphrase option chosen")
    menu_option = 5  # Assuming this is the menu option for paraphrasing
    extracted_text = upload_pdf_and_convert(menu_option)

    # Nothing extracted: skip the model load entirely.
    if not extracted_text:
        return

    paraphrased_text = paraphrase_extracted_text(extracted_text)

    # Iterate the values; iterating the dict itself yields file names, not text.
    cleaned_text = [text.replace('\n', '') for text in extracted_text.values()]
    print("\nExtracted text : ", cleaned_text)

    for file_name, paraphrased in paraphrased_text.items():
        print(f"Paraphrased version of {file_name}:\n{paraphrased}\n{'-'*50}")

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

def paraphrase_text(input_text):
    """Paraphrase ``input_text`` with the ``tuner007/pegasus_paraphrase`` checkpoint.

    The tokenizer and model are loaded on every call. Input truncation and the
    generation length are both limited to ``model.config.max_position_embeddings``;
    requesting 1024 positions from this checkpoint overruns its position table
    and raises ``IndexError: index out of range in self``.

    Args:
        input_text: Text to paraphrase. Tokens beyond the model limit are
            dropped by truncation.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Load the Pegasus model and tokenizer
    model_name = "tuner007/pegasus_paraphrase"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # The position table bounds both the encoder input and the decoder output.
    max_positions = model.config.max_position_embeddings

    # Tokenize the input text
    inputs = tokenizer([input_text], max_length=max_positions, truncation=True, return_tensors="pt")

    # Generate paraphrased text (**inputs also forwards the attention mask)
    paraphrased_ids = model.generate(**inputs, max_length=max_positions, num_beams=5, early_stopping=True)
    paraphrased_text = tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)

    return paraphrased_text

def paraphrase_extracted_text(extracted_text):
    """Apply ``paraphrase_text`` to each value of a ``{file_name: text}`` dict.

    The returned dict is new and keeps the original keys.
    """
    return {
        file_name: paraphrase_text(text)
        for file_name, text in extracted_text.items()
    }

def click_paraphrase():
    """Menu handler: upload a PDF, then print its text and a paraphrase per file.

    ``upload_pdf_and_convert`` is expected to return a ``{file_name: text}``
    mapping (it is consumed with ``.items()`` downstream). When it returns
    nothing, the handler returns without paraphrasing.
    """
    print("\nParaphrase option chosen")
    menu_option = 5  # Assuming this is the menu option for paraphrasing
    extracted_text = upload_pdf_and_convert(menu_option)

    # Nothing extracted: skip the model load entirely.
    if not extracted_text:
        return

    paraphrased_text = paraphrase_extracted_text(extracted_text)

    # Iterate the values; iterating the dict itself yields file names, not text.
    cleaned_text = [text.replace('\n', '') for text in extracted_text.values()]
    print("\nExtracted text : ", cleaned_text)

    for file_name, paraphrased in paraphrased_text.items():
        print(f"Paraphrased version of {file_name}:\n{paraphrased}\n{'-'*50}")

# Assume the rest of your code remains the same


In [None]:
def main_menu():
    """Print the main menu and dispatch numbered choices until Exit is chosen.

    Choices 1, 2, 3 and 5 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification`` and ``click_paraphrase``. Choice 4 prints
    a goodbye message and returns. Any other integer, or input that raises
    ``ValueError``, prints an error and prompts again.
    """
    for menu_line in (
        "Welcome to the Main Menu",
        "1. Read Aloud",
        "2. Document Summary",
        "3. Document Classification",
        "4. Exit",
        "5. Paraphrase",
    ):
        print(menu_line)

    # Lambdas defer each handler lookup until its option is actually picked.
    actions = {
        1: lambda: click_read_aloud(),
        2: lambda: click_document_summary(),
        3: lambda: click_document_classification(),
        5: lambda: click_paraphrase(),
    }

    while True:
        try:
            choice = int(input("Please enter the option number: "))
            if choice == 4:
                print("Exiting the program. Goodbye!")
                return
            if choice in actions:
                actions[choice]()
            else:
                print("\n # Invalid option. Please enter a number between 1 and 5.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")

# Start the interactive menu; the call keeps prompting until option 4 (Exit) is entered.
main_menu()

Welcome to the Main Menu
1. Read Aloud
2. Document Summary
3. Document Classification
4. Exit
5. Paraphrase
Please enter the option number: 5

Paraphrase option chosen


Saving Untitled document-6.pdf to Untitled document-6 (2).pdf
Uploaded Untitled document-6 (2).pdf
PDF pages converted to images in the folder: output_images


loading file spiece.model from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/tokenizer_config.json
loading file tokenizer.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/config.json
Model config PegasusConfig {
  "_name_or_path": "tuner007/pegasus_paraphrase",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  

IndexError: index out of range in self

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

def paraphrase_text(input_text):
    """Paraphrase ``input_text`` with the ``tuner007/pegasus_paraphrase`` checkpoint.

    The tokenizer and model are loaded on every call. Input truncation and the
    generation length are both limited to ``model.config.max_position_embeddings``;
    feeding more positions than the checkpoint's position table holds raises
    ``IndexError: index out of range in self``.

    Args:
        input_text: Text to paraphrase. Tokens beyond the model limit are
            dropped by truncation.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Load the Pegasus model and tokenizer
    model_name = "tuner007/pegasus_paraphrase"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # The position table bounds both the encoder input and the decoder output.
    max_positions = model.config.max_position_embeddings

    # Tokenize the input text
    inputs = tokenizer([input_text], max_length=max_positions, truncation=True, return_tensors="pt")

    # Generate paraphrased text (**inputs also forwards the attention mask)
    paraphrased_ids = model.generate(**inputs, max_length=max_positions, num_beams=5, early_stopping=True)
    paraphrased_text = tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)

    return paraphrased_text

def paraphrase_extracted_text(extracted_text):
    """Return ``{file_name: paraphrase_text(text)}`` for every item of the input dict."""
    return {
        file_name: paraphrase_text(text)
        for file_name, text in extracted_text.items()
    }

def click_paraphrase():
    """Menu handler: upload a PDF, then print its text and a paraphrase per file.

    Returns early, printing nothing further, when ``upload_pdf_and_convert``
    yields an empty or falsy result.
    """
    print("\nParaphrase option chosen")
    # 5 is passed through as the menu option, as in the original handler.
    extracted_text = upload_pdf_and_convert(5)
    if not extracted_text:
        return

    paraphrased_text = paraphrase_extracted_text(extracted_text)

    # Strip line breaks from each extracted text before echoing it.
    cleaned_text = []
    for raw_text in extracted_text.values():
        cleaned_text.append(raw_text.replace('\n', ''))
    print("\nExtracted text : ", cleaned_text)

    rule = '-' * 50
    for file_name, paraphrased in paraphrased_text.items():
        print(f"Paraphrased version of {file_name}:\n{paraphrased}\n{rule}")

def main_menu():
    """Print the main menu and dispatch numbered choices until Exit is chosen.

    Choices 1, 2, 3 and 5 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification`` and ``click_paraphrase``. Choice 4 prints
    a goodbye message and returns. Any other integer, or input that raises
    ``ValueError``, prints an error and prompts again.
    """
    for menu_line in (
        "Welcome to the Main Menu",
        "1. Read Aloud",
        "2. Document Summary",
        "3. Document Classification",
        "4. Exit",
        "5. Paraphrase",
    ):
        print(menu_line)

    # Lambdas defer each handler lookup until its option is actually picked.
    actions = {
        1: lambda: click_read_aloud(),
        2: lambda: click_document_summary(),
        3: lambda: click_document_classification(),
        5: lambda: click_paraphrase(),
    }

    while True:
        try:
            choice = int(input("Please enter the option number: "))
            if choice == 4:
                print("Exiting the program. Goodbye!")
                return
            if choice in actions:
                actions[choice]()
            else:
                print("\n # Invalid option. Please enter a number between 1 and 5.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")

# Start the interactive menu; the call keeps prompting until option 4 (Exit) is entered.
main_menu()


Welcome to the Main Menu
1. Read Aloud
2. Document Summary
3. Document Classification
4. Exit
5. Paraphrase
Please enter the option number: 5

Paraphrase option chosen


NameError: name 'upload_pdf_and_convert' is not defined

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def paraphrase_text(input_text):
    """Paraphrase ``input_text`` with the ``tuner007/pegasus_paraphrase`` checkpoint.

    The tokenizer and model are loaded on every call. Input truncation and the
    generation length are both limited to ``model.config.max_position_embeddings``;
    requesting 1024 positions from this checkpoint overruns its position table
    and raises ``IndexError: index out of range in self``.

    Args:
        input_text: Text to paraphrase. Tokens beyond the model limit are
            dropped by truncation.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Load the paraphrasing model
    tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
    model = AutoModelForSeq2SeqLM.from_pretrained("tuner007/pegasus_paraphrase")

    # The position table bounds both the encoder input and the decoder output.
    max_positions = model.config.max_position_embeddings

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_positions, truncation=True)

    # Generate paraphrased text (**inputs also forwards the attention mask)
    paraphrased_ids = model.generate(**inputs, max_length=max_positions, num_return_sequences=1, early_stopping=True)
    paraphrased_text = tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)

    return paraphrased_text

def paraphrase_extracted_text(extracted_text):
    """Build a new dict mapping each file name to ``paraphrase_text`` of its text."""
    return {
        file_name: paraphrase_text(text)
        for file_name, text in extracted_text.items()
    }

def click_paraphrase():
    """Menu handler: upload a PDF, then print its text and a paraphrase per file.

    ``upload_pdf_and_convert`` is expected to return a ``{file_name: text}``
    mapping (it is consumed with ``.items()`` downstream). When it returns
    nothing, the handler returns without paraphrasing.
    """
    print("\nParaphrase option chosen")
    menu_option = 4
    extracted_text = upload_pdf_and_convert(menu_option)

    # Nothing extracted: skip the model load entirely.
    if not extracted_text:
        return

    paraphrased_text = paraphrase_extracted_text(extracted_text)

    # Iterate the values; iterating the dict itself yields file names, not text.
    cleaned_text = [text.replace('\n', '') for text in extracted_text.values()]
    print("\nExtracted text : ", cleaned_text)

    for file_name, paraphrased in paraphrased_text.items():
        print(f"Paraphrased version of {file_name}:\n{paraphrased}\n{'-'*50}")

def main_menu():
    """Print the main menu and dispatch numbered choices until Exit is chosen.

    Choices 1-4 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification`` and ``click_paraphrase``. Choice 5 calls
    ``exit_menu_page`` and returns. Any other integer, or input that raises
    ``ValueError``, prints an error and prompts again.
    """
    for menu_line in (
        "Welcome to the Main Menu",
        "1. Read Aloud",
        "2. Document Summary",
        "3. Document Classification",
        "4. Paraphrase",
        "5. Exit",
    ):
        print(menu_line)

    # Lambdas defer each handler lookup until its option is actually picked.
    actions = {
        1: lambda: click_read_aloud(),
        2: lambda: click_document_summary(),
        3: lambda: click_document_classification(),
        4: lambda: click_paraphrase(),
    }

    while True:
        try:
            choice = int(input("Please enter the option number: "))
            if choice == 5:
                exit_menu_page()
                return
            if choice in actions:
                actions[choice]()
            else:
                print("\n # Invalid option. Please enter a number between 1 and 5.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")

# Start the interactive menu; the call keeps prompting until option 5 (Exit) is entered.
main_menu()


In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the BART tokenizer and model once at module level; paraphrase_text below reads these globals.
# NOTE(review): "facebook/bart-large-cnn" looks like a summarization checkpoint — confirm it suits paraphrasing.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

def paraphrase_text(text):
    """Generate a rewritten version of ``text`` with the module-level BART model.

    The input is encoded with truncation at 1024 tokens, one sequence of at
    most 150 tokens is generated, and it is decoded without special tokens.
    """
    encode_options = {"return_tensors": "pt", "max_length": 1024, "truncation": True}
    generate_options = {"num_return_sequences": 1, "max_length": 150, "early_stopping": True}

    encoded = tokenizer.encode(text, **encode_options)
    generated = model.generate(encoded, **generate_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Your existing code for document processing functions goes here...

# Modify main menu function to include paraphrasing option
def main_menu():
    """Print the main menu and dispatch numbered choices until Exit is chosen.

    Choices 1, 2, 3 and 5 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification`` and ``paraphrase_menu``. Choice 4 calls
    ``exit_menu_page`` and returns. Any other integer, or input that raises
    ``ValueError``, prints an error and prompts again.
    """
    for menu_line in (
        "Welcome to the Main Menu",
        "1. Read Aloud",
        "2. Document Summary",
        "3. Document Classification",
        "4. Exit",
        "5. Paraphrase",
    ):
        print(menu_line)

    # Lambdas defer each handler lookup until its option is actually picked.
    actions = {
        1: lambda: click_read_aloud(),
        2: lambda: click_document_summary(),
        3: lambda: click_document_classification(),
        5: lambda: paraphrase_menu(),
    }

    while True:
        try:
            choice = int(input("Please enter the option number: "))
            if choice == 4:
                exit_menu_page()
                return
            if choice in actions:
                actions[choice]()
            else:
                print("\n # Invalid option. Please enter a number between 1 and 5.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")

# Add function for paraphrasing menu
def paraphrase_menu():
    """Menu handler: upload a PDF and print one paraphrase of all its text.

    The texts returned by ``upload_pdf_and_convert`` have their line breaks
    replaced by spaces, are joined with single spaces, and are passed to
    ``paraphrase_text`` as one string.
    """
    print("\nParaphrase option chosen\n")
    # Option 1 is passed through, as in the original handler.
    extracted_text = upload_pdf_and_convert(1)

    combined_text = ' '.join(
        page_text.replace('\n', ' ') for page_text in extracted_text.values()
    )

    print("\nParaphrased text:\n", paraphrase_text(combined_text))

# Your existing code for other functions goes here...

# Start the interactive menu; the call keeps prompting until option 4 (Exit) is entered.
main_menu()


In [None]:
# Import necessary modules for paraphrasing
from transformers import pipeline

# Define function for paraphrasing
def paraphrase_text(text):
    """Run ``text`` through a ``t5-small`` text2text-generation pipeline.

    A new pipeline is built on every call. The ``generated_text`` of the first
    (and only requested) result is returned.
    """
    generator = pipeline("text2text-generation", model="t5-small", tokenizer="t5-small")
    results = generator(text, max_length=1000, num_return_sequences=1)
    first_result = results[0]
    return first_result['generated_text']

# Modify main menu function to include paraphrasing option
def main_menu():
    """Print the main menu and dispatch numbered choices until Exit is chosen.

    Choices 1, 2, 3 and 5 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification`` and ``paraphrase_menu``. Choice 4 calls
    ``exit_menu_page`` and returns. Any other integer, or input that raises
    ``ValueError``, prints an error and prompts again.
    """
    for menu_line in (
        "Welcome to the Main Menu",
        "1. Read Aloud",
        "2. Document Summary",
        "3. Document Classification",
        "4. Exit",
        "5. Paraphrase",
    ):
        print(menu_line)

    # Lambdas defer each handler lookup until its option is actually picked.
    actions = {
        1: lambda: click_read_aloud(),
        2: lambda: click_document_summary(),
        3: lambda: click_document_classification(),
        5: lambda: paraphrase_menu(),
    }

    while True:
        try:
            choice = int(input("Please enter the option number: "))
            if choice == 4:
                exit_menu_page()
                return
            if choice in actions:
                actions[choice]()
            else:
                print("\n # Invalid option. Please enter a number between 1 and 5.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")

# Add function for paraphrasing menu
def paraphrase_menu():
    """Menu handler: upload a PDF and print one paraphrase of all its text.

    The texts returned by ``upload_pdf_and_convert`` have their line breaks
    replaced by spaces, are joined with single spaces, and are passed to
    ``paraphrase_text`` as one string.
    """
    print("\nParaphrase option chosen\n")
    # Option 1 is passed through, as in the original handler.
    extracted_text = upload_pdf_and_convert(1)

    combined_text = ' '.join(
        page_text.replace('\n', ' ') for page_text in extracted_text.values()
    )

    print("\nParaphrased text:\n", paraphrase_text(combined_text))

# Start the interactive menu; the call keeps prompting until option 4 (Exit) is entered.
main_menu()


Welcome to the Main Menu
1. Read Aloud
2. Document Summary
3. Document Classification
4. Exit
5. Paraphrase
Please enter the option number: 5

Paraphrase option chosen



Saving Untitled document-6.pdf to Untitled document-6 (1).pdf
Uploaded Untitled document-6 (1).pdf
PDF pages converted to images in the folder: output_images


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/model.safetensors
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}




Paraphrased text:
 ,, do the needful to deactivate my MTNL landline connection, but I do not require this landline connection anymore.


KeyboardInterrupt: Interrupted by user

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast

# Initialize the Pegasus model and tokenizer once at module level;
# get_paraphrased_sentences below reads these two globals on every call.
model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")

def get_paraphrased_sentences(sentence, num_return_sequences=5, num_beams=5):
    """Return beam-search paraphrases of one sentence.

    Args:
        sentence: The text to paraphrase; it is tokenized with truncation.
        num_return_sequences: How many candidates to request from ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded candidates as a list of strings, special tokens removed.
    """
    encoded = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
    beam_options = {
        "num_beams": num_beams,
        "num_return_sequences": num_return_sequences,
        "early_stopping": True,
    }
    candidates = model.generate(**encoded, **beam_options)
    return tokenizer.batch_decode(candidates, skip_special_tokens=True)

def paraphrase_sentences(sentences):
    """Paraphrase each sentence and return all candidates in one flat list.

    Candidates keep the order of ``sentences``; each sentence contributes
    whatever ``get_paraphrased_sentences`` returns for it.
    """
    return [
        candidate
        for sentence in sentences
        for candidate in get_paraphrased_sentences(sentence)
    ]

def final_paraphrased():
    """Upload a PDF, echo its extracted text, and return paraphrases of it.

    ``upload_pdf_and_convert`` is expected to return a ``{file_name: text}``
    mapping; a missing (``None``) or empty result is treated as no text.

    Returns:
        The list produced by ``paraphrase_sentences`` for the cleaned texts.
    """
    menu_option = 5
    extracted_text = upload_pdf_and_convert(menu_option)

    # Iterate the values; iterating the dict itself yields file names, not text.
    cleaned_text = [text.replace('\n', '') for text in (extracted_text or {}).values()]
    print("\nExtracted text : ", cleaned_text)

    # Paraphrase the extracted sentences
    paraphrased_text = paraphrase_sentences(cleaned_text)
    return paraphrased_text

In [None]:
# from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
# import pytesseract
# from PIL import Image

# # Initialize the Pegasus model and tokenizer
# model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
# tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")

# def perform_ocr(pdf_path):
#     images = convert_pdf_to_images(pdf_path)
#     extracted_text = []
#     for img_path in images:
#         extracted_text.append(pytesseract.image_to_string(Image.open(img_path)))
#     return ' '.join(extracted_text)

# def get_paraphrased_sentences(sentence, num_return_sequences=5, num_beams=5):
#     # Generate paraphrased sentences
#     inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
#     outputs = model.generate(
#         **inputs,
#         num_beams=num_beams,
#         num_return_sequences=num_return_sequences,
#         early_stopping=True
#     )
#     # Decode the generated sentences using the tokenizer
#     return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# def paraphrase_sentences(sentences):
#     paraphrased_sentences = []
#     for sentence in sentences:
#         paraphrased_sentences.extend(get_paraphrased_sentences(sentence))
#     return paraphrased_sentences

# def final_paraphrased():
#     menu_option = 5
#     pdf_path = "example.pdf"  # Update with your PDF path
#     extracted_text = perform_ocr(pdf_path)
#     print("\nExtracted text : ", extracted_text)

#     # Paraphrase the extracted text
#     paraphrased_text = paraphrase_sentences([extracted_text])
#     return paraphrased_text

# # Update main_menu() function
# def main_menu():
#     print("Welcome to the Main Menu")
#     print("1. Read Aloud")
#     print("2. Document Summary")
#     print("3. Document Classification")
#     print("4. Exit")
#     print("5. Paraphrase")

#     while True:
#         try:
#             option = int(input("Please enter the option number: "))
#             if option == 1:
#                 click_read_aloud()
#             elif option == 2:
#                 click_document_summary()
#             elif option == 3:
#                 click_document_classification()
#             elif option == 4:
#                 exit_menu_page()
#                 break  # Exit the loop
#             elif option == 5:
#                 paraphrased_text = final_paraphrased()
#                 print("Paraphrased Text:")
#                 for sentence in paraphrased_text:
#                     print(sentence)
#             else:
#                 print("\n # Invalid option. Please enter a number between 1 and 5.")
#         except ValueError:
#             print("\n # Invalid input. Please enter a number.")

# # Calling the menu function
# main_menu()


# Main Menu

In [None]:
# Update main_menu() function
def main_menu():
    """Print the seven-option main menu and dispatch choices until Exit.

    Options 1-6 call ``click_read_aloud``, ``click_document_summary``,
    ``click_document_classification``, ``qna``, ``paraphrase_text_final`` and
    ``click_check_grammar``. Option 7 calls ``exit_menu_page`` and leaves the
    loop. Any other integer, or input that raises ``ValueError``, prints an
    error and prompts again.
    """
    print("Welcome to the Main Menu")
    print("1. Read Aloud")
    print("2. Document Summary")
    print("3. Document Classification")
    print("4. Question Answering")
    print("5. Paraphrase")
    print("6. Grammar Check")
    print("7. Exit")

    while True:
        try:
            option = int(input("Please enter the option number: "))
            if option == 1:
                click_read_aloud()
            elif option == 2:
                click_document_summary()
            elif option == 3:
                click_document_classification()
            elif option == 4:
                qna()
            elif option == 5:
                paraphrase_text_final()
            elif option == 6:
                click_check_grammar()
            elif option == 7:
                exit_menu_page()
                break  # Exit the loop
            else:
                # The menu has seven entries, so the valid range is 1-7.
                print("\n # Invalid option. Please enter a number between 1 and 7.")
        except ValueError:
            # Also reached when a handler itself raises ValueError.
            print("\n # Invalid input. Please enter a number.")
def exit_menu_page():
    """Print the exit notice ("Exiting!!")."""
    print("Exiting!!")

# Start the interactive menu; the call keeps prompting until option 7 (Exit) is entered.
main_menu()


Welcome to the Main Menu
1. Read Aloud
2. Document Summary
3. Document Classification
4. Question Answering
5. Paraphrase
6. Grammar Check
7. Exit
Please enter the option number: 4


Saving republic day.pdf to republic day (2).pdf
Uploaded republic day (2).pdf
PDF pages converted to images in the folder: output_images


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
!pip install -U langchain-openai
from langchain_openai import OpenAIEmbeddings

Collecting langchain-openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.7


In [None]:
from difflib import SequenceMatcher

def calculate_similarity(str1, str2):
    """Return the ``difflib.SequenceMatcher`` ratio of the two strings.

    The ratio is 1.0 for identical inputs and 0.0 when nothing matches; no
    junk filter is used.
    """
    matcher = SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()

def calculate_accuracy(ground_truth_answers, generated_answers, confidence_scores, similarity_threshold=0.0):
    """Score generated answers against ground truth by string similarity.

    Args:
        ground_truth_answers: ``{question: reference answer}``.
        generated_answers: ``{question: generated answer}``; questions with a
            missing or empty answer are never counted as correct.
        confidence_scores: ``{question: confidence}``; entries are optional.
        similarity_threshold: Minimum ``calculate_similarity`` score for an
            answer to count as correct. The default of 0.0 keeps the original
            behaviour, in which every non-empty answer counts; pass a higher
            value to make the accuracy meaningful.

    Returns:
        ``(accuracy, confidence_per_question)``: accuracy is a percentage of
        the ground-truth questions, and the dict holds the confidence of each
        answer counted as correct. With no ground-truth questions the result
        is ``(0.0, {})``.
    """
    total_questions = len(ground_truth_answers)
    # Avoid dividing by zero when there is nothing to evaluate.
    if total_questions == 0:
        return 0.0, {}

    correct_answers = 0
    confidence_per_question = {}

    for question, ground_truth_answer in ground_truth_answers.items():
        generated_answer = generated_answers.get(question)
        if not generated_answer:
            continue

        # Compare case-insensitively.
        similarity_score = calculate_similarity(generated_answer.lower(), ground_truth_answer.lower())
        print("Similarity Score:", similarity_score)
        if similarity_score >= similarity_threshold:
            correct_answers += 1
            confidence_score = confidence_scores.get(question)
            if confidence_score is not None:
                confidence_per_question[question] = confidence_score

    accuracy = (correct_answers / total_questions) * 100
    return accuracy, confidence_per_question


# Example usage: interactively collect question / answer pairs, then report
# the accuracy computed by calculate_accuracy.

# All three dicts are keyed by the question text typed by the user.
ground_truth_answers = {}
generated_answers = {}
confidence_scores = {}

# Get user-generated data
num_questions = int(input("Enter the number of questions: "))
for i in range(num_questions):
    question = input("Enter the question: ")
    ground_truth_answer = input("Enter the ground truth answer: ")
    generated_answer = input("Enter the generated answer: ")
    # Placeholder confidence: a uniformly random value in [0, 1], not derived from the answers.
    confidence_score = random.uniform(0, 1)

    # Entering the same question twice overwrites the earlier entry.
    ground_truth_answers[question] = ground_truth_answer
    generated_answers[question] = generated_answer
    confidence_scores[question] = confidence_score

# Calculate accuracy and confidence score for each question
accuracy, confidence_per_question = calculate_accuracy(ground_truth_answers, generated_answers, confidence_scores)
print("Accuracy: {:.2f}%".format(accuracy))
print("Confidence Scores:")
for question, confidence_score in confidence_per_question.items():
    print("- Question:", question)
    print("  Confidence Score:", confidence_score)


Enter the number of questions: 1
Enter the question: what is the main objective of the project
Enter the ground truth answer: The main objective of the Document Assistant is to expand its capabilities, transforming it into a more versatile and robust tool. The focus will be on refining the existing features and introducing new functionalities to cover a wider spectrum of document processing tasks.
Enter the generated answer: The objective of the project is to expand the capabilities of the Document Assistant, transforming it into a more versatile and robust tool for document processing and management. This will be achieved by refining existing features and introducing new functionalities to cover a wider spectrum of document processing tasks. The proposed methodology involves creating a robust framework that can integrate advanced document processing and management capabilities, such as a classification model with improved accuracy, an interactive query system, and a grammar checking t

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np

def calculate_semantic_similarity(str1, str2):
    """
    Return the cosine similarity between the sentence embeddings of two strings.

    A 'distilbert-base-nli-mean-tokens' SentenceTransformer is created on every
    call and each string is encoded separately.
    """
    encoder = SentenceTransformer('distilbert-base-nli-mean-tokens')
    vector_a = encoder.encode([str1])[0]
    vector_b = encoder.encode([str2])[0]

    # Cosine similarity: dot product divided by the product of the norms.
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return np.dot(vector_a, vector_b) / norm_product

def calculate_confidence_score(generated_answer, ground_truth_answer):
    """
    Turn the semantic similarity of two answers into a 0-100 style score.

    Both answers are lower-cased, compared with calculate_semantic_similarity,
    and the resulting cosine similarity is rescaled linearly.
    """
    normalized_generated = generated_answer.lower()
    normalized_truth = ground_truth_answer.lower()
    cosine = calculate_semantic_similarity(normalized_generated, normalized_truth)
    # Shift the cosine range [-1, 1] to [0, 2], then stretch it to [0, 100].
    return (cosine + 1) * 50


# Example usage: collect question/answer pairs interactively, then score them.
ground_truth_answers, generated_answers = {}, {}

# Prompt for each question together with its reference and generated answers
num_questions = int(input("Enter the number of questions: "))
for _ in range(num_questions):
    question = input("Enter the question: ")
    reference_text, candidate_text = (
        input("Enter the ground truth answer: "),
        input("Enter the generated answer: "),
    )
    # Both mappings are keyed by the question text
    ground_truth_answers[question] = reference_text
    generated_answers[question] = candidate_text

# Score every generated answer against its reference (empty string if none is stored)
confidence_scores = {}
for question in generated_answers:
    confidence_scores[question] = calculate_confidence_score(
        generated_answers[question],
        ground_truth_answers.get(question, ""),
    )

# Show the score obtained for each question
print("Confidence Scores:")
for question, score in confidence_scores.items():
    print("- Question:", question)
    print("  Confidence Score:", score)


Enter the number of questions: 1
Enter the question: Ethical Issues and Safety in the Use of considerations in clinical decision support?
Enter the ground truth answer: Questions arise about how much doctors should rely on CDSS and who's accountable if something goes wrong. ● There's debate about whether patients should give permission before CDSS are used and how to handle breaches of patient confidentiality. ● It's tricky to balance patient privacy with the need for healthcare providers to access medical records. ● Laws and regulations help, but there's still no clear consensus on the best way to use CDSS without causing ethical problems.
Enter the generated answer: Regulatory Compliance
Confidence Scores:
- Question: Ethical Issues and Safety in the Use of considerations in clinical decision support?
  Confidence Score: 55.460911989212036


In [None]:
!pip show google-colab


Name: google-colab
Version: 1.0.0
Summary: Google Colaboratory tools
Home-page: https://colaboratory.research.google.com/
Author: Google Colaboratory team
Author-email: colaboratory-team@google.com
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: google-auth, ipykernel, ipython, notebook, pandas, portpicker, requests, tornado
Required-by: 
