# Import relevant libraries

In [1]:
# ! pip install PyPDF2 pytesseract 
# ! pip install pymupdf
# ! pip install langchain
# ! pip install pathlib
# ! pip install sentence-transformers
# ! pip install glob2

In [1]:
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
import os

import re
import glob
# from pathlib import Path

from langchain.text_splitter import TextSplitter, SentenceTransformersTokenTextSplitter

# Defining functions 

In [12]:
def extract_text_with_ocr(pdf_path):
    text = ""
    pdf = PyPDF2.PdfReader(pdf_path)

    # Convert PDF pages to images using pdf2image
    images = convert_from_path(pdf_path, poppler_path = r"C:\Users\User\Desktop\pdf_parser\venv_pdf\Release-23.01.0-0\poppler-23.01.0\Library\bin")

    pytesseract.pytesseract.tesseract_cmd = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
    
    # Iterate over each page
    for page_num, image in enumerate(images):
        # Perform OCR on the image using pytesseract
        page_text = pytesseract.image_to_string(image, lang='eng')

        # Identify and exclude text from diagrams
        page_text = remove_diagram_text(page_text)

        # Append the extracted text to the overall text
        text += page_text

    return text

def remove_diagram_text(page_text):
    # Add your custom logic to identify and remove text from diagrams
    # You can use regex, string operations, or other techniques to identify and exclude text from diagrams

    # For example, you can define a list of keywords or patterns commonly found in diagram text
    diagram_keywords = ['diagram', 'chart', 'figure', 'graph']

    # Split the page text into lines
    lines = page_text.split('\n')

    # Iterate over each line and exclude lines containing diagram keywords
    filtered_lines = [line for line in lines if not any(keyword in line.lower() for keyword in diagram_keywords)]

    # Join the filtered lines back into a single string
    filtered_text = '\n'.join(filtered_lines)

    return filtered_text

# def load_docs(filename):
#     document = ''
#     loader = PyPDF2.PdfReader(filename)

#     # finding max pages
#     doc_page = len(loader.pages)

#     #iterate through the pages and combine
#     for i in range(doc_page):
#       page = loader.pages[i]
#       document += page.extract_text() + ""
#     return document

def text_chunker(file) :
  # initiate TextSplitter class and input chunk size and overlap
  text_splitter = SentenceTransformersTokenTextSplitter(
      model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
      tokens_per_chunk = 200,
      chunk_overlap = 30
  )

  # pass in the file to be split
  # output is iterable
  texts = text_splitter.split_text(file)
  return texts

def clean_text(text) :
    # replace /n with whitespace
    # text = text.replace("\n","")

    # replace double whitespace with single whitespace    
    text = text.replace("  ", " ")

    # remove [i], where i is the number inside of a reference point throughout the research paper
    text = re.sub("\[.*?\]", '', text)

    # exclude the References portion of the research paper
    # some References are not exact (due to pdf rendering or page text alignment) so we need to do fuzzy matching
    token_end = "ACKNOWLEDGEMENTS"
    try : 
        stripped_text_v1 = text.split(token_end, 1)[0]
    except : 
        token_end = "Acknowledgements"
        stripped_text_v1 = text.split(token_end, 1)[0]

    # exclude the Introduction portion of the research paper
    # some Introduction are not exact (due to pdf rendering or page text alignment) so we need to do fuzzy matching
    token_intro = "ABSTRACT"
    try :
        stripped_text_v2 = stripped_text_v1.split(token_intro, 1)[1]
    except :
        token_intro = "Abstract"
        stripped_text_v2 = stripped_text_v1.split(token_intro, 1)[1]
  

    return stripped_text_v2

## Define a path to a folder with all of the PDF files

In [13]:
path = 'pdf_files/*.pdf'
pdf_list = glob.glob(path)

In [14]:
pdf_list

['research_paper\\ECGBERT.pdf',
 'research_paper\\enhance_instruction.pdf',
 'research_paper\\FLAN.pdf',
 'research_paper\\FLANv2.pdf',
 'research_paper\\FLAN_betterdesign.pdf',
 'research_paper\\HomoGCL.pdf',
 'research_paper\\KiDS-1000.pdf',
 'research_paper\\LoRA.pdf',
 'research_paper\\Med-MMHL.pdf',
 'research_paper\\MIXALIME.pdf',
 'research_paper\\MoleCLUEs.pdf',
 'research_paper\\OpenGSL.pdf',
 'research_paper\\PoET.pdf',
 'research_paper\\self_instruct.pdf',
 'research_paper\\SpreadDetect.pdf',
 'research_paper\\UltraLlama.pdf']

## Creating prompts 

In [15]:
final_chunk_list = []

prompt = "Based on the text, can you generate 5 different question and answer pairs in the following format.\nAnswer format: \n1. QUESTION : {vicuna to insert question}, \nANSWER: {vicuna to insert answer} \nText :"


for pdf in pdf_list :
    # load pdf
    document = extract_text_with_ocr(pdf)

    # preprocess
    clean_doc = clean_text(document)

  # chunk doc
    chunked_text_list = text_chunker(clean_doc)

    for chunk in chunked_text_list :
        final_chunk = prompt + "\n" + chunk
        final_chunk_list.append(final_chunk)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)16ebc/.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 82.5kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 11.3kB/s]
Downloading (…)b6b5d16ebc/README.md: 8.65kB [00:00, 1.08MB/s]
Downloading (…)b5d16ebc/config.json: 100%|██████████| 571/571 [00:00<00:00, 32.8kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 12.9kB/s]
Downloading (…)ebc/data_config.json: 25.5kB [00:00, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:22<00:00, 19.3MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 4.02kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 29.8kB/s]
Downloading (…)16ebc/tokenizer.json: 466kB [00:00, 1.24MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 45.8kB/s]
Downloading (…)6ebc/train_script.py: 13.9kB [00:00, ?B/s]
Downloading (…)b6b5d16eb

In [16]:
final_chunk_list[0]

'Based on the text, can you generate 10 different question and answer pairs in the following format.\nAnswer format: \n1. QUESTION : {vicuna to insert question}, \nANSWER: {vicuna to insert answer} \nText :\nin the medical field, current ecg signal analysis approaches rely on supervised deep neural networks trained for specific tasks that require substantial amounts of labeled data. however, our paper introduces ecgbert, a self - supervised representation learning approach that unlocks the underlying language of ecgs. by unsupervised pre - training of the model, we mitigate challenges posed by the lack of well - labeled and curated medical data. ecgbert, inspired by advances in the area of natural language processing and large language models, can be fine - tuned with minimal additional layers for various ecg - based problems. through four tasks, including atrial fibrillation arrhythmia detection, heartbeat classification, sleep apnea detection, and user authentication, we demonstrate 

In [14]:
pdf = extract_text_with_ocr('research_paper/FLAN.pdf')
pdf_clean = clean_text(pdf)
print(pdf_clean)



This paper explores a simple method for improving the zero-shot learning abilities
of language models. We show that instruction tuning—finetuning language models
on a collection of datasets described via instructions—substantially improves zero-
shot performance on unseen tasks.

We take a 137B parameter pretrained language model and instruction tune it on
over 60 NLP datasets verbalized via natural language instruction templates. We
evaluate this instruction-tuned model, which we call FLAN, on unseen task types.
FLAN substantially improves the performance of its unmodified counterpart and
surpasses zero-shot 175B GPT-3 on 20 of 25 datasets that we evaluate. FLAN even
outperforms few-shot GPT-3 by a large margin on ANLI, RTE, BoolQ, AI2-ARC,
OpenbookQA, and StoryCloze. Ablation studies reveal that number of finetuning
datasets, model scale, and natural language instructions are key to the success of
instruction tuning.

Finetune on many tasks (“instruction-tuning”)

Input (Commonsens

In [10]:
print(extract_text_with_ocr('research_paper/FLAN.pdf'))

Published as a conference paper at ICLR 2022

FINETUNED LANGUAGE MODELS ARE ZERO-SHOT
LEARNERS

Jason Wei*, Maarten Bosma*, Vincent Y. Zhao*, Kelvin Guu*, Adams Wei Yu,
Brian Lester, Nan Du, Andrew M. Dai, and Quoc V. Le

Google Research

ABSTRACT

This paper explores a simple method for improving the zero-shot learning abilities
of language models. We show that instruction tuning—finetuning language models
on a collection of datasets described via instructions—substantially improves zero-
shot performance on unseen tasks.

We take a 137B parameter pretrained language model and instruction tune it on
over 60 NLP datasets verbalized via natural language instruction templates. We
evaluate this instruction-tuned model, which we call FLAN, on unseen task types.
FLAN substantially improves the performance of its unmodified counterpart and
surpasses zero-shot 175B GPT-3 on 20 of 25 datasets that we evaluate. FLAN even
outperforms few-shot GPT-3 by a large margin on ANLI, RTE, BoolQ, AI2-ARC,

# Dumping the outputs into a JSON file

In [17]:
import json

with open('output.json', 'w') as f:
    json.dump(final_chunk_list, f, indent=4)