# Import libraries

In [1]:
# !pip install llama-index
# !pip install langchain
# !pip -q install huggingface chromadb transformers langchain InstructorEmbedding
# !pip install PyPDF2
# !pip install peft
# !pip install git+https://github.com/huggingface/transformers
# !pip install pypdf
# !pip install pathlib
# !pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting llama-index
  Downloading llama_index-0.6.26-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.2/510.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama-index)
  Downloading dataclasses_json-0.5.8-py3-none-any.whl (26 kB)
Collecting langchain>=0.0.154 (from llama-index)
  Downloading langchain-0.0.200-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sqlalchemy>=2.0.15 (from llama-index)
  Downloading SQLAlchemy-2.0.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=0.26.4 (from llama-index)
  Downloading openai-0.27.8-py3-none-a

In [2]:
import PyPDF2
import re
import os 
import nltk
import glob
from pathlib import Path

from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline, AutoTokenizer, AutoConfig
from llama_index import SimpleDirectoryReader, ServiceContext, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
#from langchain.indexes import VectorstoreIndexCreator
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TextSplitter, SentenceTransformersTokenTextSplitter
import sys
from IPython.display import Markdown, display
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
import time

llama_model = None
llama_tokenizer = None
device = "cuda" #"cpu"
hfEmbed = None
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Read pdf files and iterate through the pdf files 

In [3]:
def load_docs(filename):
    document = ''
    loader = PyPDF2.PdfReader(filename)
    
    # finding max pages
    doc_page = len(loader.pages)

    #iterate through the pages and combine
    for i in range(doc_page):
      page = loader.pages[i]
      document += page.extract_text() + ""
    return document


def clean_text(text) :
  # replace /n with whitespace
  text = text.replace("\n","")

  # replace double whitespace with single whitespace
  text = text.replace("  ", " ")

  # remove [i], where i is the number inside of a reference point throughout the research paper
  text = re.sub("\[.*?\]", '', text)

  # exclude the References portion of the research paper
  # some References are not exact (due to pdf rendering or page text alignment) so we need to do fuzzy matching
  token = "REFERENCES"
  stripped_text = text.split(token, 1)[0]


  return stripped_text



def text_chunker(file) :
  # initiate TextSplitter class and input chunk size and overlap
  text_splitter = SentenceTransformersTokenTextSplitter(
      model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
      tokens_per_chunk = 500,
      chunk_overlap = 30
  )

  # pass in the file to be split
  # output is iterable
  texts = text_splitter.split_text(file)
  return texts


# get a list of all the pdf files 
# change file path accordingly
path = '/content/pdf_files/*.pdf'
pdf_list = glob.glob(path) 

## Steps :

1.   Iterate list of PDF files

2.   Preprocess the PDF document

3.   Chunk the document

4.   Store the preprocessed chunks into a list


In [4]:
final_chunk_list = []

prompt = "Based On the text, can you generate 10 different question and answer pairs in the following format.\nAnswer format: \n1. QUESTION : {vicuna to insert question}, \nANSWER: {vicuna to insert answer} \nText :"


for pdf in pdf_list :
  # load pdf
  document = load_docs(pdf)

  # preprocess
  clean_doc = clean_text(document)

  # chunk doc
  chunked_text_list = text_chunker(clean_doc)

  for chunk in chunked_text_list :
    final_chunk = prompt + "\n" + chunk
    final_chunk_list.append(final_chunk)

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [5]:
print(final_chunk_list[0])

Based On the text, can you generate 10 different question and answer pairs in the following format.
 Answer format: 
 1. QUESTION : {vicuna to insert question}, 
 ANSWER: {vicuna to insert answer} 
 Text :
the flan collection : designing data and methodsfor [UNK] instruction tuningshayne longpre∗le hou tu vu albert webson hyung won chungyi tay denny zhou quoc v. le barret zoph jason wei adam robertsgoogle researchabstractwe study the design decisions of publicly available instruction tuning methods, and break down thedevelopment of flan 2022 models ( chung et al., 2022 ). through careful ablation studies on the flancollection of instruction tuning tasks and methods, [UNK] - t5 to outperform prior work by 3 - 17 % + across evaluation settings. we ﬁnd task balancing and enrichmenttechniques are overlooked but critical to [UNK] instruction tuning, and in particular, training with mixedpromptsettings ( zero - shot, few - shot, andchain - of - thought ) actuallyyieldsstronger ( 2 % + ) perf

In [6]:
import json 

with open('output.json', 'w') as f:
    json.dump(final_chunk_list, f, indent=4)