Question Answer from book

In [1]:
import os
import glob
import textwrap
import time

import langchain

# loaders
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS

# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print('LangChain:', langchain.__version__)

  from tqdm.autonotebook import trange


LangChain: 0.0.331


In [5]:
from PyPDF2 import PdfReader
from langchain.schema import Document


reader = PdfReader("books/Cambridge IGCSE and O Level Computer Science.pdf")
    
included_pages_intervals = [[14, 52],
                 [57, 82],
                 [87, 155],
                 [159, 188],
                 [192, 225],
                 [229, 264],
                 [270, 306],
                 [311, 348],
                 [351, 365],
                 [368, 393]]

included_pages = []
for interval in included_pages_intervals:
    l = list(range(interval[0], interval[1]+1))
    included_pages = included_pages + l


def include_page(page_number):
    one_based_page_number = page_number + 1
    if one_based_page_number in included_pages:
        return True
    else:
        return False

parts = []
def visitor_body(text, cm, tm, fontDict, fontSize):
    if fontDict is not None and '/ILTBBB+OfficinaSansStd' in fontDict['/BaseFont']:
        parts.append(text)

def extract_single_page(page):
    page.extract_text(visitor_text=visitor_body),
    text_body = "".join(parts)
    text_body = text_body.replace('\n', ' ')
    return text_body


def extract_pages(pdf_reader, source):
    documents = []
    
    for page_number, page in enumerate(pdf_reader.pages):
        if include_page(page_number):
            doc = Document(
                    page_content = extract_single_page(page),
                    metadata={"source": source, "page": page_number},
                    ) 
            if len(doc.page_content) > 100:
                documents.append(doc)
            else:
                pass
                # print('dropped page content: ' + doc.page_content)
            global parts
            parts =[]
    return documents




documents = extract_pages(reader, "Cambridge IGCSE and O Level Computer Science.pdf")

print('pages extracted: ' + str(len(documents)))


pages extracted: 263


In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 0
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 547 chunks from 263 pages


In [11]:
# %%time

# ### download embeddings model
# embeddings = HuggingFaceInstructEmbeddings(
#     model_name = 'sentence-transformers/all-MiniLM-L6-v2',
#     model_kwargs = {"device": "cpu"}
# )

# ### create embeddings and DB
# vectordb = FAISS.from_documents(
#     documents = texts, 
#     embedding = embeddings
# )

# ### persist vector database
# vectordb.save_local("faiss_index_hp")

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 43.6 s, sys: 3.53 s, total: 47.1 s
Wall time: 36.2 s


In [21]:
%%time

### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs = {"device": "cpu"}
)

### load vector DB embeddings
vectordb : FAISS = FAISS.load_local(
    "faiss_index_hp",
    embeddings
)

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 310 ms, sys: 117 ms, total: 427 ms
Wall time: 461 ms


In [22]:
### test if vector DB was loaded correctly
results = vectordb.similarity_search('error detection')
results

[Document(page_content='When data is transmitted, there is always a risk that it may be corrupted, lost or  even gained. Errors can occur during data transmission due to:  interference (all types of cable can suffer from electrical interference, which  can cause data to be corrupted or even lost) problems during packet switching (this can lead to data loss – or it is even possible to gain data!)  skewing of data (this occurs during parallel data transmission and can cause data corruption if the bits arrive out of synchronisation). Checking for errors is important since computers are unable to understand text, for example, if the words are not recognised by its built-in dictionary. Look at the following example of some corrupted text:   Whilst you probably had little problem understanding this text, a computer', metadata={'source': 'Cambridge IGCSE and O Level Computer Science.pdf', 'page': 65}),
 Document(page_content='of the error and are usually automatically generated by the compute

In [23]:
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [25]:
from langchain.memory.vectorstore import VectorStoreRetriever


retriever : VectorStoreRetriever = vectordb.as_retriever(search_kwargs = {"k": 3, "search_type" : "similarity"})



In [47]:
query = 'What is parity check'

docs = retriever.get_relevant_documents(query)
merged_context = ''
for doc in docs:
    merged_context = merged_context + ' ' + doc.page_content

final_prompt = prompt_template.format(context=merged_context, question=query)
print(final_prompt)

import pyperclip
pyperclip.copy(final_prompt)


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

 Parity checking is one method used to check whether data has been changed or  corrupted following data transmission. This method is based on the number of 1-bits in a byte of data. The parity can be either called EVEN  (that is, an even number of 1-bits in the  byte) or ODD  (that is, an odd number of 1-bits in the byte). One of the bits in  the byte (usually the most significant bit or left-most bit) is reserved for a parity  bit. The parity bit is set according to whether the parity being used is even or  odd. For example, consider the byte: In this example, if the byte is using even parity, then the parity bit needs to be set to 0, since there is already an even number of 1-bits in the byte (four 1-bits). We thus get: In this example, if the byte is using odd parity, then the

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
