In [1]:
!pip -q install langchain openai tiktoken PyPDF2 faiss-cpu



In [2]:
# Make the OpenAI API key available without writing it into the notebook.
# A key already present in the environment is reused; otherwise it is requested
# interactively (input hidden) so the secret never lands in the saved source.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [4]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

In [63]:
# Open the source PDF; the path is resolved relative to the working directory.
doc_reader = PdfReader("data.pdf")

In [26]:
# Show the reader object to confirm it was constructed.
doc_reader

<PyPDF2._reader.PdfReader at 0x22fde6f1990>

In [64]:
# Concatenate the extracted text of every page, in page order, into raw_text.
# Pages whose extraction yields nothing (a falsy result) are skipped.
# A single join avoids rebuilding the string on every page.
raw_text = ''.join(
    text
    for text in (page.extract_text() for page in doc_reader.pages)
    if text
)

In [65]:
# Total number of characters extracted from the PDF.
len(raw_text)

52961

In [66]:
# Preview the first 10,000 characters of the extracted text.
raw_text[:10000]

'HDFC ERGO General Insurance Company Limited\nProspectus\nEnergy, Policy1\nSuitability:\na)This policy covers persons in the age group\n18 years to 65 years. The maximum entryage is restricted upto 65 years.\nb)\nThere is no maximum cover ceasing age inthis policy.c)\nThis Policy offers cover to individualswith T\nype 1 Diabetes, Type 2 Diabetes\nMellitus, Impaired Fasting Glucose (IFG),Impaired Glucose Tolerance (IGT) and/orHypertension.d)\nThe policy will be issued for a period 1 year.\ne)This policy can be issued to an individualonly on individual Sum Insured basis.\nf)\nThere will be no general waiting period of30 days applicable in this product.\nSum Insured: Offered are Rs. 200,000; 300,000; 500,000; 10,00,000; 15,00,000; 20,00,000; 25,00,000; 50,00,000. \nSalient Features & Benefits: \nWe will cover the Medical Expenses for:We will not cover treatment, costs or expenses for*:\n*The following exclusions apply in addition to the waiting periodsand general exclusions.\na.\nIn-Patie

In [67]:
# Break the extracted text into overlapping chunks for indexing.
# Splitting happens on newlines; chunk size is 1000 and overlap is 200,
# both measured in characters because length_function is len.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [34]:
# Number of chunks produced by the splitter.
len(texts)

68

In [38]:
# Inspect one chunk to sanity-check the split.
texts[1]

'We will cover the Medical Expenses for:We will not cover treatment, costs or expenses for*:\n*The following exclusions apply in addition to the waiting periodsand general exclusions.\na.\nIn-Patient T\nreatment Treatment costs where Insured Person has to\nstay in a Hospital for more than 24 hours. This includes\n• Hospital room rent or boarding\n• Nursing\n• Intensive Care Unit\n• Medical Practitioners (Fees)\n• Anaesthesia\n• Blood\n• Oxygen\n• Operation theatre\n• Surgical appliances\n• Medicines, drugs & consumables\n• Diagnostic procedures\n• Cost of prosthetic and other devices or equipment if1. Treatment availed outside India\n2. Treatment at a healthcare facility which is NOT\n a Hospital.\n3. Treatment for which hospitalization is not necessary\nb.Pre-Hospitalization Medical \nexpenses for consultations,\ninvestigations and medicines incurred upto 30 days beforeHospitalisation.\nc.\nPost-Hospitalization Medical expenses for consultations,investigations and'

In [68]:
# Create the OpenAI embeddings client that is handed to the vector store below.
# Nothing is downloaded on this line; it only constructs the client object.
embeddings = OpenAIEmbeddings()

In [69]:
# Build a FAISS vector store from the text chunks using the embeddings client.
docsearch = FAISS.from_texts(texts, embeddings)

In [70]:
# WARNING: the repr of this bound method includes the OpenAIEmbeddings fields,
# among them openai_api_key in plain text. Clear this cell's output before
# sharing or committing the notebook.
docsearch.embedding_function

<bound method OpenAIEmbeddings.embed_query of OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='sk-...REDACTED', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False)>

In [71]:
# Retrieve the chunks most similar to the question from the vector store.
query = "what is amount of sum insured?"
docs = docsearch.similarity_search(query)

In [48]:
# Number of chunks returned by the similarity search.
len(docs)

4

In [49]:
# Inspect the first retrieved chunk.
docs[0]

Document(page_content='Prospectus\nEnergy, Policy2\nh. Restore benefit\nInstant addition of 100% Basic Sum Insured on complete or partialutilization of Your existing Policy Sum Insured and cumulative Bonus(if applicable) during the Policy Year.  The Total amount (Basic suminsured, cumulative bonus and Restore sum insured) will be availableto the insured person for all claims under In-patient Benefit during thecurrent Policy Year  and subject to the condition that single claim  ina Policy Year cannot exceed the sum of Basic Sum Insured and thecumulative bonus (if applicable).Conditions for Restore benefit:1.\nThe Restore \nSum Insured can be used for claims made by the\nInsured Person in respect of the benefits stated in Section 1.\n2. The Sum Insured will be restored only once in a Policy Y\near.\n3. If the restored sum insured \nis not utilised in a policy year, it shall not \nbe carried forward to any subsequent policy year.')

In [50]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [72]:
# Question-answering chain of type "stuff", backed by the default OpenAI LLM.
chain = load_qa_chain(llm=OpenAI(), chain_type="stuff")

In [52]:
# Inspect the prompt template the chain fills with {context} and {question}.
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [73]:
# Retrieve the chunks closest to the question, then answer from them.
query = "what is amount of sum insured?"
docs = docsearch.similarity_search(query)
chain.run(question=query, input_documents=docs)

' The sum insured depends on which plan you have chosen. For the Gold Plan, the sum insured ranges from 2.00 lakhs to 50.00 lakhs.'

In [54]:
# Ask about exclusions under Ambulance Cover, answering from the retrieved chunks.
query = "what are the claims that are not covered in Ambulance Cover?"
docs = docsearch.similarity_search(query)
chain.run(question=query, input_documents=docs)

' Claims which have not been admitted under In-patient Treatment and Day care procedures, ambulance services of non-registered healthcare or ambulance service provider.'

In [55]:
# Ask about the periods of the Silver plan, answering from the retrieved chunks.
query = "what are the different periods in Silver plan ?"
docs = docsearch.similarity_search(query)
chain.run(question=query, input_documents=docs)

' The Silver Plan (Base Module) includes "No Copayment" and "20% Copayment" periods.'

In [74]:
# Ask for the blood-pressure readings and their points (no plan named in the question).
query = "what are the readings and corresponding points in Blood Pressure - Annual Examination in Plan ?"
docs = docsearch.similarity_search(query)
chain.run(question=query, input_documents=docs)

' The readings and corresponding points in Blood Pressure - Annual Examination in Plan 3 are: 110-120/70-80 5, 121-139/80-89 2, 140-150/90-100 1.'

In [75]:
# Same blood-pressure question, this time naming the Gold Plan explicitly.
query = "what are the readings and corresponding points in Blood Pressure - Annual Examination in Gold Plan ?"
docs = docsearch.similarity_search(query)
chain.run(question=query, input_documents=docs)

' Blood Pressure - Annual Examination: 110-120/70-80 5, 121-139/80-89 2, 140-150/90-100 1.'

In [77]:
# Ask about check-up report timelines, retrieving 6 chunks for this question.
query = "what are the Timelines for submitting the Medical Check-up reports in Gold Plan?"
docs = docsearch.similarity_search(query, k=6)
chain.run(question=query, input_documents=docs)

' The Timelines for submitting the Medical Check-up reports in Gold Plan are Half yearly check-up 4th or 5th months of the policy year and Annual check-up 8th or 9th months of the policy year.'

In [102]:
# NOTE(review): disabled variant of the previous question that retrieves k = 20 chunks.
# It is commented out in the notebook; either delete it or record why it was turned off.
# query = "what are the Timelines for submitting the Medical Check-up reports in Gold Plan?"
# docs = docsearch.similarity_search(query, k = 20)
# chain.run(input_documents=docs, question=query)

### Map rerank chain type

In [83]:
# Rebuild the QA chain as "map_rerank" and keep the intermediate steps, so the
# per-chunk answers and their scores can be inspected next to the final answer.
chain = load_qa_chain(
    llm=OpenAI(),
    chain_type="map_rerank",
    return_intermediate_steps=True,
)

# Retrieve 5 chunks for the question and run the chain on them.
query = "what are the readings and corresponding points in Blood Pressure - Annual Examination in Plan ?"
docs = docsearch.similarity_search(query, k=5)
results = chain(
    {"question": query, "input_documents": docs},
    return_only_outputs=True,
)
results

{'intermediate_steps': [{'answer': ' 110-120/70-80 5, 121-139/80-89 2, 140-150/90-100 1',
   'score': '100'},
  {'answer': ' 110-120/70-80 5; 121-139/80-89 2; 140-150/90-100 1',
   'score': '100'},
  {'answer': ' This document does not answer the question.', 'score': '0'},
  {'answer': 'Severe Hypertension: 180-210/110-119: 0  ', 'score': '100'},
  {'answer': ' The readings and corresponding points in Blood Pressure - Annual Examination in the Gold Plan are HbA1c, SMA 12, Total Cholesterol : HDL Cholesterol, ECG, Blood pressure Monitoring, BMI, Diabetologist Consultation/ General Practitioner.',
   'score': '100'}],
 'output_text': ' 110-120/70-80 5, 121-139/80-89 2, 140-150/90-100 1'}

In [80]:
# Final answer selected by the chain.
results['output_text']

'140-150/90-100 1'

In [81]:
# Per-chunk answers and scores produced before the final answer was chosen.
results['intermediate_steps']

[{'answer': '140-150/90-100 1', 'score': '100'},
 {'answer': ' 110-120/70-80 5, 121-139/80-89 2, 140-150/90-100 1',
  'score': '100'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' According to the above context, the reading points in Blood Pressure - Annual Examination in Plan is not specified.',
  'score': '0'},
 {'answer': ' This document does not answer the question', 'score': '0'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' The readings and corresponding points in Blood Pressure - Annual Examination in Plan are HbA1c, SMA 12, Total Cholesterol : HDL Cholesterol, ECG, Blood pressure Monitoring, BMI, Diabetologist Consultation/ General Practitioner.',
  'score': '80'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' This document does not answer the question.', 'score': '0'}]

In [82]:
# Inspect the prompt template used by the current chain's LLM step.
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nIn addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:\n\nQuestion: [question here]\nHelpful Answer: [answer here]\nScore: [score between 0 and 100]\n\nHow to determine the score:\n- Higher is a better answer\n- Better responds fully to the asked question, with sufficient level of detail\n- If you do not know the answer based on the context, that should be a score of 0\n- Don't be overconfident!\n\nExample #1\n\nContext:\n---------\nApples are red\n---------\nQuestion: what color are apples?\nHelpful Answer: red\nScore: 100\n\nExample #2\n\nContext:\n---------\nit was night and the witness forgot his glasses. he was not sure if it was a sports car or an suv\n---------\nQuestion: what type was the car?\nHelpful Answer: a sports car or an su