## PDF Query Using LangChain

In [1]:
!pip install langchain
!pip install openai
!pip install faiss-cpu
!pip install tiktoken



In [2]:


# LangChain building blocks used by the rest of this notebook:
#   OpenAIEmbeddings      - embedding model handed to FAISS.from_texts
#   CharacterTextSplitter - cuts the extracted PDF text into chunks
#   FAISS                 - vector store that is searched for every query


from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [3]:
import os
from getpass import getpass

# Never paste the key into the notebook: it would be saved with the file.
# A key that is already exported in the environment is kept untouched;
# otherwise the user is prompted once and the input is not echoed.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


In [23]:
!pip install pdfminer

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/4.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m2.7/4.2 MB[0m [31m39.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m51.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140064 sha256=04c69119c0f5bb7efe3d3b1c12ca81cc1878b052a99b

In [4]:
!pip install pdfminer.six



In [5]:
from pdfminer.high_level import extract_text

# Read the whole report into one string; the path is relative to the
# notebook's working directory.
PDF_PATH = "soga_2019_report.pdf"
text = extract_text(PDF_PATH)

In [7]:
# Sanity check: number of characters extracted from the PDF.
len(text)

86182

In [8]:
# Show only the beginning of the extracted text; displaying the full string
# (tens of thousands of characters) bloats the saved notebook.
text[:1000]

'STATE OF  \nGLOBAL AIR/2019\n\nA SPECIAL REPORT ON GLOBAL EXPOSURE TO AIR POLLUTION  \nAND ITS DISEASE BURDEN\n\n     The State of Global Air is a collaboration between the Health Effects Institute  \nand the Institute for Health Metrics and Evaluation’s Global Burden of Disease Project.\nThe State of Global Air is a collaboration between the  \nInstitute for Health Metrics and Evaluation’s Global Burden of Disease Project \nCitation: Health Effects Institute. 2019. State of Global Air 2019. Special Report. Boston, MA:Health Effects Institute.\nand the Health Effects Institute.\nISSN 2578-6873   © 2019 Health Effects Institute\n\n\x0cWhat is the State of Global Air? \nThe State of Global Air report brings into one place the latest \ninformation on air quality and health for countries around the \nglobe. It is produced annually by the Health Effects Institute and \nthe Institute for Health Metrics and Evaluation’s Global Burden \nof Disease project as a source of objective, peer-review

In [9]:
# Split the text on newlines into chunks of at most roughly 800 characters
# (measured with `len`), with a 50-character overlap between neighbouring
# chunks, so that no single chunk inflates the token count of a prompt.
splitter_options = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 50,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(text)

In [10]:
# Number of chunks produced by the splitter.
len(texts)

112

In [12]:
# Peek at the first chunk (kept as a one-element list).
texts[:1]

['STATE OF  \nGLOBAL AIR/2019\nA SPECIAL REPORT ON GLOBAL EXPOSURE TO AIR POLLUTION  \nAND ITS DISEASE BURDEN\n     The State of Global Air is a collaboration between the Health Effects Institute  \nand the Institute for Health Metrics and Evaluation’s Global Burden of Disease Project.\nThe State of Global Air is a collaboration between the  \nInstitute for Health Metrics and Evaluation’s Global Burden of Disease Project \nCitation: Health Effects Institute. 2019. State of Global Air 2019. Special Report. Boston, MA:Health Effects Institute.\nand the Health Effects Institute.\nISSN 2578-6873   © 2019 Health Effects Institute\n\x0cWhat is the State of Global Air? \nThe State of Global Air report brings into one place the latest \ninformation on air quality and health for countries around the']

In [13]:
# Create the OpenAI embedding model object that is later passed to
# FAISS.from_texts. Nothing is embedded by this line itself.
# NOTE(review): presumably authenticates with the OPENAI_API_KEY environment
# variable set earlier in the notebook - confirm.
embeddings = OpenAIEmbeddings()

In [14]:
# Build the FAISS vector store from the text chunks using the embedding model.
document_search = FAISS.from_texts(texts=texts, embedding=embeddings)

In [15]:
# Display the vector store object to confirm it was created.
document_search


<langchain_community.vectorstores.faiss.FAISS at 0x7f02ad299840>

In [16]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [17]:
# Question-answering chain of type "stuff" backed by the OpenAI completion LLM.
# temperature=0 replaces the library's default sampling temperature so that
# re-running the notebook gives answers as repeatable as the API allows.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")

In [23]:
# Question 1: fetch the chunks most similar to the question, then let the
# chain answer from those chunks only.
query = (
    "How many total people were exposed to household air pollution in 2017?. "
    "Give answer in one line only."
)
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

' 3.6 billion'

In [24]:
# Question 2: who contributed to and funded the report. Retrieval first, then
# the chain answers from the retrieved chunks.
query = (
    "Tell me about contributors & funding "
    "of this State of Global Air report."
)
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

"The website was designed by cations and developed by Ezra Klughaupt and Diane Szczesuil at Charles River Web. Anne Frances Johnson provided additional writing support. The project is funded by Bloomberg Philanthropies and the William and Flora Hewlett Foundation. The report is a collaboration between the Health Effects Institute and the Institute for Health Metrics and Evaluation's Global Burden of Disease Project."

In [26]:
# Question 3: names of contributing institutes. Retrieval first, then the
# chain answers from the retrieved chunks.
query = (
    "Just tell me the name of 4 Institutes "
    "who contributed in this project."
)
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

' Washington School of Medicine, University of British Columbia, Health Effects Institute, Institute for Health Metrics and Evaluation'