In [1]:
!python3 -m pip install -qU elasticsearch==8.14.0 langchain langchain_community langchain_huggingface langchain-openai \ sentence_transformers openai pypdf python-dotenv

In [2]:
from IPython.display import display
from dotenv import load_dotenv
from getpass import getpass
from urllib.request import urlretrieve
from enum import Enum
import os

In [3]:
class ESConnection(Enum):
    """Connection options: NONE, BINARY, DOCKER and CLOUD (values 0-3)."""

    NONE = 0
    BINARY = 1
    DOCKER = 2
    CLOUD = 3


# Option selected for this run, looked up by member name and echoed to the cell output.
es_connection = ESConnection["DOCKER"]
print(f"es_connection: {es_connection.name}")

es_connection: DOCKER


In [4]:
from elasticsearch import Elasticsearch

# Load variables from the local .env file into the process environment
load_dotenv('.env')

# Read API keys and connection settings; each is None when the variable is unset
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ELASTIC_CLOUD_ID = os.getenv('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.getenv('ELASTIC_API_KEY')
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL')

# Determine the connection data to pass to the client init:
# an explicit URL takes precedence, otherwise use Cloud ID + API key.
if ELASTICSEARCH_URL:
    elasticsearch_client = Elasticsearch(
        hosts=[ELASTICSEARCH_URL],
    )
elif ELASTIC_CLOUD_ID and ELASTIC_API_KEY:
    # Both values are required here; checking only the Cloud ID would pass
    # api_key=None to the client and defer the failure to the first request.
    elasticsearch_client = Elasticsearch(
        cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY
    )
else:
    raise ValueError(
        "Please provide either ELASTICSEARCH_URL or ELASTIC_CLOUD_ID and ELASTIC_API_KEY"
    )

# Query the cluster once so a bad connection fails in this cell
print(elasticsearch_client.info())

{'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ntcsaqKXQRGBVMVrOYCtxQ', 'version': {'number': '8.14.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0', 'build_date': '2024-07-07T22:04:49.882652950Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [5]:
import os
import zipfile
from langchain.document_loaders import PyPDFLoader
from urllib.request import urlretrieve
from os.path import exists

# Download and extract the PDF only when it is not already present locally
file_exists = exists("./data/usc03@118-22.pdf")
if not file_exists:
    # get the US Code PDF on the president and unzip it
    url = "https://uscode.house.gov/download/releasepoints/us/pl/118/22u1/pdf_usc03@118-22u1.zip"
    file_name = "./data/president.pdf.zip"
    # urlretrieve does not create missing directories, so make sure ./data exists
    os.makedirs("./data", exist_ok=True)
    urlretrieve(url, file_name)
    # Extract from the path the archive was saved to, into ./data where the
    # loader below reads the PDF (no dependency on an external `unzip` binary).
    # NOTE(review): assumes the archive member is named usc03@118-22.pdf - confirm.
    with zipfile.ZipFile(file_name) as archive:
        archive.extractall("./data")

# now load the pdf as text and break it into pages
loader = PyPDFLoader("./data/usc03@118-22.pdf")
pages = loader.load_and_split()

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import ElasticsearchStore

# Embedding model used to vectorize the pages
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the vector store from the loaded pages, using the existing
# Elasticsearch client and the "the-president" index
esVectorStore = ElasticsearchStore.from_documents(
    pages,
    embedding=embeddings,
    index_name="the-president",
    es_connection=elasticsearch_client,
)

# Echo the store object as the cell output to confirm it was created
esVectorStore

  from tqdm.autonotebook import tqdm, trange


<langchain_community.vectorstores.elasticsearch.ElasticsearchStore at 0x15cdab9e0>

In [7]:
def showResults(results):
    """Print the number of results, then each result on its own line.

    Args:
        results: Sized iterable of objects to print, e.g. the list returned
            by a similarity search.
    """
    print("Total results: ", len(results))
    # Iterate directly instead of indexing through range(len(...))
    for item in results:
        print(item)

In [8]:
# Question to match against the documents held in the vector store
query = "who succeeds the president"

# Run the similarity search with the call's default settings and print the hits
result = esVectorStore.similarity_search(query)
showResults(result)

Total results:  4
page_content='Pub. L. 117–328, §110(b), inserted "under section 15(d)(2)(C)(i)" after "motion to withdraw".
1954—Act Sept. 3, 1954, substituted "chapter" for "subchapter".
§19. Vacancy in offices of both President and Vice President; officers eligible to
act
(a)(1) If, by reason of death, resignation, removal from office, inability, or failure to qualify, there
is neither a President nor Vice President to discharge the powers and duties of the office of President,
then the Speaker of the House of Representatives shall, upon his resignation as Speaker and as
Representative in Congress, act as President.
(2) The same rule shall apply in the case of the death, resignation, removal from office, or inability
of an individual acting as President under this subsection.
(b) If, at the time when under subsection (a) of this section a Speaker is to begin the discharge of
the powers and duties of the office of President, there is no Speaker, or the Speaker fails to qualify as
Ac

In [9]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Retriever over the vector store, limited to 3 results per question
retriever = esVectorStore.as_retriever(search_kwargs={"k": 3})

# Prompt with two placeholders: the retrieved context and the user's question
template = """Answer the question with the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Pipeline: the question feeds the retriever (as "context") and is passed
# through unchanged (as "question"); the filled prompt goes to the chat model
# and the model output is parsed to a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | ChatOpenAI(openai_api_key=OPENAI_API_KEY)
    | StrOutputParser()
)

# Fall back to a default question when the user just presses Enter
q = input("Question: ") or "what is the electoral college?"
print("\n")
reply = chain.invoke(q)
# print() rather than display(): display() on a str renders its quoted repr,
# so newlines in the answer would appear as escape sequences.
print("Answer: " + reply)

Question:  What are the president's duties?






huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


"Answer: The president's duties include appointing and fixing the pay of employees for the Domestic Policy Staff and the Office of Administration, as well as procuring temporary or intermittent services of experts and consultants as needed. Additionally, the president is authorized to employ individuals in the Office of Administration in accordance with specific provisions of law."