In [1]:
import os
import tempfile

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

import mlflow

# Stop immediately, with a clear message, when the OPENAI_API_KEY environment
# variable is not set, instead of failing later inside an OpenAI call.
assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."




* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Install the faiss-cpu package into the kernel's environment (presumably the
# backend required by the FAISS vector store imported in the first cell).
%pip install faiss-cpu

[33mDEPRECATION: nb-black 1.0.7 has a non-standard dependency specifier black>='19.3'; python_version >= "3.6". pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of nb-black or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: pytorch-lightning 1.6.3 has a non-standard dependency specifier torch>=1.8.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39

In [3]:
import requests
from bs4 import BeautifulSoup

def fetch_federal_document(url, div_class, timeout=30):
    """
    Scrapes the transcript text of a document from the given web page.

    The page is downloaded and the first ``<div>`` whose CSS class matches
    ``div_class`` is converted to plain text.

    Args:
    url (str): URL of the webpage to scrape.
    div_class (str): CSS class of the ``<div>`` element holding the transcript.
    timeout (float): Seconds to wait for the server before giving up
        (default 30).

    Returns:
    str: The transcript text, with each text fragment stripped and the
    fragments joined by newlines. When the response status is not 200, or no
    matching ``<div>`` is found, a human-readable error message is returned
    in place of the transcript, so callers that store the result may want to
    check for it.

    Raises:
    requests.exceptions.RequestException: Propagated from ``requests.get``,
    e.g. on connection failure or when ``timeout`` elapses.
    """
    # Always bound the wait: without a timeout an unresponsive server would
    # block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    # Parse the HTML and locate the transcript container by its CSS class.
    soup = BeautifulSoup(response.text, 'html.parser')
    transcript_section = soup.find('div', class_=div_class)
    if not transcript_section:
        return "Transcript section not found."

    return transcript_section.get_text(separator='\n', strip=True)

In [4]:
# Source pages whose transcripts are indexed for retrieval.
url_listings = [
    "https://www.archives.gov/milestone-documents/act-establishing-yellowstone-national-park#transcript",
    "https://www.archives.gov/milestone-documents/sherman-anti-trust-act#transcript"
]

with tempfile.TemporaryDirectory() as tmp_dir:
    doc_path = os.path.join(tmp_dir, "docs.txt")
    persist_dir = os.path.join(tmp_dir, "faiss_index")

    # Fetch every document, then write them to the temporary file in a single
    # pass. A blank line is placed between documents so the end of one
    # transcript is not fused with the start of the next, and the encoding is
    # fixed to UTF-8 so the result does not depend on the platform default.
    documents = [fetch_federal_document(url, "col-sm-9") for url in url_listings]
    with open(doc_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(documents))

    # Read the file back with the same encoding it was written in.
    loader = TextLoader(doc_path, encoding="utf-8")

    # Split the text into chunks, embed them, and persist the FAISS index.
    raw_docs = loader.load()
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=10)
    docs = splitter.split_documents(raw_docs)
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    db.save_local(persist_dir)

    retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=db.as_retriever())

    def load_retriever(persist_directory):
        """Rebuild the retriever from the FAISS index saved in ``persist_directory``."""
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.load_local(persist_directory, embeddings)
        return vectorstore.as_retriever()

    # Log the retrievalQA chain. This happens inside the temporary-directory
    # block because persist_dir is deleted when the block exits (presumably
    # log_model copies the index into the run's artifacts -- confirm).
    with mlflow.start_run() as run:
        model_info = mlflow.langchain.log_model(
            retrievalQA,
            artifact_path="retrieval_qa",
            loader_fn=load_retriever,
            persist_dir=persist_dir,
        )





In [5]:
# Reload the logged chain through MLflow's generic pyfunc interface, using the
# URI recorded when the model was logged.
logged_model_uri = model_info.model_uri
loaded_model = mlflow.pyfunc.load_model(logged_model_uri)

In [6]:
# Ask the reloaded model what the indexed text says about trespassers.
trespasser_query = {"query": "What does the document say about trespassers?"}
answer1 = loaded_model.predict([trespasser_query])

print(answer1)

[' Section 1 of the document states that all persons who shall locate or settle upon or occupy the land described in the document, or any part thereof, except as provided, shall be considered trespassers and removed therefrom.']


In [16]:
# Combine a definition question with a question about the park itself.
bridle_path_query = {"query": "What is a bridle-path and can I use one at Yellowstone?"}
answer2 = loaded_model.predict([bridle_path_query])

print(answer2)

[' A bridle-path is a path designed for people to ride horses on. The Secretary of the Interior may grant leases for building purposes in the Yellowstone Park that may include the construction of roads and bridle-paths. So, it is possible that you could use a bridle-path in the park.']


In [8]:
# Ask whether the park can be purchased outright.
purchase_query = {
    "query": "Can I buy Yellowstone from the Federal Government to set up a buffalo-themed day spa?"
}
answer3 = loaded_model.predict([purchase_query])

print(answer3)

[' No, you cannot buy Yellowstone from the Federal Government. The Forty-Second Congress of the United States of America set aside the land as a public park and it is under the exclusive control of the Secretary of the Interior. The Fifty-first Congress of the United States of America passed a law to protect trade and commerce against unlawful restraints and monopolies.']


In [9]:
# Ask about leasing a small parcel for a day spa serving park visitors.
spa_lease_query = {
    "query": "Can I lease a small parcel of land from the Federal Government for a small buffalo-themed day spa for visitors to the park?"
}
answer4 = loaded_model.predict([spa_lease_query])

print(answer4)

[' No, this would not be allowed under the terms of the 1890 Act.']


In [10]:
# Same leasing question, extended to include a hotel for visitors.
spa_hotel_lease_query = {
    "query": "Can I lease a small parcel of land from the Federal Government for a small buffalo-themed day spa and hotel for visitors to stay in and relax at while visiting the park?"
}
answer5 = loaded_model.predict([spa_hotel_lease_query])
print(answer5)

['No, you cannot lease a small parcel of land from the Federal Government for a small buffalo-themed day spa and hotel for visitors to stay in and relax at while visiting the park. Section 2 of the Act to set apart a certain tract of land lying near the headwaters of the Yellowstone River as a public park states that the Secretary of the Interior may in his discretion grant leases for building purposes for terms not exceeding ten years, of small parcels of ground, at such places in said park as shall require the erection of buildings for the accommodation of visitors. However, this Act does not mention anything about leasing land for a day spa and hotel.']


In [11]:
# Ask whether simply visiting and enjoying the park is permitted.
visit_query = {"query": "Can I just go to the park and peacefully enjoy the natural splendor?"}
answer6 = loaded_model.predict([visit_query])

print(answer6)

[' Yes, you can go to the park and peacefully enjoy the natural splendor, as long as you follow the rules and regulations set out by the Secretary of the Interior.']


In [12]:
# Ask about running a business outside the park while suppressing competitors.
competition_query = {
    "query": "Can I start a buffalo themed day spa outside of the park and stifle any competition?"
}
answer7 = loaded_model.predict([competition_query])

print(answer7)

[' No, according to Section 1 of the Act to Protect Trade and Commerce Against Unlawful Restraints and Monopolies, contracts or conspiracies in restraint of trade or commerce among the several States, or with foreign nations, are illegal.']
