In [0]:
%pip install beautifulsoup4 faiss-cpu==1.7.4 langchain==0.1.16 langchain-community==0.0.33 langchain-openai==0.0.8 openai==1.12.0 tiktoken==0.6.0 mlflow==2.12.1 -q


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Restart the Python process so the packages installed by the %pip cell are the
# ones imported below (the %pip output recommends exactly this call).
dbutils.library.restartPython()

In [0]:
import os
import shutil
import tempfile
import textwrap

import requests
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAI, OpenAIEmbeddings

import mlflow

In [0]:
import warnings

# Disable a few less-than-useful UserWarnings from setuptools and pydantic
warnings.filterwarnings("ignore", category=UserWarning)

In [0]:
# Read the OpenAI key from the Databricks secret scope instead of hardcoding it.
OPENAI_API_KEY = dbutils.secrets.get(scope="databricks-azure", key="OPENAIAPIKEY")

# Export it for the OpenAI/LangChain clients, leaving any value that is
# already present in the environment untouched.
os.environ.setdefault("OPENAI_API_KEY", OPENAI_API_KEY)

assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."

In [0]:
%run ./utils

In [0]:
# Scratch workspace for this session: the scraped text and the FAISS index
# both live under one freshly created temporary directory.
# NOTE(review): no cell visible here removes this directory again (`shutil` is
# imported but unused in this portion) - confirm cleanup happens elsewhere.
temporary_directory = tempfile.mkdtemp()
doc_path = os.path.join(temporary_directory, "docs.txt")
persist_dir = os.path.join(temporary_directory, "faiss_index")

# Source pages to scrape; both URLs point at the "#transcript" anchor.
url_listings = [
    "https://www.archives.gov/milestone-documents/act-establishing-yellowstone-national-park#transcript",
    "https://www.archives.gov/milestone-documents/sherman-anti-trust-act#transcript",
]

# Helper is not defined in this notebook - presumably provided by `%run ./utils`.
fetch_and_save_documents(url_listings, doc_path)

In [0]:
# Display where the FAISS index will be written (a path under the temp directory).
persist_dir

'/tmp/tmp0vo1pv62/faiss_index'

In [0]:
# Build the vector store from the saved documents. `create_faiss_database` is
# not defined in this notebook - presumably it comes from `%run ./utils` and
# persists the index under `persist_dir`; confirm against that file.
vector_db = create_faiss_database(doc_path, persist_dir)

### Supported Elements in MLflow LangChain Integration
- LLMChain

- Agents

- RetrievalQA

- Retrievers

In [0]:
# Group every run from this notebook under one workspace experiment.
mlflow.set_experiment("/Users/olonok@hotmail.com/LLMOPS/Legal RAG")

# Assemble the question-answering chain from its two parts: an OpenAI
# completion model and a retriever backed by the FAISS index built above.
qa_llm = OpenAI()
qa_retriever = vector_db.as_retriever()
retrievalQA = RetrievalQA.from_llm(llm=qa_llm, retriever=qa_retriever)


# Log the retrievalQA chain
def load_retriever(persist_directory):
    """Rebuild the retriever from a FAISS index stored on disk.

    Handed to ``mlflow.langchain.log_model`` as ``loader_fn`` so the retriever
    can be reconstructed when the logged model is loaded.

    Args:
        persist_directory: Directory containing the saved FAISS index.

    Returns:
        The retriever produced by ``as_retriever()`` on the loaded vector store.
    """
    return FAISS.load_local(
        persist_directory,
        OpenAIEmbeddings(),
        # This is required to load the index from MLflow. The flag opts in to
        # deserialization that LangChain labels dangerous, so only point this
        # at indexes you created yourself.
        allow_dangerous_deserialization=True,
    ).as_retriever()


# Log the chain inside a new MLflow run. `loader_fn` and `persist_dir` are
# supplied together: the retriever is rebuilt by calling `load_retriever` on the
# saved index directory (presumably at model-load time - confirm in MLflow docs).
# `run` and `model_info` are reused by later cells.
with mlflow.start_run() as run:
    model_info = mlflow.langchain.log_model(
        retrievalQA,
        artifact_path="retrieval_qa",
        loader_fn=load_retriever,
        persist_dir=persist_dir,
    )

Uploading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/06/18 11:04:50 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [0]:
# Inspect the run's metadata (info, params, metrics, tags) as a plain dict.
run.to_dictionary()

{'info': {'artifact_uri': 'dbfs:/databricks/mlflow-tracking/2057591202958809/10cb646d8dd54976b3aff2eacc2fe036/artifacts',
  'end_time': None,
  'experiment_id': '2057591202958809',
  'lifecycle_stage': 'active',
  'run_id': '10cb646d8dd54976b3aff2eacc2fe036',
  'run_name': 'loud-eel-888',
  'run_uuid': '10cb646d8dd54976b3aff2eacc2fe036',
  'start_time': 1718699709468,
  'status': 'RUNNING',
  'user_id': ''},
 'data': {'metrics': {},
  'params': {},
  'tags': {'mlflow.databricks.cluster.id': '0617-210938-tuxu3pur',
   'mlflow.databricks.notebook.commandID': '1409828623599537264_9020663491274508607_e4edd8b86e0d43afbdb3692d64101d74',
   'mlflow.databricks.notebookID': '2057591202958785',
   'mlflow.databricks.notebookPath': '/Users/olonok@hotmail.com/LLMOPS/LLMOPS Langchain',
   'mlflow.databricks.webappURL': 'https://ukwest.azuredatabricks.net',
   'mlflow.databricks.workspaceID': '1286930193882465',
   'mlflow.databricks.workspaceURL': 'adb-1286930193882465.5.azuredatabricks.net',
   'm

In [0]:
# URI of the logged model, in the form `runs:/<run_id>/retrieval_qa`.
model_info.model_uri

'runs:/a225bae6dfd64a9cb6355647ce38ae75/retrieval_qa'

In [0]:
# Load the chain back from the run as a generic pyfunc model, before it is registered.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/06/18 11:08:35 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [0]:
def print_formatted_response(response_list, max_line_length=80):
    """
    Print each response word-wrapped to at most ``max_line_length`` characters.

    Whitespace runs inside a response (including newlines) are collapsed to
    single spaces before wrapping. Words are never split: a word longer than
    ``max_line_length`` is printed alone on its own, over-long line. Printed
    lines carry no trailing whitespace, and an empty response prints one blank
    line.

    Args:
    response_list (list | str): A list of strings representing responses. A
        single string is treated as a one-element list.
    max_line_length (int): Maximum number of characters in a line. Defaults to 80.

    Raises:
    ValueError: If max_line_length is less than 1.
    """
    # A bare string would otherwise be iterated one character at a time.
    if isinstance(response_list, str):
        response_list = [response_list]

    for response in response_list:
        # Normalise whitespace first so wrapping depends only on the words.
        normalized = " ".join(response.split())
        print(
            textwrap.fill(
                normalized,
                width=max_line_length,
                # Keep whole words (URLs, hyphenated terms) intact.
                break_long_words=False,
                break_on_hyphens=False,
            )
        )

In [0]:
# Smoke-test the run-scoped model with a question the Yellowstone act answers.
trespass_query = {"query": "What does the document say about trespassers?"}
answer1 = loaded_model.predict([trespass_query])

print_formatted_response(answer1)

  warn_deprecated(


The document states that anyone who settles, occupies, or trespasses on the 
land reserved for the public park near the headwaters of the Yellowstone River 
will be considered a trespasser and will be removed. 


In [0]:
# Build the Model Registry name for this user. "/", "." and ":" cannot be used
# in model names here, so each one is swapped for an underscore.
user = "olonok@hotmail.com"
unsafe_chars = str.maketrans({"/": "_", ".": "_", ":": "_"})
model_name = f"legalRAG - {user}".translate(unsafe_chars)
print(model_name)

legalRAG - olonok@hotmail_com


# Register Model

In [0]:

# The run-scoped URI that is about to be registered (same value as shown earlier).
model_info.model_uri

'runs:/a225bae6dfd64a9cb6355647ce38ae75/retrieval_qa'

In [0]:
# Publish the logged model to the Model Registry: this creates the registered
# model on first use, otherwise it adds a new version under the existing name.
registered_version = mlflow.register_model(model_uri=model_info.model_uri, name=model_name)
registered_version

Registered model 'legalRAG - olonok@hotmail_com' already exists. Creating a new version of this model...
2024/06/18 11:10:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: legalRAG - olonok@hotmail_com, version 3
Created version '3' of model 'legalRAG - olonok@hotmail_com'.


<ModelVersion: aliases=[], creation_timestamp=1718709025358, current_stage='None', description='', last_updated_timestamp=1718709025358, name='legalRAG - olonok@hotmail_com', run_id='a225bae6dfd64a9cb6355647ce38ae75', run_link='', source='dbfs:/databricks/mlflow-tracking/2057591202958809/a225bae6dfd64a9cb6355647ce38ae75/artifacts/retrieval_qa', status='PENDING_REGISTRATION', status_message='', tags={}, user_id='1491868126462402', version='3'>

# Test Model from Registry

In [0]:
from mlflow import MlflowClient

# Registry client reused by the later stage-transition cells.
client = MlflowClient()

# Look the registered model up by its exact name to see its versions.
registry_filter = f"name = '{model_name}'"
client.search_registered_models(filter_string=registry_filter)

[<RegisteredModel: aliases={}, creation_timestamp=1718660612240, description='', last_updated_timestamp=1718709025358, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1718660612632, current_stage='Archived', description='', last_updated_timestamp=1718700396046, name='legalRAG - olonok@hotmail_com', run_id='1ba74217fb6a44438520b8956696b903', run_link='', source='dbfs:/databricks/mlflow-tracking/2057591202958809/1ba74217fb6a44438520b8956696b903/artifacts/retrieval_qa', status='READY', status_message='', tags={}, user_id='olonok@hotmail.com', version='1'>,
  <ModelVersion: aliases=[], creation_timestamp=1718709025358, current_stage='None', description='', last_updated_timestamp=1718709027737, name='legalRAG - olonok@hotmail_com', run_id='a225bae6dfd64a9cb6355647ce38ae75', run_link='', source='dbfs:/databricks/mlflow-tracking/2057591202958809/a225bae6dfd64a9cb6355647ce38ae75/artifacts/retrieval_qa', status='READY', status_message='', tags={}, user_id='olonok@hotmail.com', ve

In [0]:
# Load an earlier registered version through its `models:/<name>/<version>` URI.
# NOTE(review): the version number is hard-coded; each re-run of the notebook
# registers a new version, so "2" refers to whatever an earlier session created.
model_version = 2
dev_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")
dev_model

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/06/18 11:14:16 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


mlflow.pyfunc.loaded_model:
  artifact_path: retrieval_qa
  flavor: mlflow.langchain
  run_id: 10cb646d8dd54976b3aff2eacc2fe036

In [0]:
# Retire the version loaded above by moving it to the "Archived" stage.
# NOTE(review): the cell output echoes this call the way a Python warning does -
# presumably a deprecation notice for registry stages; confirm in MLflow docs.
client.transition_model_version_stage(model_name, model_version, "Archived")

  client.transition_model_version_stage(model_name, model_version, "Archived")


<ModelVersion: aliases=[], creation_timestamp=1718700263977, current_stage='Archived', description='', last_updated_timestamp=1718709263980, name='legalRAG - olonok@hotmail_com', run_id='10cb646d8dd54976b3aff2eacc2fe036', run_link='', source='dbfs:/databricks/mlflow-tracking/2057591202958809/10cb646d8dd54976b3aff2eacc2fe036/artifacts/retrieval_qa', status='READY', status_message='', tags={}, user_id='1491868126462402', version='2'>

In [0]:
# Resolve the newest registered version at run time instead of hard-coding it:
# every execution of the register cell creates a new version, so a literal
# number silently points at a stale model after a re-run. `version` may be
# reported as a string, hence the int() before taking the maximum. Raises
# ValueError if no version is registered under `model_name`.
model_version = max(
    int(registered.version)
    for registered in client.search_model_versions(filter_string=f"name = '{model_name}'")
)
dev_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")
dev_model

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/06/18 11:16:15 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


mlflow.pyfunc.loaded_model:
  artifact_path: retrieval_qa
  flavor: mlflow.langchain
  run_id: a225bae6dfd64a9cb6355647ce38ae75

In [0]:
# Promote the version loaded above to the "Production" stage.
# NOTE(review): the cell output echoes this call the way a Python warning does -
# presumably a deprecation notice for registry stages; confirm in MLflow docs.
client.transition_model_version_stage(model_name, model_version, "Production")

  client.transition_model_version_stage(model_name, model_version, "Production")


<ModelVersion: aliases=[], creation_timestamp=1718709025358, current_stage='Production', description='', last_updated_timestamp=1718709400632, name='legalRAG - olonok@hotmail_com', run_id='a225bae6dfd64a9cb6355647ce38ae75', run_link='', source='dbfs:/databricks/mlflow-tracking/2057591202958809/a225bae6dfd64a9cb6355647ce38ae75/artifacts/retrieval_qa', status='READY', status_message='', tags={}, user_id='1491868126462402', version='3'>

In [0]:
# Question that needs both source documents: buying park land for a business.
purchase_query = {
    "query": "Can I buy Yellowstone from the Federal Government to set up a buffalo-themed day spa?"
}
answer3 = dev_model.predict([purchase_query])

print_formatted_response(answer3)

No, according to the context and the act passed by the 51st Congress of the 
United States in 1890, it is illegal to monopolize any part of trade or 
commerce among the states, including buying land from the federal government 
for commercial purposes. Additionally, the land near the headwaters of the 
Yellowstone River was set aside as a public park for the enjoyment and benefit 
of the people, and any attempts to buy or occupy the land for personal gain 
would be considered trespassing. 


In [0]:
# Variation: leasing (rather than buying) land for a visitor-facing day spa.
lease_spa_query = {
    "query": (
        "Can I lease a small parcel of land from the Federal Government for a small "
        "buffalo-themed day spa for visitors to the park?"
    )
}
answer4 = dev_model.predict([lease_spa_query])

print_formatted_response(answer4)

No, according to the context provided, the Secretary of the Interior has the 
exclusive control over the public park and can only grant leases for building 
purposes for terms not exceeding ten years for buildings that accommodate 
visitors. A buffalo-themed day spa would not fall under this category and 
therefore cannot be leased from the Federal Government. 


In [0]:
# Variation: the same lease request, now including a hotel for park visitors.
lease_hotel_query = {
    "query": (
        "Can I lease a small parcel of land from the Federal Government for a small "
        "buffalo-themed day spa and hotel for visitors to stay in and relax at while visiting the park?"
    )
}
answer5 = dev_model.predict([lease_hotel_query])
print_formatted_response(answer5)

It is not stated in the context whether or not leases for building purposes are 
allowed in this particular park. It would be best to consult with the Secretary 
of the Interior for further information on the regulations and processes for 
obtaining a lease in the park. 
