Importing necessary modules and functions.

Initializing an empty DataFrame with columns “address”, “email_address”, and “contact_number”.

Iterating over each HTML file in a specified directory.

For each HTML file, it loads and splits the data, converts the documents to embeddings, and stores them.

It then retrieves the embeddings and uses a chat model to generate a JSON output containing the required details.

This JSON output is then converted to a DataFrame row and appended to the main DataFrame.

Finally, it prints the DataFrame.

In [2]:
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [11]:
model_local = ChatOllama(model="mistral")

In [4]:
df = pd.DataFrame(columns=["address", "email_address", "contact_number"])
df

Unnamed: 0,address,email_address,contact_number


In [6]:
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\test\\test_1.html"

In [7]:
loader = BSHTMLLoader(directory, bs_kwargs={"features": "html.parser"})
data = loader.load()
data

[Document(page_content="\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n   Addresses and phone numbers - About Us - Mayo Clinic\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n     This content does not have an English version.\n    \n\n\n\n\n\n     This content does not have an Arabic version.\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Skip to content\n         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                          Care at \n Mayo Clinic\n                         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  Patient-Centered Care\n                                 \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  About Mayo Clinic\n                                 \n\n\n\n\n\n\

In [8]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)

doc_splits = text_splitter.split_documents(data)

In [9]:
vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
        )
retriever = vectorstore.as_retriever()

In [10]:
class format_json(BaseModel):
    university_name : str = Field(description="organization name from the given context")
    address: str = Field(description="address from the given context")
    email_address: str = Field(description="email address from the given context")
    contact_number: str = Field(description="contact number from the given context")

In [12]:
after_rag_template = """Answer the question based only on the following context:
        {context} and only provide these details in this order only and the response will always will be containing these informations only such as address,contact number,email address in a json format
        {format_instructions}
        Question: {question}
        """
parser = JsonOutputParser(pydantic_object=format_json)
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template,partial_variables={"format_instructions": parser.get_format_instructions()},)
after_rag_chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | parser
        )

In [13]:
json_output = after_rag_chain.invoke("provide the metioned details from the context and make sure you providing them from the given context")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [14]:
json_output

{'university_name': 'Mayo Clinic',
 'address': '[800 Broadway, Rochester, MN 55904, \n            Jacksonville Office of Patient Experience\n           \n\n\n             844-953-2000\n            \n\n\n          Minnesota â€” Rochester\n       \n\n\n        Mayo Clinic\n       \n\n\n       200 First St. SW\n       \n       Rochester, MN 55905]',
 'email_address': '[information@mayoclinic.org]',
 'contact_number': '[844-544-0036 (toll-free)]'}

In [21]:
dff = pd.json_normalize(json_output)

In [22]:
dff

Unnamed: 0,university_name,address,email_address,contact_number
0,Mayo Clinic,"[800 Broadway, Rochester, MN 55904, \n ...",[information@mayoclinic.org],[844-544-0036 (toll-free)]


In [23]:
df = pd.concat([df, dff], ignore_index=True)

In [24]:
df

Unnamed: 0,address,email_address,contact_number,university_name
0,"[800 Broadway, Rochester, MN 55904, \n ...",[information@mayoclinic.org],[844-544-0036 (toll-free)],Mayo Clinic
