In [1]:
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [2]:
# Chat model handle used as the generation step of the chain built further
# down; "mistral" is the model name passed to ChatOllama.
model_local = ChatOllama(
    model="mistral",
)


In [3]:
# Empty results table with one column per contact field. None of the cells
# visible here write rows into it.
df = pd.DataFrame(
    columns=[
        "address",
        "email_address",
        "contact_number",
    ]
)
df

Unnamed: 0,address,email_address,contact_number


In [15]:
# Location of the saved HTML page that the loader cell reads. Despite the
# name, this is the path of a single .html file, not of a directory.
# NOTE(review): machine-specific absolute path; the notebook will not run
# elsewhere without editing this line — consider a configurable data folder.
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\llm_testing_urls\\test_5.html"

In [16]:
# Read the saved page through BSHTMLLoader, asking BeautifulSoup for its
# "html.parser" backend, and keep the resulting documents in `data`.
loader = BSHTMLLoader(
    directory,
    bs_kwargs=dict(features="html.parser"),
)
data = loader.load()
# Echo the loaded documents as the cell output.
data

[Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n   Contact Us - Wits University\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n           About\n          \n\n\n\n           People/Staff\n          \n\n\n\n           Alumni\n          \n\n\n\n           Library\n          \n\n\n\n           Visit Wits\n          \n\n\n\n           Give\n          \n\n\n\n           Wits100\n          \n\n\n\n\n\n\n\n\n\n\n\n           Homepage\n          \n\n\n\n\n\n\n\n             Study at Wits\n            \n\n\n\n             Students\n            \n\n\n\n             Faculties and Schools\n            \n\n\n\n             Teaching and Learning\n            \n\n\n\n             Research\n            \n\n\n\n             News\n            \n\n\n\n\n\n\n\n\n          Search\n         \n\n\n\n\n\n\n          Search the site\n         \n\n\n\n\n\n\n\n\n\n\n              Search\n      

In [30]:
# Token-based chunking: chunk_size and chunk_overlap are measured with the
# tiktoken encoder, so consecutive chunks share up to 100 tokens.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400,
    chunk_overlap=100,
)
# Cut the loaded page into the chunks that get embedded and retrieved later.
doc_splits = text_splitter.split_documents(data)

In [31]:
# Sanity check: how many chunks the splitter produced for this page.
len(doc_splits)

8

In [32]:
# Embedding model shared by the cleanup step and the index build below.
# `OllamaEmbeddings` is taken from the package's exported names instead of
# reaching through the `embeddings.ollama` submodule attribute, which only
# exists if that submodule happens to have been imported already.
ollama_embeddings = embeddings.OllamaEmbeddings(model="nomic-embed-text")

# Make the cell idempotent: Chroma.from_documents adds to an existing
# collection of the same name, so re-running this cell in one kernel session
# (or running it for another page) would otherwise stack duplicate or stale
# chunks into "rag-chroma" and skew retrieval. Drop the collection first.
Chroma(
    collection_name="rag-chroma",
    embedding_function=ollama_embeddings,
).delete_collection()

# Embed every chunk and index it in a fresh "rag-chroma" collection.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=ollama_embeddings,
)
# Retriever with the vector store's default search settings; used as the
# "context" source of the chain.
retriever = vectorstore.as_retriever()

In [33]:
# Schema of the JSON object the model is asked to return. The class is handed
# to JsonOutputParser, whose format instructions are injected into the prompt,
# so the field order and the description strings below are part of what the
# model sees.
# NOTE(review): documented with comments on purpose — a class docstring would
# presumably be copied into the generated schema; confirm before adding one.
class format_json(BaseModel):
    # Name of the organization the page belongs to.
    university_name : str = Field(description="organization name from the given context")
    # Postal / street address found in the retrieved context.
    address: str = Field(description="address from the given context")
    # Typed as a single string; in the recorded run the model packed several
    # addresses into one bracketed string.
    email_address: str = Field(description="email address from the given context")
    # Typed as a single string; the recorded run returned a "not provided"
    # sentence here rather than a number.
    contact_number: str = Field(description="contact number from the given context")

In [34]:
def _format_docs(docs):
    """Render retrieved documents as plain text for the prompt.

    Without this step the list of Document objects is interpolated into the
    prompt as its Python repr (escaped newlines, metadata, local file path)
    instead of the page text itself.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, as
            returned by ``retriever``.

    Returns:
        The stripped, non-blank lines of each document joined by newlines,
        with documents separated from each other by one blank line. An empty
        input yields an empty string.
    """
    return "\n\n".join(
        "\n".join(
            line.strip()
            for line in doc.page_content.splitlines()
            if line.strip()  # drop the long runs of blank lines left by HTML layout
        )
        for doc in docs
    )


# Prompt for the extraction step. The instruction sentence names all four
# schema fields in the same order as `format_json`, so it no longer
# contradicts the format instructions appended right after it.
after_rag_template = """Answer the question based only on the following context:
        {context}

        Provide only these details, in this order, as a JSON object: university name, address, email address, contact number.
        {format_instructions}
        Question: {question}
        """
# Parser that both describes the expected JSON to the model (through its
# format instructions) and decodes the model's reply.
parser = JsonOutputParser(pydantic_object=format_json)
after_rag_prompt = ChatPromptTemplate.from_template(
    after_rag_template,
    # Filled in once here so callers only supply the question.
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
# retrieve -> flatten to text -> prompt -> local chat model -> parsed JSON.
# The string passed to .invoke() is used both as the retrieval query and as
# the {question} placeholder.
after_rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | after_rag_prompt
    | model_local
    | parser
)

In [35]:
# Run the whole chain once; the same sentence serves as the retrieval query
# and as the question shown to the model. The parsed reply lands in
# `json_output`.
json_output = after_rag_chain.invoke(
    "provide the metioned details like university_name,address,email_address,contact_number from the context and make sure you providing them from the given context"
)

In [36]:
# Show the parsed result of the chain call as the cell output.
json_output

{'university_name': 'University of the Witwatersrand',
 'address': 'School of Chemistry (Humphrey Raikes Building) | University of the Witwatersrand | Jorissen Street | Braamfontein 2000 | Johannesburg, South Africa',
 'email_address': '[Andreas.Lemmerer@wits.ac.za, Charles.DeKoning@wits.ac.za, Marc.humphries@wits.ac.za]',
 'contact_number': 'Not provided in the context'}