In [1]:
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.vectorstores import FAISS

In [2]:
# Chat model served through Ollama ("mistral"), with temperature fixed at 0.
model_local = ChatOllama(
    model="mistral",
    temperature=0,
)

In [3]:
# Empty frame with one column per contact detail.
# NOTE(review): no cell visible in this notebook reads `df`, and its columns
# omit `university_name`, which `format_json` declares -- confirm whether this
# frame is still needed or should match the extraction schema.
df = pd.DataFrame(
    columns=[
        "address",
        "email_address",
        "contact_number",
    ]
)

In [18]:
# Path of the saved HTML page that the loader cell reads.
# NOTE(review): despite the name this points at a single file, and it is an
# absolute path on one user's Windows machine -- the notebook will not run
# elsewhere until this is made configurable/relative.
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\llm_testing_urls\\aiims.html"

In [19]:
# Read the saved HTML file as UTF-8 and parse it with BeautifulSoup's
# "html.parser" backend; `load()` returns the extracted document(s).
loader = BSHTMLLoader(
    directory,
    open_encoding="utf-8",
    bs_kwargs={"features": "html.parser"},
)
data = loader.load()
# Displayed in full; the recorded output is a list holding one Document.
data

[Document(page_content="\n\n\nAIIMS - All India Institute Of Medical Science\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDefault Theme\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIntranet Access\n@gsuite.aiims.edu\nSkip to main\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nScreen Reader Access\nहिन्दी\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n\nअखिल भारतीय आयुर्विज्ञान संस्थान, नई दिल्ली All India Institute Of Medical Sciences, New Delhi\n\n\n\n\nOPD Appointment\nAIIMS Dashboard\xa0\nORBO Donor Pledge Form\n48th Convocation 2023--- Photos\xa0\n\xa0\n\xa0\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHomeAbout UsIntroductionAbout AIIMSAIIMS Act, Rules & RegulationsAIIMS BillTeachingResearchResearch Output of AIIMSPatient CareDirectorCurrent DirectorFirst DirectorEx-DirectorsManagementOrganisational StructureAdministrationInstitute Body (IB)Governing Body (GB)Fin

In [20]:
# Token-counted splitter (tiktoken encoder): chunks of up to 7500 tokens with
# a 100-token overlap between neighbouring chunks.
# NOTE(review): the recorded run further down reports a single vector in the
# index, so at this chunk size the page appears to stay in one chunk --
# confirm that is intended.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=7500,
    chunk_overlap=100,
)

doc_splits = text_splitter.split_documents(data)

In [21]:

# Embed the chunks with Ollama's "nomic-embed-text" model and index them in
# an in-memory FAISS store.
embedding = embeddings.OllamaEmbeddings(model="nomic-embed-text")
db = FAISS.from_documents(
    documents=doc_splits,
    embedding=embedding,
)
# Number of vectors held by the FAISS index.
print(db.index.ntotal)


1


In [22]:
# Persist the FAISS index to disk (path is relative to the working directory).
db.save_local(folder_path="db_demos/aiims")

In [23]:
# Wrap the FAISS store as a retriever with default settings; the RAG chain
# below uses it to supply the prompt's {context}.
retriever = db.as_retriever()

In [24]:
# Schema of the JSON object the model is asked to return: the organisation's
# name plus its contact details, all as plain strings. Each Field description
# is handed to JsonOutputParser (see the chain cell) as part of the schema.
# NOTE(review): documented with `#` comments on purpose -- pydantic is
# understood to copy a class docstring into the generated schema, which would
# alter the format instructions sent to the model; confirm before adding one.
class format_json(BaseModel):
    # Name of the organisation found in the retrieved context.
    university_name : str = Field(description="organization name from the given context")
    # Postal address found in the retrieved context.
    address: str = Field(description="address from the given context")
    # Contact e-mail address found in the retrieved context.
    email_address: str = Field(description="email address from the given context")
    # Contact phone number found in the retrieved context.
    contact_number: str = Field(description="contact number from the given context")

In [25]:
# RAG extraction chain: retriever -> prompt -> local chat model -> JSON parser.

# The parser derives its format instructions from the `format_json` schema and
# also turns the model's reply into a Python object.
parser = JsonOutputParser(pydantic_object=format_json)

# Prompt text. {context} and {question} are filled per call, while
# {format_instructions} is bound once from the parser below.
after_rag_template = """Answer the question based only on the following context:
        {context} and only provide these details in this order only and the response will always will be containing these informations only such as address,contact number,email address in a json format
        {format_instructions}
        Question: {question}
        """

after_rag_prompt = ChatPromptTemplate.from_template(
    after_rag_template,
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
)

# The invoked string is passed both to the retriever (to fetch context) and,
# unchanged, into the prompt as the question.
# NOTE(review): the retriever's output is interpolated into {context} as-is;
# presumably that is the repr of a list of Documents -- consider joining
# `page_content` instead, and verify the effect on the model's answers.
after_rag_chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | after_rag_prompt
    | model_local
    | parser
)

In [26]:
# Run the chain once; the parsed reply is displayed as the cell output.
json_output = after_rag_chain.invoke(
    "provide the metioned details from the context and make sure you providing them from the given context"
)
json_output

{'university_name': 'AIIMS - All India Institute Of Medical Science',
 'address': 'Sector-12, RML Avenue, Anand Parbat, New Delhi, Delhi 110021, India',
 'email_address': 'mail@aiims.edu',
 'contact_number': '+91 11 2658 8500'}

In [27]:
# Flatten the parsed JSON reply into a one-row DataFrame for display.
# NOTE(review): the output saved under this cell shows a different
# organisation than `json_output` above and carries an "Unnamed: 0" column,
# so it appears to come from another run -- re-execute to refresh it.
dff = pd.json_normalize(data=json_output)
dff

Unnamed: 0,university_name,address,email_address,contact_number
0,Makerere University College Of Health Sciences,"Office of the Principal,\n Maker...",pr.chs[at]mak.ac.ug or the College Principal d...,+256 41 4530 020
