In [1]:
import os
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [53]:
df = pd.DataFrame(columns=["organization_name","address", "email_address", "contact_number"])
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [16]:
model_local = ChatOllama(model="mistral")


In [8]:
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\test\\test_1.html"

In [9]:
loader = BSHTMLLoader(directory, bs_kwargs={"features": "html.parser"})
data = loader.load()
data

[Document(page_content="\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n   Addresses and phone numbers - About Us - Mayo Clinic\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n     This content does not have an English version.\n    \n\n\n\n\n\n     This content does not have an Arabic version.\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Skip to content\n         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                          Care at \n Mayo Clinic\n                         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  Patient-Centered Care\n                                 \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  About Mayo Clinic\n                                 \n\n\n\n\n\n\

In [40]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=100)

doc_splits = text_splitter.split_documents(data)

In [41]:
print(len(doc_splits))

23


In [42]:
vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
        )
retriever = vectorstore.as_retriever()

In [43]:
class format_json(BaseModel):
    organization_name : str = Field(description="organization name from the given context")
    address: str = Field(description="address from the given context")
    email_address: str = Field(description="email address from the given context")
    contact_number: str = Field(description="contact number from the given context")

In [58]:
after_rag_template = """Answer the question based only on the following context:
        {context} and only provide these details in this order only and the response will always will be containing these informations only such as organization_name,address,contact_number,email_address in a json format and make sure that each entitly you providing make sense i mean the context of each entity has to be checked
        {format_instructions}
        Question: {question}
        """
parser = JsonOutputParser(pydantic_object=format_json)
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template,partial_variables={"format_instructions": parser.get_format_instructions()},)
after_rag_chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | parser
        )

In [59]:
json_output = after_rag_chain.invoke("provide the organization_name,address,email_address,contact_number from the context and make sure you providing them from the given context")

In [48]:
json_output

{'organization_name': 'Mayo Clinic',
 'address': 'Mayo Clinic Hospital, Methodist Campus\n201 W. Center St.\nRochester, MN 55902',
 'email_address': '844-544-0036 (toll-free)',
 'contact_number': '507-255-5123'}

In [54]:
dff = pd.json_normalize(json_output)

In [55]:
dff

Unnamed: 0,organization_name,address,email_address,contact_number
0,Mayo Clinic,"Mayo Clinic Hospital, Methodist Campus\n201 W. Center St.\nRochester, MN 55902",844-544-0036 (toll-free),507-255-5123


In [56]:
df = pd.concat([df, dff], ignore_index=True)

In [57]:
df

Unnamed: 0,organization_name,address,email_address,contact_number
0,Mayo Clinic,"Mayo Clinic Hospital, Methodist Campus\n201 W. Center St.\nRochester, MN 55902",844-544-0036 (toll-free),507-255-5123


In [9]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

model_local = ChatOllama(model="mistral",temperature=0)

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("C:\\Users\\Barani\\Desktop\\local_ollama\\pdfs\\ACC_Data_merged.pdf")
pages = loader.load()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=200)
doc_splits = text_splitter.split_documents(pages)
print(doc_splits[2])

page_content="American Cancer Society cancer.org | 1.800.227.2345 ____________________________________________________________________________________\ntimes, there is no obvious cause.\nWhat is the cancer stage?\xa0\xa0\nWhen a cancer is found, tests are done to see how big the cancer is and whether it has\nspread from where it started. This is called the cancer's stage2.\nA lower stage (such as a stage 1 or 2) means that the cancer has not spread very\nmuch. A higher number (such as a stage 3 or 4) means it has spread more. Stage 4 is\nthe highest stage.\nThe stage of the cancer is very important in choosing the best treatment for a person.\nAsk your doctor about your cancer's stage and what it means for you.\nHow does cancer spread?\xa0\xa0\nCancer can spread from where it started (the primary site) to other parts of the body.\nWhen cancer cells break away from a tumor, they can travel to other areas of the body\nthrough either the bloodstream or the lymph system. Cancer cells that 

In [7]:
len(pages)

168

In [8]:
len(doc_splits)

168