In [1]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
schema = StructType([
    StructField("address", StringType(), True),
    StructField("email_address", StringType(), True),
    StructField("contact_number", StringType(), True)
])

In [5]:
schema

StructType([StructField('address', StringType(), True), StructField('email_address', StringType(), True), StructField('contact_number', StringType(), True)])

In [6]:
df = spark.createDataFrame([], schema)

In [7]:
model_local = ChatOllama(model="mistral")

In [8]:
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\test\\test_1.html"

In [9]:
loader = BSHTMLLoader(directory, bs_kwargs={"features": "html.parser"})
data = loader.load()
data

[Document(page_content="\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n   Addresses and phone numbers - About Us - Mayo Clinic\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n     This content does not have an English version.\n    \n\n\n\n\n\n     This content does not have an Arabic version.\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Skip to content\n         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                          Care at \n Mayo Clinic\n                         \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  Patient-Centered Care\n                                 \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                  About Mayo Clinic\n                                 \n\n\n\n\n\n\

In [10]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)

doc_splits = text_splitter.split_documents(data)

In [11]:
vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
        )
retriever = vectorstore.as_retriever()

In [12]:
class format_json(BaseModel):
    university_name : str = Field(description="organization name from the given context")
    address: str = Field(description="address from the given context")
    email_address: str = Field(description="email address from the given context")
    contact_number: str = Field(description="contact number from the given context")

In [13]:
after_rag_template = """Answer the question based only on the following context:
        {context} and only provide these details in this order only and the response will always will be containing these informations only such as address,contact number,email address in a json format
        {format_instructions}
        Question: {question}
        """
parser = JsonOutputParser(pydantic_object=format_json)
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template,partial_variables={"format_instructions": parser.get_format_instructions()},)
after_rag_chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | parser
        )

In [14]:
json_output = after_rag_chain.invoke("provide the metioned details from the context and make sure you providing them from the given context")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


OutputParserException: Invalid json output: Based on the provided context, here is the JSON instance of the schema with the required details:
```json
{
  "university_name": "Mayo Clinic",
  "address": "[200 First St. SW, Rochester, MN 55905]",
  "email_address": "[844-544-0036 (toll-free)]",
  "contact_number": "[507-284-2511 (General number), 507-538-3270 (Appointment Office), 844-217-9591 (Insurance and billing) (toll free), 507-284-8884 (International Appointment Office), 507-266-0909 (for international callers)]",
}
```
This JSON instance conforms to the given schema and includes all the required details from the context.