In [1]:
import os
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [2]:
model_local = ChatOllama(model="gemma")

In [3]:
df = pd.DataFrame(columns=["university_name", "address", "email_address", "contact_number"])

In [4]:
directory = "C:\\Users\\Barani\\Desktop\\local_ollama\\test"

In [5]:
for filename in os.listdir(directory):
    if filename.endswith(".html"):
        filepath = os.path.join(directory, filename)
        
        # Load the HTML file
        loader = BSHTMLLoader(filepath, bs_kwargs={"features": "html.parser"})
        data = loader.load()
        
        # Split the document
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=5000, chunk_overlap=100)
        doc_splits = text_splitter.split_documents(data)
        
        # Create the vectorstore
        vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
        )
        
        # Create the retriever
        retriever = vectorstore.as_retriever()
        
        # Define the Pydantic model
        class format_json(BaseModel):
            university_name : str = Field(description="The name of the university or organization found in the given context.")
            address: str = Field(description="The physical address of the university or organization found in the given context.")
            email_address: str = Field(description="The email address associated with the university or organization found in the given context.")
            contact_number: str = Field(description="The contact number associated with the university or organization found in the given context.")
        
        # Define the prompt template
        parser = JsonOutputParser(pydantic_object=format_json)
        after_rag_template = """Based on the following context:
                {context}

                Please extract and provide the following details in the order listed below. The response should only contain these details directly extracted from the given context. If multiple instances of each detail are found, please include all of them.

                {format_instructions}

                1. University name
                2. Address
                3. Contact number
                4. Email address

                Question: {question}
                """
        after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template,partial_variables={"format_instructions": parser.get_format_instructions()},)
        
        # Define the chain
        after_rag_chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | parser
        )
        
        # Invoke the chain
        json_output = after_rag_chain.invoke("Extract the informations from the context entities like university name, address, contact number, and email address from the given context.")
        
        # Convert the JSON output to a DataFrame
        dff = pd.json_normalize(json_output)

        # Append the new DataFrame to the main DataFrame
        df = pd.concat([df, dff], ignore_index=True)

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [17]:
df

Unnamed: 0,university_name,address,email_address,contact_number,required,properties.university_name,properties.address,properties.contact_number,properties.email_address
0,[Mayo Clinic],"[Mayo Clinic\n101 3rd Street SW\nRochester, MN...",[info@mayoclinic.org],"[1-800-323-9999, 507-284-2510]",,,,,
1,,,,,"[university_name, address, contact_number, ema...",[Mayo Clinic],"[800 First St SW, Rochester, MN 55905]","[1-800-327-5678, 507-284-2511, 1-800-611-3262]",[info@mayoclinic.org]


In [10]:
print(json_output)

{'university_name': ['Mayo Clinic'], 'address': ['Mayo Clinic\n101 3rd Street SW\nRochester, MN 55905-0024\nUnited States'], 'email_address': ['info@mayoclinic.org'], 'contact_number': ['1-800-323-9999', '507-284-2510']}


In [None]:
df.to_csv("output.csv", index=False)