# Import Libraries

In [25]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from langchain.prompts import PromptTemplate

import os
from dotenv import load_dotenv
# Load settings from a local .env file (if present) so that
# os.environ.get('POSTGRES_CONNECTION_STRING') further down can find its value.
load_dotenv()


True

# Loading The LLM (Language Model)


In [26]:
# Text-generation model served by a local Ollama instance.
llm = Ollama(
    base_url="http://127.0.0.1:11434",
    model="nuextract",
)

# Setting Ollama Embeddings

In [27]:
# Embedding model, served by the same local Ollama endpoint as the LLM.
embed_model = OllamaEmbeddings(
    base_url="http://127.0.0.1:11434",
    model="nuextract",
)

# Loading HTML Content

In [28]:
# Read the HTML file as plain UTF-8 text; the markup is kept verbatim.
html_path = "sample.html"
loader = TextLoader(html_path, encoding="utf-8")
documents = loader.load()

In [29]:
# Display the loaded Document objects (raw HTML in page_content) as a sanity check.
documents

[Document(metadata={'source': 'sample.html'}, page_content='<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="Learn about monkeys, their habitats, diets, and interesting facts.">\n    <title>All About Monkeys</title>\n</head>\n<body>\n    <header>\n        <h1>Monkeys: Fascinating Creatures of the Jungle</h1>\n    </header>\n\n    <nav>\n        <ul>\n            <li><a href="#introduction">Introduction</a></li>\n            <li><a href="#habitat">Habitat</a></li>\n            <li><a href="#diet">Diet</a></li>\n            <li><a href="#fun-facts">Fun Facts</a></li>\n        </ul>\n    </nav>\n\n    <section id="introduction">\n        <h2>Introduction</h2>\n        <p>Monkeys are primates found in various parts of the world. Known for their intelligence and playful behavior, they are an integral part of tropical and subtropical ecosystems.</p>\n       

# Splitting Text Into Chunks

In [30]:
# The chunk size is far larger than the sample page, presumably so the whole
# HTML document stays in a single chunk and tags are not cut in half.
text_splitter = CharacterTextSplitter(
    chunk_size=1_000_000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

# Storing Embeddings in PGVector

In [33]:
# Embed the chunks and store them in a Postgres (pgvector) collection.
embeddings = embed_model
collection_name = "monkey"

# The connection string is read from the environment once and reused below.
connection_string = os.environ.get('POSTGRES_CONNECTION_STRING')

# NOTE(review): re-running this cell presumably inserts the same documents into
# the "monkey" collection again -- confirm whether the collection should be
# cleared first.
vector_store = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=collection_name,
    connection_string=connection_string,
    use_jsonb=True,
)

# Creating a Retriever


In [10]:
# Retrieve with maximal marginal relevance (MMR), asking for up to 100 documents.
# NOTE(review): MMR typically selects from a candidate pool whose size is a
# separate setting (fetch_k); confirm that k=100 is actually reachable here.
mmr_search_kwargs = {"k": 100}
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

# Creating a Retrieval Chain


In [11]:
# NOTE(review): this passes the bare LLM as `combine_docs_chain`, and `chain` is
# not referenced by any later cell shown in this notebook. The chain that is
# actually invoked is `retrieval_chain`, built further down. This looks like a
# leftover cell -- confirm and remove.
chain = create_retrieval_chain(combine_docs_chain=llm,retriever=retriever)

# Retrieval-QA Chat Prompt


In [None]:
# Prompt that instructs the model to return only the requested HTML tags.
# It exposes two placeholders: {context} and {input}.
# NOTE(review): presumably {context} receives the retrieved document text and
# {input} the tag requested by the user -- confirm against the chain setup.
scraper_template = '''
        %INSTRUCTIONS:
        You are a professional web scraper specializing in extracting HTML elements and their content based on the specified context.

        Please extract only the requested HTML tags and their contents according to the given context. Respond strictly in the following format:
        <tag_name>content</tag_name>
        ...

        Do not include any additional text, explanations, or formatting outside of the specified structure.

        %CONTEXT: 
        The type of tags or content to extract is as follows:
        {context}

        %PROMPT: 
        extract all of the following html tag/tags
        {input}
    '''

retrieval_qa_chat_prompt = PromptTemplate(
    template=scraper_template,
    input_variables=["context", "input"],
)


# Combining Documents


In [13]:
# Chain that formats the documents it is given into the prompt and sends it to the LLM.
combine_docs_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=retrieval_qa_chat_prompt,
)

# Final Retrieval Chain


In [14]:
# Full pipeline: the retriever supplies documents, combine_docs_chain produces the answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

# Invoking the Retrieval Chain


In [None]:
# HTML tag to extract; replace this with the tag you want to retrieve.
requested_tag = "<nav>content</nav>"
response = retrieval_chain.invoke({"input": requested_tag})

In [18]:
# Show only the model's extracted markup from the chain result.
response["answer"]

' <nav><ul><li><a href="#introduction">Introduction</a></li><li><a href="#habitat">Habitat</a></li><li><a href="#diet">Diet</a></li><li><a href="#fun-facts">Fun Facts</a></li></ul></nav>\n\n'