In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


Local LLM (Ollama - Mistral-7b)
---

In [3]:
model_local = ChatOllama(model="mistral",temperature=0)

In [1]:
!pip show lxml

Name: lxml
Version: 5.1.0
Summary: Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.
Home-page: https://lxml.de/
Author: lxml dev team
Author-email: lxml-dev@lxml.de
License: BSD-3-Clause
Location: C:\Users\Barani\Desktop\local_ollama\ollama_2\Lib\site-packages
Requires: 
Required-by: unstructured


HTML loader
---

In [3]:
loader = BSHTMLLoader("C:\\Users\\Barani\\Desktop\\local_ollama\\html\\cancer_gov_pages.html",open_encoding='utf-8')
data = loader.load()
# print(data)

In [4]:
len(data)

1

Spliting into chunks
---

In [5]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=200)
doc_splits = text_splitter.split_documents(data)
# len(doc_splits)

Created a chunk of size 16803, which is longer than the specified 7500
Created a chunk of size 10873, which is longer than the specified 7500
Created a chunk of size 25747, which is longer than the specified 7500
Created a chunk of size 62502, which is longer than the specified 7500
Created a chunk of size 18729, which is longer than the specified 7500
Created a chunk of size 86418, which is longer than the specified 7500
Created a chunk of size 62897, which is longer than the specified 7500
Created a chunk of size 31533, which is longer than the specified 7500
Created a chunk of size 13323, which is longer than the specified 7500
Created a chunk of size 8938, which is longer than the specified 7500
Created a chunk of size 75041, which is longer than the specified 7500
Created a chunk of size 75750, which is longer than the specified 7500
Created a chunk of size 8034, which is longer than the specified 7500
Created a chunk of size 7835, which is longer than the specified 7500
Created a

In [6]:
len(doc_splits)

5736

Converting into embeddings and store into ChromaDB
---

Save to disk

In [10]:
# vectorstore = Chroma.from_documents(
#             documents=doc_splits,
#             collection_name="rag-chroma",
#             embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
#             persist_directory="./chroma"
#         )
# vectorstore.persist()
# retriever = vectorstore.as_retriever()

Load from disk

In [11]:
embedding_function=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),

In [12]:
vectorstore = Chroma(persist_directory="C:\\Users\\Barani\\Desktop\\local_ollama\\chroma",embedding_function=embedding_function)
# retriever = vectorstore.as_retriever()

In [13]:
query="what is cancer"

In [15]:
# docs = vectorstore.similarity_search(query)

In [10]:
print(vectorstore._collection.count())

0


In [40]:
type(retriever)

langchain_core.vectorstores.VectorStoreRetriever

In [41]:
print(dir(retriever))

['Config', 'InputType', 'OutputType', '__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__orig_bases__', '__parameters__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__ror__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '__weakref__', '_abatch_with_confi

In [33]:
# import chromadb
# persistent_client = chromadb.PersistentClient()
# collection = persistent_client.get_or_create_collection("rag-chroma")
# langchain_chroma = Chroma(
#     client=persistent_client,
#     collection_name="rag-chroma",
#     embedding_function=embedding_function,
# )
# retriever = langchain_chroma.as_retriever()

Before RAG 
---

In [None]:
# 3. Before RAG
print("Before RAG\n")
before_rag_template = "{topic}"
before_rag_prompt = ChatPromptTemplate.from_template(before_rag_template)
before_rag_chain = before_rag_prompt | model_local | StrOutputParser()

In [None]:
print(before_rag_chain.invoke({"topic": "provide me the General number of mayo clinic which is in Arizona?"}))

After RAG
---

In [38]:
# 4. After RAG
print("\n########\nAfter RAG\n")
# after_rag_template = """provide the answer for the question only from this given context:
# {context}
# Question: {question} and only give the answer from the given context
# """
after_rag_template = """Given the context:
{context}

Question: {question}

Please provide all instances of the answer from the given context. If there are multiple instances, list them all.
"""
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
after_rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | after_rag_prompt
    | model_local
    | StrOutputParser()
)



########
After RAG



In [9]:
chunks = []
async for chunk in after_rag_chain.astream("what are the informations are you able to see in the given context and tell in a brief manner?"):
    chunks.append(chunk)
    print(chunk, end="", flush=True)

 The given context is a section of a webpage from the National Cancer Institute (NCI) website about childhood mesothelioma. Here are some key points and pieces of information that can be extracted from it:

* Malignant mesothelioma is a type of cancer where malignant cells form in the thin layer of tissue called the mesothelium, which covers various organs such as the lungs, heart, and abdominal cavity.
* Symptoms of childhood mesothelioma may include chest pain, shortness of breath, coughing, and fluid buildup in the chest or abdomen.
* Diagnosis of childhood mesothelioma involves imaging tests such as X-rays, CT scans, MRI scans, and PET scans, as well as biopsies to confirm the presence of cancer cells.
* Treatment options for childhood mesothelioma include surgery, chemotherapy, radiation therapy, and a combination of these approaches.
* Prognosis (chance of recovery) depends on various factors such as whether the cancer has spread throughout the thin layer of tissue or into organs

In [39]:
chunks = []
async for chunk in after_rag_chain.astream("what are the informations are you able to see in the given context and tell in a brief manner?"):
    chunks.append(chunk)
    print(chunk, end="", flush=True)

AttributeError: 'tuple' object has no attribute 'embed_query'