In [1]:
from langchain_community.vectorstores import Chroma
import chromadb
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
import os

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever



In [2]:
# Creating a collection in local disk Using chromadb

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persistent_client = chromadb.PersistentClient(path="Data/chroma")
collection = persistent_client.get_or_create_collection("test_collection")
# collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
persistent_client

<chromadb.api.client.Client at 0x1902b7d2700>

In [None]:
loader = CSVLoader(r"csv\fb.csv", encoding="latin-1")
db = Chroma.from_documents(documents=loader.load(), collection_name="test_collection",embedding=embedding_function, persist_directory="chroma")
query = "Can exercise help with depression?"
docs = db.similarity_search(query)

# print results
print(docs[0].page_content)


In [None]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x1b8d64716d0>

### Multiple Collections

In [5]:

def get_file_paths_recursively(folder_path):
    file_paths = []
    for root, directories, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    return file_paths

def Vdb_loader(file_paths):
    for i in range(len(file_paths)):
        loader = CSVLoader(file_path=file_paths[i], encoding="latin-1")
        db = Chroma.from_documents(documents=loader.load(), embedding=embedding_function, collection_name="test_collection", persist_directory="Data/chroma") # pars to imclude (docs, emb_fun, col_name, direct_path)
    return db


folder_path = "Data\csv"  # Replace with your actual folder path
file_paths = get_file_paths_recursively(folder_path)

# print(data)

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

#loaded all the files
db = Vdb_loader(file_paths)

# db.add_collection("pdf")

# query it
query = "Can people with mental illness recover?"
docs = db.similarity_search(query)

# print results
print(docs[0].page_content)

: 4
Questions_cleaned: Can people with mental illness recover?
Answers_cleaned: When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a range of effective treatments available. For any type of treatment, it is essential that the person affected is proactive and fully engaged in their own recovery process.
Many people with mental illnesses who are diagnosed and treated respond well, although some might experience a return of symptoms. Even in such cases, with careful monitoring and management of the disorder, it is still quite possible to live a fulfilled and productive life.


In [7]:
loader = PyMuPDFLoader("Data\PDFs\DepressionGuide-web.pdf")
documents  = loader.load()

In [8]:
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)


In [9]:
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embedding_function, persist_directory="Data/chroma")

In [10]:
store = InMemoryStore()
def instantiate_rag():
    rag_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    rag_retriever.add_documents(documents)
    return rag_retriever

In [11]:
instantiate_rag()

ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001902C4D0790>, docstore=<langchain.storage.in_memory.InMemoryBaseStore object at 0x000001902E3CC640>, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000019002D87B20>, parent_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000019002D87190>)

In [19]:
persistent_client = chromadb.PersistentClient(path="Data/chroma")
persistent_client.get_collection_names()
# collection = persistent_client.get_collection(name="test_collection")
# print(collection.count())



AttributeError: 'Client' object has no attribute 'get_collection_names'

In [None]:
#m Loading the VDB into variable using langchain
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="test_collection",
    embedding_function=embedding_function,
)

print("There are", langchain_chroma._collection.count(), "in the collection")

In [None]:
persistent_client.delete_collection(name="split_parents") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible


## Retrival only

In [22]:
persistent_client = chromadb.PersistentClient(path="Data/chroma")


In [23]:
db1 = Chroma(
    client=persistent_client,
    collection_name="test_collection",
    embedding_function=embedding_function,
)


In [24]:
db2 = Chroma(
    client=persistent_client,
    collection_name="split_parents",
    embedding_function=embedding_function,
)


In [25]:
db1.similarity_search("Can people with mental illness recover?")

[Document(page_content=': 4\nQuestions_cleaned: Can people with mental illness recover?\nAnswers_cleaned: When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a range of effective treatments available. For any type of treatment, it is essential that the person affected is proactive and fully engaged in their own recovery process.\nMany people with mental illnesses who are diagnosed and treated respond well, although some might experience a return of symptoms. Even in such cases, with careful monitoring and management of the disorder, it is still quite possible to live a fulfilled and productive life.', metadata={'row': 4, 'source': 'Data\\csv\\dp.csv'}),
 Document(page_content=': 4\nQuestions_cleaned: Can people with mental illness recover?\nAnswers_cleaned: When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a ra

In [26]:
db2.similarity_search("Can people with mental illness recover?")


[Document(page_content='is experiencing symptoms of depression, \nthey need to also seek treatment.\nResources\n• American Academy of Child & \nAdolescent Psychiatry (AACAP) \nhttps://www.aacap.org/aacap/\nFamilies_and_Youth/Resource_Centers/\nDepression_Resource_Center/Home.aspx\n• National Alliance on Mental Illness (NAMI) \nhttps://www.nami.org/Find-Support/\nFamily-Members-and-Caregivers', metadata={'author': '', 'creationDate': "D:20190521112126-04'00'", 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'doc_id': '145e82e1-2adf-4451-8200-d1e7e23908a0', 'file_path': 'Data\\PDFs\\DepressionGuide-web.pdf', 'format': 'PDF 1.6', 'keywords': '', 'modDate': "D:20190620101312-04'00'", 'page': 15, 'producer': 'Adobe PDF Library 15.0', 'source': 'Data\\PDFs\\DepressionGuide-web.pdf', 'subject': '', 'title': '', 'total_pages': 20, 'trapped': ''}),
 Document(page_content='Cathryn A. Galanter, MD\nJessica M. Jones, MA, LPA\nBeth Kennard, PsyD\nJerry Pavlon-Blum, Representative from Depression and 

In [28]:
ins = [db1,db2]

In [29]:
for i in ins:
    print(i.similarity_search("Can people with mental illness recover?"))


[Document(page_content=': 4\nQuestions_cleaned: Can people with mental illness recover?\nAnswers_cleaned: When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a range of effective treatments available. For any type of treatment, it is essential that the person affected is proactive and fully engaged in their own recovery process.\nMany people with mental illnesses who are diagnosed and treated respond well, although some might experience a return of symptoms. Even in such cases, with careful monitoring and management of the disorder, it is still quite possible to live a fulfilled and productive life.', metadata={'row': 4, 'source': 'Data\\csv\\dp.csv'}), Document(page_content=': 4\nQuestions_cleaned: Can people with mental illness recover?\nAnswers_cleaned: When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a ran

## LLM Gen

In [10]:
## All Imports
from langchain.memory import ChatMessageHistory, ConversationSummaryBufferMemory, ConversationBufferMemory
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain.chains import (
    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
import os
from langchain_community.vectorstores import Chroma
import chromadb
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings



In [11]:
#preparation Data
HUGGINGFACEHUB_API_TOKEN = "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"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

llm = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.2, token=HUGGINGFACEHUB_API_TOKEN
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
                    token was transferred to model_kwargs.
                    Please make sure that token is what you intended.


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\phane\.cache\huggingface\token
Login successful


In [12]:
ch = ChatMessageHistory()
memory = ConversationBufferMemory(
    llm=llm, 
    memory_key="chat_history",
    return_messages=True,
    output_key='answer',
    input_key='question')

In [22]:
vectorstore = Chroma(collection_name="test_collection", persist_directory="Data/chroma", embedding_function=embedding_function )
retriever = vectorstore.as_retriever()

# This controls how the standalone question is generated.
# Should take `chat_history` and `question` as input variables.
template = (
    "Combine the chat history and follow up question into "
    "a standalone question. Chat History: {chat_history}"
    "Follow up question: {question}"
)
prompt = PromptTemplate.from_template(template)
llm_chain = LLMChain(llm=llm, prompt=prompt)
combine_docs_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context")
# chain = ConversationalRetrievalChain(
#     combine_docs_chain=combine_docs_chain,
#     memory= memory,
#     retriever=retriever,
#     question_generator=llm_chain
# )
retrieval_chain = ConversationalRetrievalChain.from_llm(
    llm = llm,
    retriever=retriever,
    memory = memory,
    return_source_documents=False,
    verbose=True,
    #condense_question_prompt=condense_question_prompt,
    # chain_type = "stuff",
#     combine_docs_chain_kwargs={'prompt': qa_prompt} # https://github.com/langchain-ai/langchain/issues/6879
)


ValidationError: 1 validation error for StuffDocumentsChain
__root__
  document_variable_name context was not found in llm_chain input_variables: ['chat_history', 'question'] (type=value_error)