## Ingesting MongoDB documents

In [2]:
# Needed when running inside a Jupyter notebook: nest_asyncio.apply() patches
# asyncio so that event loops can be nested.
# NOTE(review): presumably required because loader.load() runs async code while
# the notebook's own event loop is already running — confirm.
import nest_asyncio

nest_asyncio.apply()

In [3]:
from langchain_community.document_loaders.mongodb import MongodbLoader

In [5]:
! pip install motor

Defaulting to user installation because normal site-packages is not writeable
Collecting motor
  Downloading motor-3.5.0-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting pymongo<5,>=4.5
  Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pymongo, motor
Successfully installed motor-3.5.0 pymongo-4.8.0


In [6]:
import os
from getpass import getpass

# Never commit database credentials to a notebook. Read the full connection
# string (mongodb+srv://<user>:<password>@<host>/...) from the MONGODB_URI
# environment variable, and fall back to a hidden interactive prompt when it
# is not set.
MONGODB_URI = os.environ.get("MONGODB_URI") or getpass("MongoDB connection string: ")

# Loader that reads documents from the `users` collection of the `test` database.
loader = MongodbLoader(
    connection_string=MONGODB_URI,
    db_name="test",
    collection_name="users",
)

In [15]:
# Load the documents from the MongoDB collection configured on `loader`,
# then display how many Document objects were returned.
data = loader.load()

len(data)

12

In [16]:
# Inspect the first loaded Document. The recorded output shows the whole record
# (including email, phone number and password hash) serialized into page_content.
# NOTE(review): consider excluding sensitive fields before embedding or sharing
# this notebook's outputs.
data[0]

Document(page_content="{'_id': ObjectId('65fc24253e3b709ab388b9d2'), 'username': 'muditmalviya', 'email': 'malviyamudit7@gmail.com', 'password': '$2b$10$R2K0oktHoxeLGbWUIpZrveeUZi2FrBHTWcVl03QCu49rIEZWgKxNO', 'phoneno': 9011813344.0, 'role': 'admin', 'isAvailable': True, '__v': 0, 'rewards': 2}", metadata={'database': 'test', 'collection': 'users'})

## Vector Embeddings

In [10]:
# Download the nomic-embed-text model through the Ollama CLI; it is the model
# passed to OllamaEmbeddings when the vector database is built.
!ollama pull nomic-embed-text

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest ⠋ [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100%

In [11]:
# List the models known to the local Ollama install to confirm the pull succeeded.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED       
mistral:latest         	2ae6f6dd7a3d	4.1 GB	15 hours ago  	
nomic-embed-text:latest	0a109f422b47	274 MB	37 seconds ago	


In [12]:
%pip install --q chromadb
%pip install --q langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [13]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [17]:
# Break the loaded documents into chunks before embedding them.
# In the recorded run 12 documents produced 12 embedded chunks, so each record
# fit inside a single chunk at this chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [18]:
# Embed every chunk with the local nomic-embed-text model (showing a progress
# bar) and store the vectors in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

OllamaEmbeddings: 100%|██████████| 12/12 [00:05<00:00,  2.29it/s]


## Retrieval

In [19]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [20]:
# Chat model served through Ollama. "mistral" is one of the models shown in the
# recorded `ollama list` output; `llm` is reused for query rewriting and for
# answering.
local_model = "mistral"
llm = ChatOllama(model=local_model)

In [21]:
# Prompt handed to the multi-query retriever: it asks the LLM to rewrite the
# user's question into five alternative phrasings, one per line, so that
# retrieval is less sensitive to the exact wording of the original question.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [22]:
# Retriever that uses the LLM and QUERY_PROMPT to generate several rewrites of
# the question and searches the vector store with them.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: the answer must be based only on the retrieved context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [23]:
# Inputs for the RAG prompt: the question is routed to the retriever to build
# {context}, and passed through unchanged as {question}.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Full pipeline: fill the prompt -> query the LLM -> return the reply as a string.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [24]:
# Ask the chain an interactive question. The original blank prompt gave no hint
# about what to type and sent an empty string to the chain when nothing was
# entered; the prompt is now labelled and an empty entry falls back to a
# default question.
DEFAULT_QUESTION = "List the usernames of all users in the collection"
question = input(f"Question [{DEFAULT_QUESTION}]: ").strip() or DEFAULT_QUESTION

chain.invoke(question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.20it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  8.58it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s]


' Based on the provided context, the following are the usernames in the collection:\n1. sudhanshu\n2. mahndra\n3. gauravpathak\n4. shubmamsingh\n5. harshala\n6. muditmalviya\n7. ayushbhople\n8. ramaspire\n9. keshavkhandelwa'

In [25]:
# Query by availability. The field is spelled `isAvailable` in the stored
# records; the question previously misspelled it as "isAvaliable".
chain.invoke("User having isAvailable field as true")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.39it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.19it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  6.05it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.92it/s]


" In the provided context, there are four documents representing users with the role 'technician'. Among them, two users have their 'isAvailable' field set to True. The usernames of these users are 'ramaspire' and 'harshala'. Therefore, the answer to your question is both 'ramaspire' and 'harshala'."

In [26]:
# Same availability query phrased to ask for all matching users. The field is
# spelled `isAvailable` in the stored records; the question previously
# misspelled it as "isAvaliable".
# NOTE(review): answers to "all ..." questions may be incomplete, presumably
# because only the retrieved documents reach the LLM — confirm against the data.
chain.invoke("All User having isAvailable field as true")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.64it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  6.80it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  5.16it/s]


" From the given context, the users with 'isAvailable' field set to True are 'mahendra', 'ramaspire' and 'harshala'. Their usernames are respectively 'mahendra', 'ramaspire' and 'harshala'."

In [None]:
# Clean up: delete the Chroma collection backing `vector_db` (created above as
# "local-rag"). This acts on the vector store only; nothing here touches the
# source MongoDB database.
vector_db.delete_collection()