In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from tqdm import tqdm
from dotenv import load_dotenv, dotenv_values
# Load key/value pairs from a local .env file into the process environment
# before any of them are read below.
load_dotenv()

# Gemini chat model; the API key is looked up under GEMINI_API_KEY.
gemini_api_key = os.getenv("GEMINI_API_KEY")
llm = ChatGoogleGenerativeAI(model='gemini-pro', google_api_key=gemini_api_key)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke test: send a single prompt to the Gemini chat model configured above;
# the returned message is displayed as the cell's output.
llm.invoke("how MSDHONI made india won 2011 ICC Men's Cricket World cup?")

AIMessage(content="**Role of MS Dhoni in India's 2011 ICC Men's Cricket World Cup Victory:**\n\n**1. Captaincy and Leadership:**\n\n* Dhoni led the Indian team with exceptional composure and strategic decision-making throughout the tournament.\n* He motivated and inspired his players, fostering a winning mindset and team spirit.\n\n**2. Match-Winning Performances:**\n\n* Dhoni played several crucial innings, including an unbeaten 91 in the final against Sri Lanka.\n* His ability to finish games under pressure was instrumental in India's victories.\n\n**3. Wicketkeeping and Run-Outs:**\n\n* Dhoni was an exceptional wicketkeeper, with lightning-fast reflexes and accurate throws.\n* His run-outs, particularly in the semi-final against Pakistan, proved decisive moments.\n\n**4. Strategic Field Placements:**\n\n* Dhoni's astute field placements and bowling changes often outwitted the opposition.\n* He effectively used his fielders to cut off scoring opportunities and create pressure on bats

Split the PDF into chunks

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Raw string: the Windows path contains backslashes (`\V`, `\k`, `\m`) that a
# normal string literal treats as invalid escape sequences. Python currently
# only warns about those and plans to reject them, so the path is spelled as a
# raw literal; the resulting value is unchanged.
pdf_path = r"D:\Voice based conversational bot with pdf knowledge\knowledge documents\machine_learning_tutorial.pdf"

# Read the PDF into a list of Document objects; each carries `source` and
# `page` metadata that is stored alongside the chunk text later on.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: chunk length and overlap are measured with
# `length_function` (here `len`); separators are not treated as regular
# expressions.
chunk_size, chunk_overlap = 800, 80
length_function = len
is_separator_regex = False

# Collect the settings once and hand them to the splitter as keyword arguments.
splitter_options = dict(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=length_function,
    is_separator_regex=is_separator_regex,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks using the splitter configured above.
doc_chunks = text_splitter.split_documents(documents)

In [5]:
# Keep only the first chunk and display it. This rebinds `doc_chunks`, so the
# remaining chunks are no longer reachable under this name; re-run the
# splitter cell to get the full list back.
doc_chunks = doc_chunks[:1]
doc_chunks

[Document(page_content='i', metadata={'source': 'D:\\Voice based conversational bot with pdf knowledge\\knowledge documents\\machine_learning_tutorial.pdf', 'page': 0})]

Google Embeddings

In [6]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client for the `models/text-embedding-004` model.
# The API key is read from the GEMINI_API_KEY environment variable (the same
# variable the chat model is configured with) instead of being hard-coded, so
# that no credential is saved with the notebook.
# NOTE: the key that used to be embedded in this cell should be treated as
# compromised and rotated.
embedding_client = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    task_type="retrieval_document",
    google_api_key=os.getenv("GEMINI_API_KEY"),
)

In [7]:
# Display the chunks that are about to be embedded.
doc_chunks

[Document(page_content='i', metadata={'source': 'D:\\Voice based conversational bot with pdf knowledge\\knowledge documents\\machine_learning_tutorial.pdf', 'page': 0})]

In [8]:
# Parallel lists, one entry per chunk, in the shape `collection.add` expects:
# chunk text, chunk metadata, a unique id, and the embedding vector.
docs = []
metadatas = []
ids = []
embeddings = []

# Single pass over the chunks. The bookkeeping used to live in a second loop
# nested inside the embedding loop, which appended every chunk once per outer
# iteration (N*N documents with repeated ids for N embeddings) and rebound
# `chunk` in the middle of the outer loop.
for i, chunk in enumerate(
        tqdm(doc_chunks, desc="Embedding document chunks", unit="chunk")):
    # Embed the text of the current chunk.
    embedding = embedding_client.embed_query(chunk.page_content)
    embeddings.append(embedding)

    docs.append(chunk.page_content)      # chunk text
    metadatas.append(chunk.metadata)     # chunk metadata (e.g. page number)
    ids.append(f"chunk_{i}")             # unique identifier for the chunk

print("Storing chunks along with metadata in ChromaDB")

# The four lists must stay aligned index-for-index.
assert len(docs) == len(metadatas) == len(ids) == len(embeddings)

Embedding document chunks: 100%|██████████| 1/1 [00:01<00:00,  1.85s/chunk]

Storing chunks along with metadata in ChromaDB





In [9]:
# Length of the last embedding vector produced by the loop above.
len(embedding)

768

# Using ChromaDB

In [10]:
import chromadb

In [7]:
# On-disk location where the Chroma database files are kept.
persist_directory = r"D:\Voice based conversational bot with pdf knowledge\data\chroma"

# Create the directory on first use; either way, report what happened.
if os.path.exists(persist_directory):
    print(f"Directory already exists: {persist_directory}")
else:
    os.makedirs(persist_directory)
    print(f"Directory created: {persist_directory}")

Directory already exists: D:\Voice based conversational bot with pdf knowledge\data\chroma


In [32]:
# Open a persistent Chroma client rooted at `persist_directory`.
chroma_path = str(persist_directory)
chroma_client = chromadb.PersistentClient(path=chroma_path)
print(f"{chroma_client} chroma client made at {chroma_path}")

# Name of the collection that will hold the embedded PDF chunks.
collection_name = "rag_aiml_data"

<chromadb.api.client.Client object at 0x0000027B10846C40> chroma client made at D:\Voice based conversational bot with pdf knowledge\data\chroma


In [33]:
# Reuse the collection called `collection_name` if it exists, else create it.
collection = chroma_client.get_or_create_collection(name=collection_name)

# Show the collection object that later cells write to.
print(f"ChromaDB Collection has been build!, collection object : {collection}")

ChromaDB Collection has been build!, collection object : Collection(id=9ce8f329-9852-42d6-83e1-f52917a34e72, name=rag_aiml_data)


In [34]:
# Number of embedding vectors collected so far (one is appended per chunk).
len(embeddings)

1

In [35]:
# Write the chunk texts, metadata, embeddings and ids to the Chroma collection.
# The write stays best-effort (a failure does not abort the notebook), but only
# ordinary exceptions are caught, so Ctrl-C / kernel interrupts still propagate,
# and the cause of a failure is reported instead of being discarded.
try:
    print(f"___________starting adding embeddings to {collection}___________")
    collection.add(
        documents=docs,
        metadatas=metadatas,
        embeddings=embeddings,
        ids=ids,
    )
except Exception as exc:
    print("---Failed to add data to chroma collection---")
    print(f"{type(exc).__name__}: {exc}")
else:
    # Reached only when `collection.add` completed without raising.
    print("==============================")
    print("Data is stored in ChromaDB.")

___________starting adding embeddings to Collection(id=9ce8f329-9852-42d6-83e1-f52917a34e72, name=rag_aiml_data)___________
Data is stored in ChromaDB.


# Using Pinecone

In [11]:
# Pinecone settings. The API key is read from the environment (for example
# from the .env file loaded by `load_dotenv()`) so that no credential is saved
# with the notebook.
# NOTE: the key that used to be embedded in this cell should be treated as
# compromised and rotated.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")
pinecone_index_name = "voicebot"

# Report only whether the key is present; never echo the secret itself.
print(f"PINECONE_API_KEY : {'<set>' if PINECONE_API_KEY else '<missing>'}")
print(f"PINECONE_API_ENV : {PINECONE_API_ENV}")
print(f"PINECONE_INDEX_NAME : {pinecone_index_name}")

PINECONE_API_KEY : <redacted>
PINECONE_API_ENV : us-east-1
PINECONE_INDEX_NAME : voicebot


In [14]:
from langchain_pinecone import PineconeVectorStore
import os
import joblib
from dotenv import load_dotenv
load_dotenv()

True

Storing vectors in the remote Pinecone index

In [16]:
# Embed `doc_chunks` with `embedding_client` and store them in the Pinecone
# index named by `pinecone_index_name`.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents=doc_chunks,
    embedding=embedding_client,
    index_name=pinecone_index_name,
)

Querying Pinecone remotely

In [17]:
from pinecone import Pinecone

# Pinecone client authenticated with PINECONE_API_KEY, plus a handle on the
# index named by `pinecone_index_name`; `index` is what the query cell uses.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(pinecone_index_name)

In [28]:
# Question to look up in the indexed PDF chunks.
user_query = "what is machine learning?"

In [29]:
# Embed the question with the same client that embedded the document chunks.
embeded_query = embedding_client.embed_query(user_query)

In [39]:
# Query the Pinecone index with the embedded question. Up to `top_k` matches
# are requested, and metadata is included so each match's `metadata` (read by
# the cells below) is available.
retrieved_chunks = index.query(
    vector=embeded_query,
    top_k=10,
    include_metadata=True
)

In [37]:
# metadata_dict = retrieved_chunks['metadata']
# pages = retrieved_chunks['metadata']['page']
# source = retrieved_chunks['metadata']['source']

In [45]:
# Print the chunk text stored under `metadata['text']` of the first match.
# Raises IndexError if the query returned no matches.
print(retrieved_chunks['matches'][0]['metadata']['text'])

optimization techniques to find the best solution to your problem.  
 
Next,  let us look at the different categories of Machine Learning.


In [25]:
# Gather the text of every match, echoing each one with its position in the
# result list as it is collected.
retrieval_text = []
for rank, match in enumerate(retrieved_chunks.matches):
    chunk_text = match['metadata']['text']
    print(f"PAGE NUMBER {rank}----->  {chunk_text} \n\n\n")
    retrieval_text.append(chunk_text)

PAGE NUMBER 0----->  i 





In [26]:
# Concatenate the retrieved chunk texts into one string and display it.
# `retrieval_text` is rebound here from a list of strings to a single str.
# NOTE(review): the chunks are joined with no separator, so the end of one
# chunk runs directly into the start of the next — confirm this is intended.
retrieval_text = ''.join(retrieval_text)
retrieval_text

'i'