## 🔮 बोल  भिडू ! 

In [89]:
import cohere
import streamlit as st

# Cohere client shared by every embedding call in this notebook.
# The API key comes from Streamlit's secrets store rather than being written here.
co = cohere.Client(api_key=st.secrets.COHERE_API_KEY)
# pc = Pinecone(api_key=st.secrets.PINECONE_API_KEY)


### 1. Loading document

In [71]:
import os

# Location of the source document. The default is the path this notebook was
# originally run against; set the MARATHI_RAG_DOC environment variable to point
# at the file on another machine instead of editing this cell.
doc_path = os.environ.get(
    "MARATHI_RAG_DOC",
    "C:/Users/mayur dabade/Desktop/Projects/marathi RAG/data/maharaj.txt",
)

# Read the whole document as UTF-8; `text` is the input to the chunking cell.
with open(doc_path, encoding='utf-8') as f:
    text = f.read()

# Whitespace-separated token count, shown only as a rough size indicator.
print(f"The text has roughly {len(text.split())} words.")

The text has roughly 1030 words.


### 2. Splitting doc into chunks

In [72]:
# For chunking let's use langchain to help us split the text
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [73]:
# Splitter settings: chunk_size and chunk_overlap are measured with `len`
# (i.e. in characters), and separators are treated as plain strings, not regexes.
splitter_options = dict(
    chunk_size=512,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the single input text into overlapping documents, then keep only
# the string content of each one.
chunk_documents = text_splitter.create_documents([text])
chunks = [document.page_content for document in chunk_documents]
print(f"The text has been broken down in {len(chunks)} chunks.")

The text has been broken down in 15 chunks.


### 3. Embed every text chunk


In [74]:
# The chunks are the corpus being searched over, so they are embedded with
# input_type="search_document".
model = "embed-multilingual-v3.0"
response = co.embed(
    model=model,
    texts=chunks,
    embedding_types=["float"],
    input_type="search_document",
)
# Float vectors for the chunks; later cells rely on embeddings[i] belonging to chunks[i].
embeddings = response.embeddings.float
print(f"We just computed {len(embeddings)} embeddings.")

We just computed 15 embeddings.


### Alternative (disabled): storing ids, chunks and embeddings in Pinecone

In [75]:
# NOTE: this whole cell is commented out and is not executed. It is an alternative
# to the upsert cell that follows: it would store each chunk's text as Pinecone
# metadata ("description") and write to an index named "products".
# SECURITY: a literal Pinecone API key used to be written here as a fallback
# default. It has been redacted; treat that key as compromised and rotate it.
# import os
# from pinecone import Pinecone, ServerlessSpec

# # Initialize Pinecone with your API key (read it from the environment; never hard-code it)
# api_key = os.environ["PINECONE_API_KEY"]
# pc = Pinecone(api_key=api_key)

# # Create a serverless index
# # Ensure "dimension" matches the dimensions of the vectors you upsert
# pc.create_index(name="products", dimension=len(embeddings[0]), 
#                 spec=ServerlessSpec(cloud='aws', region='us-east-1'))

# # Target the index
# index = pc.Index("products")

# # Prepare the vectors and metadata for upsert
# vectors_to_upsert = []
# for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
#     vector = {
#         "id": f"vector_{i}",
#         "values": embedding,
#         "metadata": {"description": chunk}
#     }
#     vectors_to_upsert.append(vector)

# # Upsert vectors into the Pinecone index
# index.upsert(vectors=vectors_to_upsert)

# print("Vectors successfully upserted into Pinecone!")


### 4. Storing ids and embeddings in Pinecone

In [76]:
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone with the API key held in Streamlit secrets
pc = Pinecone(api_key=st.secrets.PINECONE_API_KEY)

# Name of the index that stores the chunk embeddings
index_name = "product"

# Create the serverless index only when it does not exist yet. An unconditional
# create_index call fails once the index is there, which made this cell
# impossible to re-run (e.g. on "Restart Kernel -> Run All").
# "dimension" must match the dimensions of the vectors being upserted.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(embeddings[0]),
        metric="cosine",  # stated explicitly; the retrieval step relies on cosine similarity
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Target the index
index = pc.Index(index_name)

# Prepare the vectors for upsert (only id and embedding). The id is the chunk's
# position in `chunks` as a string, so a match can be mapped back to its text
# by converting the id to an int.
vectors_to_upsert = [
    {"id": str(position), "values": embedding}
    for position, embedding in enumerate(embeddings)
]

# Upsert vectors into the Pinecone index (same ids overwrite on re-run)
index.upsert(vectors=vectors_to_upsert)

print("Vectors successfully upserted into Pinecone!")


Vectors successfully upserted into Pinecone!


### 5. Given a user query, retrieve the relevant chunks from the vector database


### Define the user question

In [77]:
# User question that is embedded and answered by the rest of the notebook.
# The literal had been stored mis-encoded (UTF-8 bytes shown as Mac Roman
# characters); it is restored to the intended Devanagari text here so the
# embedding model receives the real question.
query = "सगळ्यात जास्त शेअर असणाऱ्या भारतातील कंपनी?"
# Alternative questions to try (same encoding repair applied):
# query="सीसीआयने या मर्जरला काही अटींसह मंजुरी का दिली? या अटींचा उद्देश काय असू शकतो?"
# query = "मुघल व आदिलशाही फौजांचा यशस्वीपणे सामना"


### 6. Embed the user question


In [78]:
# Because the text being embedded is the search query, we set the input type as search_query
# The text embedded here is the search query itself, so the request uses
# input_type="search_query" with the same model as the chunk embeddings.
response = co.embed(
    model=model,
    texts=[query],
    embedding_types=["float"],
    input_type="search_query",
)
# Exactly one text was sent, so the first float vector is the query embedding
query_embedding = response.embeddings.float[0]
# print("query_embedding: ", query_embedding)

### 7. Retrieve the most relevant chunks from the vector database

We use cosine similarity between the query embedding and the stored chunk embeddings to find the most similar chunks.

In [79]:
# Target the index that holds the chunk embeddings
index = pc.Index("product")

# Ask for the 3 stored vectors closest to the query embedding. Only the id and
# score of each match are read below, so the vector values themselves are not
# requested (include_values=False); fetching them only enlarged the response.
query_results1 = index.query(
    # namespace="example-namespace1",
    vector=query_embedding,
    top_k=3,
    include_values=False
)
# query_results1

# Keep just the id and similarity score of every match
result = [{'id': match['id'], 'score': match['score']} for match in query_results1['matches']]

print(result)

[{'id': '4', 'score': 0.63326323}, {'id': '11', 'score': 0.611728072}, {'id': '1', 'score': 0.609839082}]


In [80]:
# Vector ids were written as the stringified chunk position, so converting
# them back to int yields indices into `chunks`.
ids = [int(match['id']) for match in result]
print(ids)

# Show every retrieved chunk, each preceded by a blank line
for chunk_index in ids:
    print("\n", chunks[chunk_index])

[4, 11, 1]

 ‡§Ø‡§æ ‡§Ø‡§æ ‡§ï‡§Ç‡§™‡§®‡•Ä‡§§ ‡§∏‡§∞‡•ç‡§µ‡§æ‡§ß‡§ø‡§ï ‡§µ‡§∞‡•ç‡§ö‡§∏‡•ç‡§µ ‡§π‡•á ‡§∞‡§ø‡§≤‡§æ‡§Ø‡§®‡•ç‡§∏ ‡§Ö‡§∏‡§£‡§æ‡§∞ ‡§Ü‡§π‡•á ‡§Ø‡§æ ‡§ï‡§Ç‡§™‡§®‡•Ä‡§Æ‡§ß‡•ç‡§Ø‡•á ‡§°‡§ø‡§ú‡§®‡•Ä ‡§ï‡§° 3684% ‡§Ç‡§ö‡•Ä ‡§≠‡§æ‡§ó‡•Ä‡§¶‡§æ‡§∞‡•Ä ‡§Ö‡§∏‡•á‡§≤ ‡§§‡§∞ ‡§â‡§∞‡§≤‡•á‡§≤‡§æ 75% ‡§µ‡§æ‡§ü‡§æ ‡§π‡§æ ‡§∏‡•ç‡§ü‡§æ‡§∞ ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ‡§ö‡•á ‡§∏‡•Ä‡§à‡§ì ‡§â‡§¶‡§Ø ‡§∂‡§Ç‡§ï‡§∞ ‡§Ü‡§£‡§ø ‡§ú‡•á‡§Æ‡•ç‡§∏ ‡§Æ‡•Å‡§∞‡§¶‡•ã‡§ú ‡§Ø‡§æ‡§Ç‡§ö‡•ç‡§Ø‡§æ ‡§¨‡•ã‡§ß‡•Ä ‡§ü‡•ç‡§∞‡•Ä ‡§Ø‡§æ ‡§ú‡•â‡§à‡§Ç‡§ü ‡§µ‡•ç‡§π‡•á‡§Ç‡§ö‡§∞ ‡§ï‡§°‡•á ‡§Ö‡§∏‡§£‡§æ‡§∞ ‡§Ü‡§π‡•á ‡§Æ‡§æ‡§§‡•ç‡§∞ ‡§Ø‡§æ ‡§è‡§ï‡§§‡•ç‡§∞‡•Ä‡§ï‡§∞‡§£‡§æ‡§∏‡§æ‡§†‡•Ä ‡§ï‡•ã‡§∞‡•ç‡§ü‡§æ‡§¶‡•ç‡§µ‡§æ‡§∞‡•á ‡§Æ‡§æ‡§®‡•ç‡§Ø‡§§‡§æ ‡§™‡•ç‡§∞‡§æ‡§™‡•ç‡§§ ‡§∏‡§Ç‡§∏‡•ç‡§•‡•á‡§ö‡•Ä ‡§™‡§∞‡§µ‡§æ‡§®‡§ó‡•Ä ‡§Ö‡§∏‡§£‡§Ç ‡§ó‡§∞‡§ú‡•á‡§ö‡§Ç ‡§Ü‡§π‡•á ‡§Ø‡§æ ‡§™‡§∞‡§µ‡§æ‡§®‡§ó‡•Ä ‡§®‡§Ç‡§§‡§∞‡§ö ‡§Ø‡§æ ‡§¶‡•ã‡§® ‡§ï‡§Ç‡§™‡§®‡•ç‡§Ø‡§æ ‡§è‡§ï‡§§‡•ç‡§∞ ‡§Ø‡•á‡§ä ‡§∂‡§ï‡§§‡§æ‡§§ ‡§Ü‡§§‡§æ ‡§Ø‡§æ ‡§°‡•Ä‡§≤ ‡§∏‡§æ‡§†‡•Ä ‡§≠‡§æ‡§∞‡§§‡§æ‡§§‡•Ä‡§≤ ‡§ï‡•â

### Matching Chunks for the user query

In [81]:
# Gather the retrieved chunk texts and join them, blank-line separated,
# into the single context string that is placed in the prompt.
retrieved_chunks = [chunks[chunk_index] for chunk_index in ids]
context = "\n\n".join(retrieved_chunks)
# print(context)

### 8. Designing the final answer

In [82]:


# Prompt layout sent to the LLM: instructions first, then the retrieved context
# and the user question. The two named placeholders are filled in below.
prompt_layout = """Use the following pieces of context to answer the user question. This context retrieved from a knowledge base and you should use only the facts from the context to answer.
Your answer must be based on the context. If the context not contain the answer, just say that 'I don't know', don't try to make up an answer, use the context.
Don't address the context directly, but use it to answer the user question like it's your own knowledge.
Use three sentences maximum. answer should be in Marathi.

Context:
{context}

Question: {query}
"""

# Final prompt text with the current context and question substituted in
template = prompt_layout.format(context=context, query=query)


In [85]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# Export the Google API key through the GOOGLE_API_KEY environment variable
# unless it is already set. The key is copied straight from Streamlit secrets.
# The previous code passed the secret to getpass.getpass() as the *prompt*
# argument, which displayed the key and then blocked waiting for typed input.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = st.secrets.GOOGLE_API_KEY

# Send the fully rendered prompt to Gemini and print the text of its reply
llm = ChatGoogleGenerativeAI(model="gemini-pro")
result = llm.invoke(template)
print(result.content)

सगळ्यात जास्त शेअर असणारी भारतीय कंपनी ही रिलायन्स आहे.


In [84]:
import numpy as np

# Turn the list of chunk texts into a NumPy array and persist it
# to 'my_array.npy' in the current working directory.
arr = np.asarray(chunks)
np.save('my_array.npy', arr)


In [88]:
import numpy as np

# Read the array back from 'my_array.npy'
arr = np.load('my_array.npy')

# Sanity check: report how many entries were loaded
entry_count = len(arr)
print(entry_count)


15
