## Libraries

In [1]:
import pandas as pd
import pymongo
from datasets import load_dataset
from configparser import ConfigParser
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

## Variables

In [2]:
file = '_credentials.conf'
config = ConfigParser()
config.read(file)
mongo_username = config['mongo_atlas_princesofindia']['username']
mongo_password = config['mongo_atlas_princesofindia']['password']
huggingFaceAccess_token = config['huggingFace']['token']
mongo_uri = f'mongodb+srv://{mongo_username}:{mongo_password}@princesofindia.vb2f8zo.mongodb.net/?retryWrites=true&w=majority&appName=princesofindia' 
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
embedding_model = SentenceTransformer("thenlper/gte-large")

## Connect to MondoDB Atlas

In [3]:
def get_mongo_client(mongo_uri):
    """Establish connection to the MongoDB."""
    try:
        client = pymongo.MongoClient(mongo_uri)
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        return None

In [4]:
# Connect + Create MongoDB collection
mongo_client = get_mongo_client(mongo_uri)
db = mongo_client["princesofindia"]
collection = db["princesofindia_collection"]

Connection to MongoDB successful


## Mongo Atlas Vector Search

In [5]:
def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    embedding = embedding_model.encode(text)
    return embedding.tolist()

In [6]:
def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)
    
    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 5,  # Number of candidate matches to consider
                "limit": 4,  # Return top 4 matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "name": 1,  # Include the name field
                "bio": 1,  # Include the bio field
                "region": 1,  # Include the region field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

In [7]:
def get_search_result(query, collection):
    get_knowledge = vector_search(query, collection)
    search_result = ""
    for result in get_knowledge:
        search_result += f"Name: {result.get('name', 'N/A')}, Plot: {result.get('bio', 'N/A')}\n"
    return search_result

## Create user queries (Grounding) > Passing to Gemma (LLM) > Result

In [8]:
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Conduct query with retrieval of sources
query = "List the names of top 3 richest Princes and their weatlh in Rupees? Also, thier DOB and State."
source_information = get_search_result(query, collection)
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."
print(combined_information)

Query: List the names of top 3 richest Princes and their weatlh in Rupees? Also, thier DOB and State.
Continue to answer the query by using the Search Results:
Name: RAJA RAMPAL SINGH, Plot: of ITAUNJA was born in 1877. The income of his Estate is about one lakh and twentyfive thousand rupees. He pays 44 thousand rupees as land revenue. The family has helped Government at every crisis and rendered valuable services during the Great War and at other trying times.
Name: THAKUR GOPAL SINGH OF BADNOR, Plot: a first-class noble of Udaipur State (Rajputana), lakh, an the estate comprising 125 villages and yielding an annual income of over a Jodha, founder of Jodhpur. His ancestor, Rao Jai Mal, emigrated to Mewar in the time of the Rajput of the Mertia clan. He is a descendant of Rao Duda, a younger son of Rao tribute payable to the Durbar being Rs. 4,124. He is a Rathore Maharana Udai Singh and is immortalised in history for his heroic fight unto death against Akbar during the siege of Chitt

In [10]:
input_ids = tokenizer(combined_information, return_tensors="pt")
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))

<bos>Query: List the names of top 3 richest Princes and their weatlh in Rupees? Also, thier DOB and State.
Continue to answer the query by using the Search Results:
Name: RAJA RAMPAL SINGH, Plot: of ITAUNJA was born in 1877. The income of his Estate is about one lakh and twentyfive thousand rupees. He pays 44 thousand rupees as land revenue. The family has helped Government at every crisis and rendered valuable services during the Great War and at other trying times.
Name: THAKUR GOPAL SINGH OF BADNOR, Plot: a first-class noble of Udaipur State (Rajputana), lakh, an the estate comprising 125 villages and yielding an annual income of over a Jodha, founder of Jodhpur. His ancestor, Rao Jai Mal, emigrated to Mewar in the time of the Rajput of the Mertia clan. He is a descendant of Rao Duda, a younger son of Rao tribute payable to the Durbar being Rs. 4,124. He is a Rathore Maharana Udai Singh and is immortalised in history for his heroic fight unto death against Akbar during the siege of 