## Libraries

In [1]:
from configparser import ConfigParser
from urllib.parse import quote_plus

import pandas as pd
import pymongo
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Variables

In [21]:
# Secrets live in a local config file so they are never written into the notebook.
file = '_credentials.conf'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so fail here with a
# clear message instead of an opaque KeyError on the first section lookup below.
if not config.read(file):
    raise FileNotFoundError(f"Credentials file not found or unreadable: {file}")
mongo_username = config['mongo_atlas_princesofindia']['username']
mongo_password = config['mongo_atlas_princesofindia']['password']
huggingFaceAccess_token = config['huggingFace']['token']

# Percent-escape the credentials: reserved characters such as '@', ':' or '/'
# in a raw username/password would otherwise corrupt the connection string.
mongo_uri = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    '@princesofindia.vb2f8zo.mongodb.net/?retryWrites=true&w=majority&appName=princesofindia'
)

# Sentence-embedding model used for both the stored documents and the user queries.
embedding_model = SentenceTransformer("thenlper/gte-large")
# Pass the access token to both loads so the model and tokenizer authenticate the
# same way (the tokenizer load already supplied it; presumably the repo requires it).
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)

# Raw CSV sources for the two collections.
states_data = "https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_states_v2.csv"
persons_data = "https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_persons_v2.csv"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Field selections handed to the $project stage of the vector search pipeline.
# 0 excludes a field, 1 includes it, and "score" is populated from the
# vectorSearchScore metadata of each match.
persons_project_vars = {
    "_id": 0,
    **dict.fromkeys(("name", "bio", "region", "locations"), 1),
    "score": {"$meta": "vectorSearchScore"},
}

states_project_vars = {
    "_id": 0,
    **dict.fromkeys(("state", "state_description", "state_type", "locations"), 1),
    "score": {"$meta": "vectorSearchScore"},
}

## Generate embeddings

In [3]:
# Load the persons CSV from the URL held in `persons_data`, decoding it as Latin-1.
persons_df = pd.read_csv(persons_data, encoding='latin1')

In [4]:
# Load the states CSV from the URL held in `states_data`, decoding it as Latin-1.
states_df = pd.read_csv(states_data, encoding='latin1')

In [5]:
def get_embedding(text: str) -> list[float]:
    """
    Encode a piece of text with the notebook-level `embedding_model`.

    Args:
    text (str): The text to embed. Non-string values (for example the NaN that
        pandas produces for a missing CSV cell, or None) are treated as empty.

    Returns:
    list[float]: The embedding as a plain Python list, or an empty list when
        there is no usable text.
    """
    # Guard the type before calling .strip(): a NaN/None cell would otherwise
    # raise AttributeError and abort a whole DataFrame.apply() run.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    embedding = embedding_model.encode(text)
    # Convert to a plain list so the value can be stored in MongoDB documents.
    return embedding.tolist()

In [6]:
# Embed each row's "bio" text with get_embedding and store the result in a new
# "embedding" column; rows with blank text receive an empty list.
persons_df["embedding"] = persons_df["bio"].apply(get_embedding)

In [7]:
# Embed each row's "state_description" text with get_embedding and store the
# result in a new "embedding" column; rows with blank text receive an empty list.
states_df["embedding"] = states_df["state_description"].apply(get_embedding)

In [35]:
# Save both frames to local CSV files, omitting the DataFrame index.
# NOTE(review): the "embedding" values are Python lists, so they are presumably
# written as their text form and would need parsing when read back — confirm.
persons_df.to_csv('persons_embedding.csv', index=False)
states_df.to_csv('states_embedding.csv', index=False)

## Connect to MongoDB Atlas

In [8]:
def get_mongo_client(mongo_uri):
    """
    Establish and verify a connection to MongoDB.

    Args:
    mongo_uri (str): The MongoDB connection string.

    Returns:
    pymongo.MongoClient | None: A client that has answered a ping, or None if
        the client could not be created or the server could not be reached.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily, so constructing it proves nothing.
        # A ping forces a real round trip before success is reported.
        client.admin.command("ping")
    except pymongo.errors.PyMongoError as e:
        print(f"Connection failed: {e}")
        # Release the half-open client so its background threads do not linger.
        if client is not None:
            client.close()
        return None
    print("Connection to MongoDB successful")
    return client

In [9]:
# Connect to the cluster and get a handle to the "princesofindia" database
mongo_client = get_mongo_client(mongo_uri)
# get_mongo_client returns None on failure; stop here with a clear message
# instead of a "'NoneType' object is not subscriptable" error on the next line.
if mongo_client is None:
    raise RuntimeError("Could not connect to MongoDB; check the credentials and network access.")
db = mongo_client["princesofindia"]

Connection to MongoDB successful


## Ingest Data into MongoDB

In [10]:
# Handle to the "persons_collection" collection of the `db` database.
persons_collection = db["persons_collection"]
persons_collection.delete_many({}) # Empty filter matches every document: clear the collection so re-running this cell does not duplicate records
# One dict per DataFrame row, keyed by column name.
persons_documents = persons_df.to_dict('records')
# Insert all rows in a single call.
persons_collection.insert_many(persons_documents)
print("Persons Data ingestion into MongoDB completed")

Persons Data ingestion into MongoDB completed


In [11]:
# Handle to the "states_collection" collection of the `db` database.
states_collection = db["states_collection"]
states_collection.delete_many({}) # Empty filter matches every document: clear the collection so re-running this cell does not duplicate records
# One dict per DataFrame row, keyed by column name.
states_documents = states_df.to_dict('records')
# Insert all rows in a single call.
states_collection.insert_many(states_documents)
print("States Data ingestion into MongoDB completed")

States Data ingestion into MongoDB completed


## Create Vector Search (vector_index) for both Collections

In [12]:
# Vector search index creation using Mongo Atlas > Atlas Search > JSON Editor
# This definition is entered by hand in the Atlas UI; evaluating the cell only
# displays the dict. vector_search queries an index named "vector_index" on the
# "embedding" path, so the index must use that name and path on each collection.
# NOTE(review): numDimensions presumably equals the embedding model's output size — confirm.

{
 "fields": [{
     "numDimensions": 1024,
     "path": "embedding",
     "similarity": "cosine",
     "type": "vector"
   }]
}

{'fields': [{'numDimensions': 1024,
   'path': 'embedding',
   'similarity': 'cosine',
   'type': 'vector'}]}

In [15]:
def vector_search(user_query, collection, project_vars, limit=3, num_candidates=3):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    project_vars (dict): Dictionary specifying the fields to include/exclude.
    limit (int): Maximum number of matches to return. Defaults to 3.
    num_candidates (int): Number of nearest-neighbour candidates to consider
        before the final `limit` is applied. Defaults to 3; it is raised to
        `limit` if a smaller value is given. Atlas guidance is to use a value
        well above `limit` for better recall.

    Returns:
    list: A list of matching documents; empty when the query could not be
        embedded (for example a blank query).
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals "nothing to embed" with an empty list, never None.
    # Return an empty result rather than sending an empty vector to the server.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                # The candidate pool can never be smaller than the result size.
                "numCandidates": max(num_candidates, limit),
                "limit": limit,  # Return at most `limit` matches
            }
        },
        {
            "$project": project_vars  # Use the provided project variables
        },
    ]

    # Execute the search and materialise the cursor
    results = collection.aggregate(pipeline)
    return list(results)

In [27]:
def get_search_result(query, collection, project_vars):
    """
    Run a vector search and flatten the matches into one text block.

    Args:
    query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    project_vars (dict): Dictionary specifying the fields to include/exclude.

    Returns:
    str: One newline-terminated line per match. Matches with a 'bio' field are
        rendered as persons, matches with a 'state_description' field as
        states, and anything else is skipped.
    """
    def _render(doc):
        # Person documents take precedence over state documents.
        if 'bio' in doc:
            return f"Name: {doc.get('name', 'N/A')}, Bio: {doc.get('bio', 'N/A')}\n"
        if 'state_description' in doc:
            return f"State: {doc.get('state', 'N/A')}, Description: {doc.get('state_description', 'N/A')}\n"
        return ""

    matches = vector_search(query, collection, project_vars)
    return "".join(_render(doc) for doc in matches)

In [31]:
def conduct_query(query, collection, project_vars, model, tokenizer):
    """
    Answer a query with the language model, using vector-search hits as context.

    Args:
    query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    project_vars (dict): Dictionary specifying the fields to include/exclude.
    model: Causal language model exposing `generate`.
    tokenizer: Tokenizer matching `model`.

    Returns:
    str: The decoded output sequence up to the first "<eos>", stripped of
        surrounding whitespace. As the notebook's printed responses show, this
        text starts with "<bos>" and the prompt itself.
    """
    context = get_search_result(query, collection, project_vars)
    prompt = (
        f"Query: {query}\n"
        "Continue to answer the query by using the Search Results:\n"
        f"{context}"
    )
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(**encoded_prompt, max_new_tokens=500)
    decoded = tokenizer.decode(output_ids[0])
    # Keep only what precedes the first end-of-sequence marker.
    before_eos, _, _ = decoded.partition("<eos>")
    return before_eos.strip()

In [32]:
# Conduct query for states collection
# Re-fetch the collection handle from `db` (same collection name as in the ingestion cell).
states_collection = db["states_collection"]
states_query = "Which state is the richest in total Rupees i.e. Rs.?"
# Retrieve matching states with the states projection, then generate an answer with the model.
states_response = conduct_query(states_query, states_collection, states_project_vars, model, tokenizer)
print(states_response)

<bos>Query: Which state is the richest in total Rupees i.e. Rs.?
Continue to answer the query by using the Search Results:
State: Jodhpur, Description: THE OF JODHPUR is the largest State in Rajputana achieved in during point the of size reign and of income the late and the most outstanding George V, of which, with reference to increased from is STATE in regard to progress and prosperity the Indian Empire, this book lamented King-Emperor The State covers an area of 36,021 square miles. The of this population State is about one crore and fifty lakhs The of rupees. word "Marwar" is a popular corruption of "Maruwar" (region is sandy and 18,41,642 in 1921 to The vast territory represented by the State is generally of death), befittingly name a record. 21,25,982 in 1931. The average annual revenue known by the of Marwar. A considerable part of the country, particularly in the west, is comparatively very fertile. poor, The a veritable desert, of the people lies in their cattle. The State pro

In [34]:
# Conduct query for persons collection
# Re-fetch the collection handle from `db` (same collection name as in the ingestion cell).
persons_collection = db["persons_collection"]
persons_query = "Which person is the richest in total Rupees i.e. Rs.?"
# Retrieve matching persons with the persons projection, then generate an answer with the model.
persons_response = conduct_query(persons_query, persons_collection, persons_project_vars, model, tokenizer)
print(persons_response)

<bos>Query: Which person is the richest in total Rupees i.e. Rs.?
Continue to answer the query by using the Search Results:
Name: LAXMI NARAIN, Bio: Rais and Banker, Jaunpur, U. P. Born 1905. Leading business magnate of Jaunpur. Contributes good. On 1937, endeavoured successfully somely 1st April, to all official public handfunds raised for to make Hartal a failure at Jaunpur, winning official appreciation. Constructing a park named after Mr. Kidwai, Collector, Jaunpur, entirely from his own pocket. Laxmi Narain,
Name: JOHARMAL JALAN, Bio: of Karanibad, Deoghar, Santhal Pargannas, was born on the 4th September, 1862, at Mukundgarh in Rajputana. He came to Calcutta in 1877 at the age of fifteen and entered business. By his remarkable intelligence, energy and perseverance he prospered marvellously in the line and attained a position of eminence among the merchants of Calcutta. All through his pretty long career as an outstanding businessman and citizen he was universally respected for hi