## Libraries

In [1]:
# pip install datasets pandas pymongo sentence_transformers
# pip install -U transformers
# huggingface-cli login

In [9]:
import pandas as pd
import pymongo
from datasets import load_dataset
from configparser import ConfigParser
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

## Variables

In [10]:
# Secrets are read from a local config file instead of being hard-coded in the notebook.
from urllib.parse import quote_plus

file = '_credentials.conf'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files
# it actually parsed, so fail early with a clear message instead of a KeyError below.
if not config.read(file):
    raise FileNotFoundError(f"Credentials file not found or unreadable: {file}")
mongo_username = config['mongo_atlas_princesofindia']['username']
mongo_password = config['mongo_atlas_princesofindia']['password']
huggingFaceAccess_token = config['huggingFace']['token']
# Username and password must be percent-encoded before being embedded in a MongoDB URI,
# otherwise characters such as '@', ':', '/' or '%' corrupt the connection string.
# NOTE(review): this assumes the config file stores the raw (un-encoded) values — confirm.
mongo_uri = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    '@princesofindia.vb2f8zo.mongodb.net/?retryWrites=true&w=majority&appName=princesofindia'
)
# The Gemma tokenizer is fetched with the Hugging Face access token read above.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)

## Step1: Load DATA | Generate embeddings

In [11]:
# Load the persons dataset (CSV hosted on GitHub) into a pandas DataFrame.
# The file is decoded as latin-1 via the explicit `encoding` argument.
dataset_df = pd.read_csv("https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_persons_v2.csv", encoding='latin1')

In [21]:
# Display the DataFrame as the cell's output for a quick visual check.
dataset_df

Unnamed: 0,name,bio,region,locations,embedding
0,ALI-MARTABAT MUSHIRUL-MULK ALI-QADR KAZI ALI H...,"Political Member, Government Born 1891. Belong...",,"['bhopal', 'oudh', 'cambridge', 'london']","[-0.025843340903520584, -0.02923215925693512, ..."
1,NAWAB SIR AHMED HUSSAIN AMIN JUNG BAHADUR,"M.A., B.L., LL.D., K.C.I.E., C.S.I., Hyderabad...",,"['bhopal', 'madras', 'hyderabad']","[-0.03182370215654373, -0.026625320315361023, ..."
2,"AN. AINDLEY, DEWAN, KOREA STATE",Eastern States Agency. Educated at Mission Sch...,,"['korea', 'datia', 'delhi']","[-0.022180920466780663, -0.01945931650698185, ..."
3,SIR C. P. RAMASWAMY AIYAR,"B.A., B.L., K.C.I.E., Advocate, High Court, Ma...",,"['patiala', 'travancore', 'madras', 'london']","[-0.026867331936955452, 0.0015552719123661518,..."
4,ANANT RAM,"Bar-at-Law, Chief Minister, scion Baghat Stat...",,"['baghat', 'punjab', 'simla']","[-0.007894985377788544, -0.02112034521996975, ..."
...,...,...,...,...,...
1602,KHAN BAHADUR HAJI SHEIKH MOHAMMAD YUSAF KHAN,"Zamindar and Honorary Magistrate, Barh (Patna)...",U. P. & AJMER-MERWARA,['patna'],"[-0.013128438033163548, 0.006783933378756046, ..."
1603,RAI BAHADUR CHAUDHRI NARAIN SINGH,"Rais-i-Azam and Divisional Durbari, Shujabad, ...",U. P. & AJMER-MERWARA,"['karachi', 'lahore', 'simla', 'multan', 'durb...","[-0.013677764683961868, 0.0056019676849246025,..."
1604,THE HONOURABLE RAJA SIR RAGHU NANDAN PRASAD,"KT., Rajbati, Monghyr, Behar. Born 1882. Estat...",U. P. & AJMER-MERWARA,"['orissa', 'bhagalpur', 'monghyr', 'gaya', 'pu...","[-0.02250373549759388, 0.007124797906726599, -..."
1605,RAI SAHIB CHUNI LAL RASTOGI,"Rais, Landlord, Honorary The Hon Raja Sir Ragh...",U. P. & AJMER-MERWARA,"['patna', 'monghyr']","[-0.04017592966556549, 0.02038654126226902, 0...."


In [13]:
# Embedding models convert high-dimensional data such as text, audio, and images into a
# lower-dimensional numerical representation that captures the input data's semantics and context.
# This embedding representation of data can be used to conduct semantic searches
# based on the positions and proximity of embeddings to each other within a vector space.
# The embedding model used in this RAG system is a General Text Embeddings (GTE) model, based on the BERT model.
# The GTE embedding models come in three variants (small, base and large — see the model card linked below)
# and were trained and released by Alibaba DAMO Academy, a research institution.
# This notebook uses the "large" variant:
# https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer("thenlper/gte-large")

In [14]:
def get_embedding(text: str) -> list[float]:
    """Return the embedding vector for ``text`` as a plain Python list.

    Args:
        text: The text to embed. Values that are not strings (for example the
            ``NaN`` pandas uses for a missing CSV cell, or ``None``) are treated
            the same as empty text instead of raising ``AttributeError``.

    Returns:
        The embedding produced by the module-level ``embedding_model``, converted
        to a list of floats, or an empty list when there is no text to embed.
    """
    # Guard against missing values as well as blank strings: `.strip()` only exists on str.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    embedding = embedding_model.encode(text)
    # Convert the encoder output to a plain list so it can be stored and serialized.
    return embedding.tolist()

In [15]:
# Embed every biography, one row at a time, and store the vectors in a new "embedding" column.
# Rows for which get_embedding finds no text receive an empty list.
dataset_df["embedding"] = dataset_df["bio"].apply(get_embedding)

In [20]:
# Persist the DataFrame, including the embedding column, to a local CSV file.
# index=False leaves the DataFrame index out of the file.
dataset_df.to_csv('princesofindia_gte-large_embedding.csv', index=False)

## Step2: Ingest DATA | Create Vector Search

In [17]:
def get_mongo_client(mongo_uri):
    """Establish and verify a connection to MongoDB.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: A client whose connection has been verified
        with a ``ping`` command, or ``None`` if the URI is invalid, the server
        cannot be reached, or authentication fails.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily, so constructing it proves nothing.
        # A cheap "ping" forces server selection and authentication right now.
        client.admin.command("ping")
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.PyMongoError as e:
        # PyMongoError is the base class of ConnectionFailure and also covers
        # configuration and authentication errors raised while connecting.
        if client is not None:
            client.close()
        print(f"Connection failed: {e}")
        return None

In [18]:
# Connect + Create MongoDB collection
mongo_client = get_mongo_client(mongo_uri)
# get_mongo_client returns None on failure; stop here with a clear message
# rather than failing with "'NoneType' object is not subscriptable" below.
if mongo_client is None:
    raise RuntimeError("Could not connect to MongoDB; check the credentials and network access.")
db = mongo_client["princesofindia"]
collection = db["princesofindia_collection"]

# Delete any existing records in the collection so re-running the notebook
# does not ingest duplicate documents.
collection.delete_many({})

Connection to MongoDB successful


DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000129'), 'opTime': {'ts': Timestamp(1711519565, 101), 't': 297}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1711519566, 6), 'signature': {'hash': b'\xd4\xad\x1d\x91\xe8n#\xf5\x0c(8\xd5\xb0Z\n[\xe2\xb7\x0b\xcf', 'keyId': 7304382066341183489}}, 'operationTime': Timestamp(1711519565, 101)}, acknowledged=True)

In [19]:
# Ingest data into MongoDB
# Each DataFrame row becomes one dict keyed by column name ('records' orientation),
# and all of them are written to the collection in a single insert_many call.
documents = dataset_df.to_dict('records')
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [None]:
# Vector search index creation using Mongo Atlas > Atlas Search > JSON Editor
# This definition is pasted into the Atlas UI; running the cell does not create the index.
# The index must be named "vector_index", which is the name vector_search passes to $vectorSearch,
# and "path" must be the document field that holds the vectors ("embedding").
# NOTE(review): numDimensions must equal the length of the vectors returned by the
# embedding model — presumably 1024 for gte-large; confirm against the model card.

{
 "fields": [{
     "numDimensions": 1024,
     "path": "embedding",
     "similarity": "cosine",
     "type": "vector"
   }]
}

In [30]:
def vector_search(user_query, collection, limit=100, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of documents to return. Defaults to 100.
    num_candidates (int): Number of nearest-neighbour candidates considered
        during the search; should be at least ``limit``. Defaults to 150.

    Returns:
    list: A list of matching documents, each holding the ``name``, ``bio`` and
    ``region`` fields plus a ``score`` field with the vector search score.
    An empty list is returned when no embedding could be produced for the query.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals "nothing to embed" with an empty list, so test for
    # emptiness. Returning a list keeps the return type uniform for callers
    # that iterate over the results.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Maximum number of matches to return
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "name": 1,  # Include the name field
                "bio": 1,  # Include the bio field
                "region": 1,  # Include the region field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialize the cursor into a list
    results = collection.aggregate(pipeline)
    return list(results)

In [23]:
def get_search_result(query, collection):
    """Run a vector search for ``query`` and format the matches as text.

    Args:
        query (str): The user's query string.
        collection: The MongoDB collection to search.

    Returns:
        str: One newline-terminated line per match containing its name and
        bio ('N/A' is used for a missing field); an empty string if there are
        no matches.
    """
    matches = vector_search(query, collection)
    formatted_lines = [
        f"Name: {match.get('name', 'N/A')}, Plot: {match.get('bio', 'N/A')}\n"
        for match in matches
    ]
    return "".join(formatted_lines)

## Step3: Create user queries (Grounding) > Passing to Gemma (LLM) > Result

In [24]:
# A user query is defined in the next cell;
# this query is the target for semantic search against the biography embeddings in the database collection.
# The query and the vector search results are combined into a single string that is passed as the full context
# to the base model for the RAG system.

In [28]:
# Conduct query with retrieval of sources
# query = "Which Prince is the richest and from which state?"
# NOTE(review): the query string below appears to be cut off mid-word ("mentio"), and the
# captured cell output shows a different query — re-run the cell after settling on the final wording.
query = "Rank the Princes from richest to poorest in desc order. Also, mentio"
# Retrieve the matching biographies as one formatted text block.
source_information = get_search_result(query, collection)
# Prompt passed to the LLM: the query followed by the retrieved search results.
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."
print(combined_information)

Query: List and rank the Princes from richest to poorest such that we have 3 columns, Name, Rupees, State.
Continue to answer the query by using the Search Results:
Name: RAJA RAMPAL SINGH, Plot: of ITAUNJA was born in 1877. The income of his Estate is about one lakh and twentyfive thousand rupees. He pays 44 thousand rupees as land revenue. The family has helped Government at every crisis and rendered valuable services during the Great War and at other trying times.
Name: RAO BAHADUR RAOJI RAGHUNATH SHIRGAOKAR, Plot: B.A., LL.B., Diwan of Savantwadi State (retired), Shahupuri, Kolhapur, Deccan. Born 21st October, 1867. Served as Karbhari to the Kolhapur Feudatory Jagir of Sarashkar Bahadur, 1897-1901 ; Baroda State, 1901-07; Household Department, Baroda State, 1907-09 Indian Guardian to H. H. the Gaikwar's sons in England and America, 1910-12; Revenue Department, 1913-16 and 1922-23 Head of Household, and Private Secretary in Europe, 1917-21 and 1923-24 Revenue Department, 1925; Guard

In [26]:
# Load the Gemma instruction-tuned model, passing the same Hugging Face access token that is
# used for the tokenizer so the download does not depend on a separate `huggingface-cli login`.
# CPU: use the line below
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
# GPU: comment out the line above and use the one below instead
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto", token=huggingFaceAccess_token)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
# Tokenize the prompt and move the tensors to the device the model lives on, so the same
# cell works for both the CPU load and the GPU (device_map="auto") load.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens that continue the prompt.
response = model.generate(**input_ids, max_new_tokens=500)
# The decoded text contains the prompt followed by the model's continuation.
print(tokenizer.decode(response[0]))

<bos>Query: List and rank the Princes from richest to poorest such that we have 3 columns, Name, Rupees, State.
Continue to answer the query by using the Search Results:
Name: RAJA RAMPAL SINGH, Plot: of ITAUNJA was born in 1877. The income of his Estate is about one lakh and twentyfive thousand rupees. He pays 44 thousand rupees as land revenue. The family has helped Government at every crisis and rendered valuable services during the Great War and at other trying times.
Name: RAO BAHADUR RAOJI RAGHUNATH SHIRGAOKAR, Plot: B.A., LL.B., Diwan of Savantwadi State (retired), Shahupuri, Kolhapur, Deccan. Born 21st October, 1867. Served as Karbhari to the Kolhapur Feudatory Jagir of Sarashkar Bahadur, 1897-1901 ; Baroda State, 1901-07; Household Department, Baroda State, 1907-09 Indian Guardian to H. H. the Gaikwar's sons in England and America, 1910-12; Revenue Department, 1913-16 and 1922-23 Head of Household, and Private Secretary in Europe, 1917-21 and 1923-24 Revenue Department, 1925; 