## Libraries

In [1]:
# pip install datasets pandas pymongo sentence_transformers
# pip install -U transformers
# huggingface-cli login

In [2]:
from configparser import ConfigParser
from urllib.parse import quote_plus

import pandas as pd
import pymongo
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Variables

In [3]:
# Read the MongoDB Atlas and Hugging Face credentials from a local INI file.
file = '_credentials.conf'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it parsed, so fail here with a clear message instead of a KeyError below.
if not config.read(file):
    raise FileNotFoundError(f"Credentials file not found or unreadable: {file}")
mongo_username = config['mongo_atlas_princesofindia']['username']
mongo_password = config['mongo_atlas_princesofindia']['password']
huggingFaceAccess_token = config['huggingFace']['token']
# Username and password must be percent-escaped inside a MongoDB URI; otherwise
# characters such as '@', ':' or '/' in a password break URI parsing.
mongo_uri = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    '@princesofindia.vb2f8zo.mongodb.net/?retryWrites=true&w=majority&appName=princesofindia'
)
# Tokenizer for the Gemma model used in Step 3; authenticated with the Hugging Face token.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)

## Step1: Load DATA | Generate embeddings

In [4]:
# Read the persons CSV straight from GitHub into a pandas DataFrame.
# The file is decoded as latin1.
dataset_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_persons_v2.csv",
    encoding="latin1",
)

In [5]:
# Embedding models map high-dimensional data (text, audio, images) to lower-dimensional
# numerical vectors that capture the semantics and context of the input.
# Semantic search then works by comparing the positions and proximity of those vectors
# within a vector space.
# This RAG system uses a General Text Embeddings (GTE) model, which is based on BERT.
# GTE models were trained and released by Alibaba DAMO Academy, a research institution,
# and come in three sizes: gte-small, gte-base and gte-large (used here).
# https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer(model_name_or_path="thenlper/gte-large")

In [6]:
def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the notebook's sentence-embedding model.

    Args:
        text: Text to embed. Missing values (``None`` or the float ``NaN`` that
            pandas uses for empty CSV cells) and any other non-string input
            are treated as empty text.

    Returns:
        The embedding as a plain list of floats, or an empty list when there
        is no text to embed.
    """
    # A NaN cell arrives as a float, which has no .strip(); check the type first
    # so one missing biography does not abort the whole DataFrame.apply().
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    embedding = embedding_model.encode(text)
    # .tolist() converts the encoder output to plain Python values that can be stored in MongoDB.
    return embedding.tolist()

In [None]:
# Embed every biography (one encoder call per row) and keep the vectors
# next to the source rows in a new "embedding" column.
bio_embeddings = dataset_df["bio"].map(get_embedding)
dataset_df["embedding"] = bio_embeddings

## Step2: Ingest DATA | Create Vector Search

In [None]:
def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the deployment is actually reachable.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: A client that has answered a ``ping``, or
        ``None`` if the server could not be reached or rejected the command
        (for example because of bad credentials).
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient() connects lazily in the background and does not raise
        # for an unreachable server; a ping forces a real round trip.
        client.admin.command("ping")
    except (pymongo.errors.ConnectionFailure, pymongo.errors.OperationFailure) as e:
        print(f"Connection failed: {e}")
        # Release the background monitor threads of the unusable client.
        if client is not None:
            client.close()
        return None
    print("Connection to MongoDB successful")
    return client

In [None]:
# Connect, then select the database and collection used for ingestion and search.
mongo_client = get_mongo_client(mongo_uri)
# get_mongo_client() returns None on failure; stop here with a clear message
# instead of an opaque TypeError on the subscript below.
if mongo_client is None:
    raise ConnectionError(
        "Could not connect to MongoDB; check the credentials file and the cluster's network access settings."
    )
db = mongo_client["princesofindia"]
collection = db["princesofindia_collection"]

In [None]:
# Delete any existing records in the collection so that re-running the ingestion
# cell below does not create duplicates. The empty filter {} matches every document.
# NOTE: this is destructive -- everything currently in the collection is removed.
collection.delete_many({})

In [None]:
# Convert each DataFrame row (including its "embedding" list) into one dict keyed
# by column name, then bulk-insert those dicts as MongoDB documents.
documents = dataset_df.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

In [None]:
# Vector search index creation using Mongo Atlas > Atlas Search > JSON Editor.
# Paste the definition below into the editor for the collection created above.
# - Name the index "vector_index": that is the name the $vectorSearch stage in vector_search() refers to.
# - "path" is the document field that holds the vectors (the "embedding" column ingested above).
# - "numDimensions" must equal the length of the stored vectors -- TODO confirm 1024 against the embedding model.

{
 "fields": [{
     "numDimensions": 1024,
     "path": "embedding",
     "similarity": "cosine",
     "type": "vector"
   }]
}

In [None]:
def vector_search(user_query, collection, limit=100, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of documents to return (default 100).
    num_candidates (int): Number of nearest-neighbour candidates the index
        considers before the top ``limit`` are selected (default 150).

    Returns:
    list: A list of matching documents (dicts with ``name``, ``bio``, ``region``
    and ``score``). Empty when the query could not be embedded.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding() signals "nothing to embed" with an empty list, not None.
    # Sending an empty queryVector to $vectorSearch would fail on the server,
    # so return "no matches" in the documented list type instead.
    if not query_embedding:
        return []

    # $vectorSearch requires numCandidates to be at least as large as limit.
    num_candidates = max(num_candidates, limit)

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return at most `limit` top matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "name": 1,  # Include the name field
                "bio": 1,  # Include the bio field
                "region": 1,  # Include the region field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor so callers get a plain list
    return list(collection.aggregate(pipeline))

In [None]:
def get_search_result(query, collection):
    """Run a vector search for ``query`` and format the hits as prompt context.

    Args:
        query (str): The user's question.
        collection: MongoDB collection passed through to ``vector_search``.

    Returns:
        str: One ``"Name: ..., Bio: ..."`` line per hit, each terminated by a
        newline; an empty string when there are no hits.
    """
    get_knowledge = vector_search(query, collection)
    # Defensive: treat a non-list result such as an error message string as
    # "no hits"; iterating a string would yield characters that have no .get().
    if isinstance(get_knowledge, str):
        return ""
    # The field holds a person's biography, so label it "Bio" (it was "Plot",
    # which mislabels the context handed to the language model).
    return "".join(
        f"Name: {result.get('name', 'N/A')}, Bio: {result.get('bio', 'N/A')}\n"
        for result in get_knowledge
    )

## Step3: Create user queries (Grounding) > Passing to Gemma (LLM) > Result

In [None]:
# The user query defined in the next cell is the target for a semantic search
# against the biography embeddings stored in the database collection.
# The query and the vector search results are combined into a single string that is passed
# as the full context to the base model of the RAG system.

In [None]:
# Retrieve the supporting biographies for the question, then assemble the grounded
# prompt: the question first, followed by the retrieved search results.
query = "Which Prince is the richest and from which state?"
source_information = get_search_result(query, collection)
combined_information = "Query: {}\nContinue to answer the query by using the Search Results:\n{}.".format(
    query, source_information
)
print(combined_information)

In [None]:
# Load the Gemma model on CPU (the default placement).
# The tokenizer load in the Variables section authenticates with the Hugging Face
# token; pass the same token here so the model download does not depend on a
# prior `huggingface-cli login`.
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
# GPU alternative: replace the line above with the one below.
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token, device_map="auto")

In [None]:
# Tokenize the grounded prompt. Despite its name, `input_ids` holds the whole
# encoding returned by the tokenizer, which is why it is unpacked with ** below.
# Move it to the model's device so generation also works when the model was
# loaded with device_map="auto" (on CPU this is a no-op).
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
response = model.generate(**input_ids, max_new_tokens=500)
# response[0] is the first (only) sequence in the batch: the prompt tokens
# followed by up to 500 newly generated tokens.
print(tokenizer.decode(response[0]))