## Libraries

In [1]:
from configparser import ConfigParser
from urllib.parse import quote_plus

import pandas as pd
import pymongo
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Variables

In [2]:
# Read the MongoDB Atlas and Hugging Face credentials from a local INI-style file.
file = '_credentials.conf'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of
# with a KeyError on the section lookups below.
if not config.read(file):
    raise FileNotFoundError(f"Credentials file not found or unreadable: {file}")
mongo_username = config['mongo_atlas_princesofindia']['username']
mongo_password = config['mongo_atlas_princesofindia']['password']
huggingFaceAccess_token = config['huggingFace']['token']
# Percent-escape the credentials before embedding them in the URI: characters
# such as '@', ':' or '/' would otherwise corrupt the connection string.
# NOTE(review): assumes the config file stores the raw (not already escaped)
# username and password - confirm.
mongo_uri = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    '@princesofindia.vb2f8zo.mongodb.net/?retryWrites=true&w=majority&appName=princesofindia'
)
# Models: the Gemma tokenizer is loaded with the Hugging Face access token;
# the sentence-transformer is the model used to compute embeddings.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
embedding_model = SentenceTransformer("thenlper/gte-large")
# Source CSV files (raw GitHub URLs).
states_data = "https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_states_v2.csv"
persons_data = "https://raw.githubusercontent.com/mrunal-modi/princesofindia-data/main/v2/_persons_v2.csv"

## Generate embeddings

In [3]:
# Load the persons table from the CSV at persons_data, decoding it as Latin-1.
persons_df = pd.read_csv(persons_data, encoding='latin1')

In [4]:
# Load the states table from the CSV at states_data, decoding it as Latin-1.
states_df = pd.read_csv(states_data, encoding='latin1')

In [5]:
def get_embedding(text: str) -> list[float]:
    """Return the sentence embedding of ``text`` as a plain list of floats.

    Args:
        text: The text to embed. Non-string values (for example ``None`` or
            the ``NaN`` that pandas uses for a missing cell) are treated the
            same way as empty text.

    Returns:
        The vector produced by ``embedding_model.encode`` converted with
        ``tolist()``, or ``[]`` when there is no text to embed.
    """
    # A missing CSV cell reaches this function as a float NaN, which has no
    # .strip(); check the type first so a single blank row cannot abort the
    # whole .apply() with an AttributeError.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    return embedding_model.encode(text).tolist()

In [6]:
# Run get_embedding on every value of the "bio" column and store the results
# in the "embedding" column.
# NOTE(review): cells that are blank in the CSV are presumably NaN here rather
# than "" - confirm get_embedding copes with non-string values.
persons_df["embedding"] = persons_df["bio"].apply(get_embedding)

In [7]:
# Run get_embedding on every value of the "state_description" column and store
# the results in the "embedding" column.
# NOTE(review): cells that are blank in the CSV are presumably NaN here rather
# than "" - confirm get_embedding copes with non-string values.
states_df["embedding"] = states_df["state_description"].apply(get_embedding)

## Connect to MongoDB Atlas

In [8]:
def get_mongo_client(mongo_uri):
    """Connect to MongoDB and return the client, or ``None`` on failure.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A ``pymongo.MongoClient`` whose deployment answered a ``ping``
        command, or ``None`` if a ``ConnectionFailure`` was raised.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily: constructing it does not contact the
        # server. Issue a cheap command so an unreachable deployment is
        # detected here rather than on the first real operation.
        client.admin.command("ping")
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        # Release the client's background resources before giving up.
        if client is not None:
            client.close()
        return None

In [9]:
# Connect to MongoDB and select the "princesofindia" database
mongo_client = get_mongo_client(mongo_uri)
# get_mongo_client returns None when the connection fails; stop with a clear
# error instead of a TypeError from subscripting None on the next line.
if mongo_client is None:
    raise ConnectionError("Could not connect to MongoDB; check the credentials and network access.")
db = mongo_client["princesofindia"]

Connection to MongoDB successful


## Ingest Data into MongoDB

In [10]:
persons_collection = db["persons_collection"]
# Remove every existing document first so re-running this cell replaces the
# collection's contents instead of appending to them.
persons_collection.delete_many({})
# One dict per DataFrame row, keyed by column name.
persons_documents = persons_df.to_dict(orient="records")
persons_collection.insert_many(persons_documents)
print("Persons Data ingestion into MongoDB completed")

Persons Data ingestion into MongoDB completed


In [11]:
states_collection = db["states_collection"]
# Remove every existing document first so re-running this cell replaces the
# collection's contents instead of appending to them.
states_collection.delete_many({})
# One dict per DataFrame row, keyed by column name.
states_documents = states_df.to_dict(orient="records")
states_collection.insert_many(states_documents)
print("States Data ingestion into MongoDB completed")

States Data ingestion into MongoDB completed
