## Pre-requisites

In [None]:
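# Sign up for Hugging Face and create an access token (used below to download the Gemma tokenizer)
# Create a MongoDB Atlas project called RAG and note the connection URI / credentials
# Create an Atlas vector search index called "vector_index" on the "movie_collection" collection of the "movies" database (index definition is given in Step 2)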

In [None]:
# pip install datasets pandas pymongo sentence_transformers
# pip install -U transformers
# huggingface-cli login

## Libraries

In [1]:
from configparser import ConfigParser
from urllib.parse import quote_plus

import pandas as pd
import pymongo
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Variables

In [2]:
# Load secrets from a local config file instead of hardcoding them in the notebook.
file = '_credentials.conf'
config = ConfigParser()
# ConfigParser.read() silently skips missing files and returns the list of files it parsed,
# so fail early with a clear message rather than a KeyError on the section lookups below.
if not config.read(file):
    raise FileNotFoundError(f"Credentials file not found or unreadable: {file}")
mongo_username = config['mongo_atlas']['username']
mongo_password = config['mongo_atlas']['password']
huggingFaceAccess_token = config['huggingFace']['token']
# Percent-escape the credentials: characters such as '@', ':' or '/' in a raw
# username/password would otherwise produce an invalid connection string.
mongo_uri = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    '@movies.xm5qyjz.mongodb.net/?retryWrites=true&w=majority&appName=movies'
)
# Tokenizer for the Gemma model used in Step 3; the access token authenticates the download.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)

## Step1: Load DATA | Generate embeddings

In [3]:
# Fetch the movie dataset from the Hugging Face hub
# https://huggingface.co/datasets/MongoDB/embedded_movies
dataset = load_dataset("MongoDB/embedded_movies")

# Build the working DataFrame from the "train" split in one chain
dataset_df = (
    pd.DataFrame(dataset["train"])
    # Rows with a missing "fullplot" have no text to embed, so drop them
    .dropna(subset=["fullplot"])
    # Discard the embeddings shipped with the dataset; new ones are generated below
    # with an open-source embedding model from Hugging Face: gte-large
    .drop(columns=["plot_embedding"])
)

In [4]:
# Embedding models convert high-dimensional data such as text, audio, and images into a
# lower-dimensional numerical representation that captures the input data's semantics and context.
# This embedding representation of data can be used to conduct semantic searches
# based on the positions and proximity of embeddings to each other within a vector space.
# The embedding model used in this RAG system is the General Text Embeddings (GTE) model, based on the BERT model.
# The GTE embedding models come in three variants (presumably gte-large, gte-base and gte-small -- verify on the model card)
# and were trained and released by Alibaba DAMO Academy, a research institution.
# gte-large is the variant loaded here; the vector index defined in Step 2 uses 1024 dimensions for its embeddings.
# https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer("thenlper/gte-large")

In [5]:
def get_embedding(text: str) -> list[float]:
    """Encode ``text`` into an embedding vector with the global ``embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a plain Python list of floats, or an empty list when
        ``text`` is missing (``None``/NaN/non-string) or contains only whitespace.
    """
    # Missing DataFrame cells arrive as None or float NaN; treat them like empty
    # text instead of crashing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    embedding = embedding_model.encode(text)
    # Convert to a plain list so the vector can be stored as a MongoDB document field.
    return embedding.tolist()

In [6]:
# Embed every movie's full plot, one row at a time, and store the vectors in a new "embedding" column
dataset_df["embedding"] = dataset_df["fullplot"].map(get_embedding)

In [7]:
# Display the DataFrame to confirm the new "embedding" column is present
dataset_df

Unnamed: 0,type,metacritic,awards,cast,fullplot,plot,languages,imdb,title,countries,num_mflix_comments,genres,rated,poster,directors,runtime,writers,embedding
0,movie,,"{'nominations': 0, 'text': '1 win.', 'wins': 1}","[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",Young Pauline is left a lot of money when her ...,Young Pauline is left a lot of money when her ...,[English],"{'id': 4465, 'rating': 7.6, 'votes': 744}",The Perils of Pauline,[USA],0,[Action],,https://m.media-amazon.com/images/M/MV5BMzgxOD...,"[Louis J. Gasnier, Donald MacKenzie]",199.0,"[Charles W. Goddard (screenplay), Basil Dickey...","[-0.009285857900977135, -0.005062089767307043,..."
1,movie,,"{'nominations': 1, 'text': '1 nomination.', 'w...","[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",As a penniless man worries about how he will m...,A penniless young man tries to save an heiress...,[English],"{'id': 10146, 'rating': 7.0, 'votes': 639}",From Hand to Mouth,[USA],0,"[Comedy, Short, Action]",TV-G,https://m.media-amazon.com/images/M/MV5BNzE1OW...,"[Alfred J. Goulding, Hal Roach]",22.0,[H.M. Walker (titles)],"[-0.002439422532916069, 0.023095911368727684, ..."
2,movie,,"{'nominations': 0, 'text': '1 win.', 'wins': 1}","[Ronald Colman, Neil Hamilton, Ralph Forbes, A...","Michael ""Beau"" Geste leaves England in disgrac...","Michael ""Beau"" Geste leaves England in disgrac...",[English],"{'id': 16634, 'rating': 6.9, 'votes': 222}",Beau Geste,[USA],0,"[Action, Adventure, Drama]",,,[Herbert Brenon],101.0,"[Herbert Brenon (adaptation), John Russell (ad...","[0.012204288505017757, -0.01145576499402523, -..."
3,movie,,"{'nominations': 0, 'text': '1 win.', 'wins': 1}","[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",A nobleman vows to avenge the death of his fat...,"Seeking revenge, an athletic young man joins t...",,"{'id': 16654, 'rating': 7.2, 'votes': 1146}",The Black Pirate,[USA],1,"[Adventure, Action]",,https://m.media-amazon.com/images/M/MV5BMzU0ND...,[Albert Parker],88.0,"[Douglas Fairbanks (story), Jack Cunningham (a...","[0.004541351459920406, -0.0006100620375946164,..."
4,movie,,"{'nominations': 1, 'text': '1 nomination.', 'w...","[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...","The Uptown Boy, J. Harold Manners (Lloyd) is a...",An irresponsible young millionaire changes his...,[English],"{'id': 16895, 'rating': 7.6, 'votes': 918}",For Heaven's Sake,[USA],0,"[Action, Comedy, Romance]",PASSED,https://m.media-amazon.com/images/M/MV5BMTcxMT...,[Sam Taylor],58.0,"[Ted Wilde (story), John Grey (story), Clyde B...","[-0.0022256155498325825, 0.011567802168428898,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,movie,22.0,"{'nominations': 3, 'text': '1 win & 3 nominati...","[Zack Ward, Dave Foley, Chris Coppola, Jackie ...",The story begins with a regular Joe who tries ...,"In the ironically named city of Paradise, a re...",[English],"{'id': 486640, 'rating': 4.4, 'votes': 19641}",Postal,"[USA, Canada, Germany]",0,"[Action, Comedy, Thriller]",R,https://m.media-amazon.com/images/M/MV5BMTIzMD...,[Uwe Boll],100.0,"[Uwe Boll, Bryan C. Knight]","[0.003967737313359976, -0.010676096193492413, ..."
1496,movie,27.0,"{'nominations': 3, 'text': '3 nominations.', '...","[Tim Allen, John Travolta, Martin Lawrence, Wi...",Four middle-aged men decide to take a road tri...,A group of suburban biker wannabes looking for...,[English],"{'id': 486946, 'rating': 5.9, 'votes': 94699}",Wild Hogs,[USA],0,"[Action, Adventure, Comedy]",PG-13,https://m.media-amazon.com/images/M/MV5BZWZlMz...,[Walt Becker],100.0,[Brad Copeland],"[0.004870419390499592, 0.0033226131927222013, ..."
1497,movie,,"{'nominations': 13, 'text': '14 wins & 13 nomi...","[Ajay Devgn, Kareena Kapoor, Saif Ali Khan, Ko...",Advocate Raghunath Mishra has arranged the mar...,"Shakespeare's masterpiece ""Othello"" set in mod...",[Hindi],"{'id': 488414, 'rating': 8.2, 'votes': 9800}",Omkara,[India],1,"[Action, Crime, Drama]",,https://m.media-amazon.com/images/M/MV5BY2NmNj...,[Vishal Bhardwaj],155.0,"[Vishal Bhardwaj (screenplay), Robin Bhatt (sc...","[0.0016241184202954173, -0.012888934463262558,..."
1498,movie,,"{'nominations': 1, 'text': '1 nomination.', 'w...","[Mena Suvari, Nick Cannon, Michael Welch, Anna...","In Leadville, Colorado, Captain Rhodes and his...",When a small Colorado town is overrun by the f...,[English],"{'id': 489018, 'rating': 4.5, 'votes': 17177}",Day of the Dead,[USA],1,"[Action, Horror]",R,https://m.media-amazon.com/images/M/MV5BNzg1Mj...,[Steve Miner],86.0,"[Jeffrey Reddick (screenplay), George A. Romer...","[0.005159149877727032, -0.007672071922570467, ..."


## Step2: Ingest DATA | Create Vector Search

In [8]:
def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the deployment is reachable.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: A client whose connection has been verified,
        or None when the server cannot be reached. Errors other than
        ``ConnectionFailure`` propagate to the caller.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient() connects lazily and does not raise if the server is
        # unreachable; a cheap "ping" forces a round trip so that the success
        # message below is only printed for a working connection.
        client.admin.command("ping")
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        # Release the background resources of the unusable client.
        if client is not None:
            client.close()
        return None

In [9]:
# Connect to MongoDB and select the database / collection used by the rest of the notebook
mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client returns None on failure; stop here with a clear message
    # instead of an obscure "'NoneType' object is not subscriptable" on the next line.
    raise RuntimeError("Could not connect to MongoDB; check the URI and credentials.")
db = mongo_client["movies"]
collection = db["movie_collection"]

Connection to MongoDB successful


In [10]:
# Delete any existing records in the collection so the ingestion below starts from an empty collection.
# WARNING: the empty filter {} matches every document, so this wipes the whole collection.
collection.delete_many({})

DeleteResult({'n': 1452, 'electionId': ObjectId('7fffffff00000000000000a6'), 'opTime': {'ts': Timestamp(1711540349, 98), 't': 166}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1711540349, 99), 'signature': {'hash': b'\x16v\xec\xc5\xb4\x8f(HR\xdb\xa1y\xd7\xfd\x1c\xb5\xb1YQ\x83', 'keyId': 7294719201674002433}}, 'operationTime': Timestamp(1711540349, 98)}, acknowledged=True)

In [11]:
# Turn each DataFrame row into a dict (one document per movie, embedding included)
documents = dataset_df.to_dict(orient="records")

# Bulk-insert all documents into the MongoDB collection in a single call
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [12]:
# Vector search index creation using Mongo Atlas > Atlas Search > JSON Editor

# {
#  "fields": [{
#      "numDimensions": 1024,
#      "path": "embedding",
#      "similarity": "cosine",
#      "type": "vector"
#    }]
# }

In [13]:
def vector_search(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches the index considers
        (should be at least ``limit``). Defaults to 150.
    limit (int): Maximum number of documents to return. Defaults to 4.

    Returns:
    list: A list of matching documents (each with ``fullplot``, ``title``,
    ``genres`` and ``score``); empty when the query cannot be embedded.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals an empty/invalid query with an empty list (not None).
    # Return an empty result list so callers can always iterate over the result,
    # and so an empty vector is never sent to $vectorSearch.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return the top `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "fullplot": 1,  # Include the full plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialize the cursor
    results = collection.aggregate(pipeline)
    return list(results)

In [14]:
def get_search_result(query, collection):
    """Run a vector search for ``query`` and format the hits as context text.

    Each hit contributes one line of the form ``Title: <title>, Plot: <fullplot>``;
    a missing field is rendered as ``N/A``. Returns an empty string when there
    are no hits.
    """
    matches = vector_search(query, collection)
    return "".join(
        f"Title: {match.get('title', 'N/A')}, Plot: {match.get('fullplot', 'N/A')}\n"
        for match in matches
    )

## Step3: Create user queries (Grounding) > Passing to Gemma (LLM) > Result

In [15]:
# A user query is defined in the next cell;
# this query is the target for semantic search against the movie embeddings in the database collection.
# The query and the vector search results are combined into a single string that is passed as the full context
# to the base model of the RAG system.

In [16]:
# Retrieve the most relevant movies for the question and assemble the prompt for the LLM
query = "What is the best romantic movie to watch and why?"
source_information = get_search_result(query, collection)

# Prompt layout: the question, an instruction line, then the retrieved titles and plots
combined_information = (
    f"Query: {query}\n"
    "Continue to answer the query by using the Search Results:\n"
    f"{source_information}."
)
print(combined_information)

Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Titanic, Plot: The plot focuses on the romances of two couples upon the doomed ship's maiden voyage. Isabella Paradine (Catherine Zeta-Jones) is a wealthy woman mourning the loss of her aunt, who reignites a romance with former flame Wynn Park (Peter Gallagher). Meanwhile, a charming ne'er-do-well named Jamie Perse (Mike Doyle) steals a ticket for the ship, and falls for a sweet innocent Irish girl on board. But their romance is threatened by the villainous Simon Doonan (Tim Curry), who has discovered about the ticket and makes Jamie his unwilling accomplice, as well as having sinister plans for the girl.
Title: Miracle Mile, Plot: A young man meets and falls in love with a young woman at the La Brea Tar Pits in Los Angeles. This area is known as Miracle Mile, and the whole movie takes place there. They make a date, which he misses, and while he is searching for her

In [17]:
# Load the Gemma 2B instruction-tuned model. The access token is passed explicitly,
# as for the tokenizer, so the download does not depend on a prior `huggingface-cli login`.
# CPU (default):
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token)
# GPU: comment out the line above and use the line below instead
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=huggingFaceAccess_token, device_map="auto")

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Tokenize the prompt and move the tensors to the device the model lives on.
# This is a no-op on CPU and is required when the model was loaded with device_map="auto" on a GPU.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens after the prompt
response = model.generate(**input_ids, max_new_tokens=500)
# The decoded text contains the prompt followed by the model's continuation
print(tokenizer.decode(response[0]))

<bos>Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Titanic, Plot: The plot focuses on the romances of two couples upon the doomed ship's maiden voyage. Isabella Paradine (Catherine Zeta-Jones) is a wealthy woman mourning the loss of her aunt, who reignites a romance with former flame Wynn Park (Peter Gallagher). Meanwhile, a charming ne'er-do-well named Jamie Perse (Mike Doyle) steals a ticket for the ship, and falls for a sweet innocent Irish girl on board. But their romance is threatened by the villainous Simon Doonan (Tim Curry), who has discovered about the ticket and makes Jamie his unwilling accomplice, as well as having sinister plans for the girl.
Title: Miracle Mile, Plot: A young man meets and falls in love with a young woman at the La Brea Tar Pits in Los Angeles. This area is known as Miracle Mile, and the whole movie takes place there. They make a date, which he misses, and while he is searching fo