In [12]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('imdb-movies-dataset.csv')
df.head()

Unnamed: 0,Poster,Title,Year,Certificate,Duration (min),Genre,Rating,Metascore,Director,Cast,Votes,Description,Review Count,Review Title,Review
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023.0,R,115.0,"Comedy, Drama, Romance",6.4,67.0,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023.0,PG-13,145.0,"Action, Adventure, Sci-Fi",7.3,66.0,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023.0,PG-13,97.0,"Biography, Comedy, History",5.5,42.0,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",333,not funny,Pretty much the worst criticism you can lay on...
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023.0,PG-13,126.0,"Action, Comedy, Drama",7.3,73.0,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023.0,R,131.0,"Drama, Romance, Sport",7.7,82.0,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...",194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...


In [13]:
# Columns we will use
df = df[['Title', 'Genre', 'Director', 'Cast', 'Description', 'Rating', 'Votes', 'Review', 'Review Title']]

# Drop rows with any missing values in the critical columns
df.dropna(subset=['Title', 'Genre', 'Director', 'Cast', 'Description', 'Rating', 'Review', 'Review Title'], inplace=True)

# Clean and convert the 'Votes' column to a numeric type
df['Votes'] = df['Votes'].str.replace(',', '', regex=True).astype(int)

# Combine all relevant text features into a single column for embedding
# This now includes the review content and title for more descriptive embeddings.
df['text_for_embedding'] = df['Title'] + " " + df['Genre'] + " " + df['Director'] + " " + df['Cast'] + " " + df['Description'] + " " + df['Review Title'] + " " + df['Review']

# Display the cleaned data information
print("Cleaned DataFrame Info:")
df.info()

# Display a sample of the cleaned data
print("\nSample of Cleaned Data:")

df.head()

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 9478 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               9478 non-null   object 
 1   Genre               9478 non-null   object 
 2   Director            9478 non-null   object 
 3   Cast                9478 non-null   object 
 4   Description         9478 non-null   object 
 5   Rating              9478 non-null   float64
 6   Votes               9478 non-null   int64  
 7   Review              9478 non-null   object 
 8   Review Title        9478 non-null   object 
 9   text_for_embedding  9478 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 814.5+ KB

Sample of Cleaned Data:


Unnamed: 0,Title,Genre,Director,Cast,Description,Rating,Votes,Review,Review Title,text_for_embedding
0,The Idea of You,"Comedy, Drama, Romance",Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...","Solène, a 40-year-old single mom, begins an un...",6.4,28744,"This film, as well as the reaction to it, is a...",Hypocrisy as an idea,"The Idea of You Comedy, Drama, Romance Michael..."
1,Kingdom of the Planet of the Apes,"Action, Adventure, Sci-Fi",Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...","Many years after the reign of Caesar, a young ...",7.3,22248,"I'm a big fan of all the planet of the apes, a...",A phenomenal start to another trilogy!,"Kingdom of the Planet of the Apes Action, Adve..."
2,Unfrosted,"Biography, Comedy, History",Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...","In 1963 Michigan, business rivals Kellogg's an...",5.5,18401,Pretty much the worst criticism you can lay on...,not funny,"Unfrosted Biography, Comedy, History Jerry Sei..."
3,The Fall Guy,"Action, Comedy, Drama",David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",A down-and-out stuntman must find the missing ...,7.3,38953,Just got out of the Austin premier at SXSW and...,Everything you needed and more!,"The Fall Guy Action, Comedy, Drama David Leitc..."
4,Challengers,"Drama, Romance, Sport",Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...","Tashi, a former tennis prodigy turned coach, t...",7.7,32517,This is a tough one. I liked the concept and t...,"Watch ""Match Point"" instead","Challengers Drama, Romance, Sport Luca Guadagn..."


In [4]:
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings


In [5]:
class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: Documents) -> Embeddings:
        # Encode input texts and convert to list
        return self.model.encode(input).tolist()


In [2]:
from chromadb.config import Settings
import chromadb

client = chromadb.PersistentClient(path="./chromadb_persist", settings=Settings())



In [6]:
# Custom embedding function initialization
embedding_function = MyEmbeddingFunction('all-MiniLM-L6-v2')


In [7]:
collection_name = "movies_collection"
if collection_name in [c.name for c in client.list_collections()]:
    client.delete_collection(name=collection_name)

collection = client.create_collection(
    name=collection_name,
    embedding_function=embedding_function
)


In [9]:
# Prepare data
ids = [str(i) for i in df.index]
documents = df['text_for_embedding'].tolist()
metadatas = df.drop(columns=['text_for_embedding']).to_dict('records')

batch_size = 5000  # Less than max allowed batch size (e.g., 5461)

print(f"Adding {len(documents)} documents to ChromaDB in batches...")

for i in range(0, len(documents), batch_size):
    batch_ids = ids[i:i + batch_size]
    batch_documents = documents[i:i + batch_size]
    batch_metadatas = metadatas[i:i + batch_size]

    print(f"Adding batch {i // batch_size + 1} with {len(batch_documents)} documents...")
    collection.add(
        ids=batch_ids,
        documents=batch_documents,
        metadatas=batch_metadatas
    )

print("All data successfully added to ChromaDB.")


Adding 9478 documents to ChromaDB in batches...
Adding batch 1 with 5000 documents...
Adding batch 2 with 4478 documents...
All data successfully added to ChromaDB.


In [8]:
import google.generativeai as genai


genai.configure(api_key="AIzaSyDI5bjOd6RVyJDlP84-H2uJfxK9IkpwEYo") 

def get_rag_recommendations(user_query, n_results=3):
    """
    Performs the full RAG process: Retrieval + Augmented Generation.
    """
    # 1. RETRIEVAL: Query ChromaDB to get the most relevant movie data.
    # The 'collection' object is assumed to be already created from a previous cell.
    results = collection.query(
        query_texts=[user_query],
        n_results=n_results,
    )
    
    # 2. AUGMENTATION: Prepare the context for the LLM.
    context = ""
    for movie in results['metadatas'][0]:
        title = movie.get('Title', 'N/A')
        description = movie.get('Description', 'N/A')
        rating = movie.get('Rating', 'N/A')
        review = movie.get('Review', 'N/A')
        
        context += f"Title: {title}\n"
        context += f"Description: {description}\n"
        context += f"Rating: {rating}\n"
        context += f"Review: {review}\n\n"
    
    # The full prompt sent to the LLM
    llm_prompt = f"""
    Based on the following movie information, please provide a helpful and friendly recommendation for the user.
    
    User Query: {user_query}
    
    Movie Data:
    {context}
    
    Recommendation:
    """
    
    # 3. GENERATION: Send the augmented prompt to the Gemini LLM.
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    response = model.generate_content(llm_prompt)
    llm_output = response.text
    
    return llm_output




In [9]:
#  Define the user's query
user_query = "a love story where they separate"
#  Call the function and get the recommendation
recommendation = get_rag_recommendations(user_query)
print(recommendation)




Please provide the movie data.  I need information about movies to give a recommendation based on the user's query for a love story where the couple separates.  I need titles, brief descriptions, or any other relevant information about movies you have in your database.



Basic Direct Prompting (Zero-Shot)

In [10]:
def get_rag_recommendations_basic(user_query, n_results=3):
    results = collection.query(query_texts=[user_query], n_results=n_results)
    context = ""
    for movie in results['metadatas'][0]:
        context += f"Title: {movie.get('Title', 'N/A')}\nDescription: {movie.get('Description', 'N/A')}\n\n"

    prompt = f"""
    Recommend movies based on the following information:
    {context}
    User query: {user_query}
    Recommendation:
    """

    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    response = model.generate_content(prompt)
    return response.text


In [11]:
user_query = input("Enter your movie query for Basic Prompting:  ")
print("Output:")
print(get_rag_recommendations_basic(user_query))
 

Output:
To give good recommendations, I need more information about the user's preferences.  A movie "centered around cancer" is very broad.  What aspects of cancer are they interested in?  Do they want a comedy, drama, documentary, thriller, or something else?  What kind of tone are they looking for?  Hopeful? Somber?  Realistic?  

Here are some recommendations based on different possible interpretations of the user query:

**If they want a realistic and emotionally resonant drama:**

* **My Sister's Keeper:** Focuses on a family dealing with a child's leukemia and the ethical dilemmas surrounding organ donation.
* **Manchester by the Sea:** While not solely about cancer, the character's grief and coping mechanisms are deeply affected by a significant loss, and this grief is a major plot driver.
* **The Fault in Our Stars:** A young adult romance dealing with the realities of cancer and finding love amidst difficult circumstances.  (Note: this is more optimistic than some other optio

Chain of Thought Prompting

In [38]:
def get_rag_recommendations_chain_of_thought(user_query, n_results=3):
    results = collection.query(query_texts=[user_query], n_results=n_results)
    context = ""
    for movie in results['metadatas'][0]:
        context += f"Title: {movie.get('Title', 'N/A')}, Description: {movie.get('Description', 'N/A')}\n"

    prompt = f"""
    Consider each movie carefully before making your recommendation.
    Here are the movies relevant to the user's query:

    {context}

    I'll think step by step.
    User query: {user_query}
    Step-by-step reasoning:
    """

    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    response = model.generate_content(prompt)
    return response.text


In [39]:
user_query = input("Enter your movie query for Basic Prompting:  ")
print("Output:")
print(get_rag_recommendations_chain_of_thought(user_query))

Output:
Step-by-step reasoning for recommending movies about cancer, given no specific movie titles:

1. **Clarify the User's Query:** The query "Movies about cancer" is broad.  To give better recommendations, I need to understand what aspects of cancer the user is interested in.  Is it:

    * **The medical aspects?**  (e.g., documentaries about treatments, research, etc.)
    * **The emotional impact on patients and families?** (e.g., dramas focusing on coping mechanisms, relationships, etc.)
    * **Specific types of cancer?** (e.g., leukemia, breast cancer, etc.)
    * **A particular age group affected?** (e.g., children with cancer, adults facing a diagnosis, etc.)
    * **A specific tone?** (e.g., uplifting stories of survival, gritty realistic portrayals, comedic approaches to coping, etc.)

2. **Gather Relevant Information (if possible):** If I have access to a movie database, I can search for films containing keywords related to cancer,  checking descriptions for plot summarie

Few-Shot Prompting with Examples

In [None]:
def get_rag_recommendations_few_shot(user_query, n_results=3):
    example_prompt = """
    Example 1:
    User query: "Recommend romantic comedy movies."
    Recommendation: "I suggest 'Crazy Rich Asians' because it is a fun romantic comedy..."

    Example 2:
    User query: "Suggest action thriller movies."
    Recommendation: "'John Wick' is a great action thriller with intense sequences..."

    Now, based on the movies below and the user's query, provide a recommendation.

    """

    results = collection.query(query_texts=[user_query], n_results=n_results)
    context = ""
    for movie in results['metadatas'][0]:
        context += f"Title: {movie.get('Title', 'N/A')}\nDescription: {movie.get('Description', 'N/A')}\n"

    full_prompt = f"""
    {example_prompt}

    Movies:
    {context}

    User query: {user_query}
    Recommendation:
    """

    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    response = model.generate_content(full_prompt)
    return response.text


In [None]:
user_query = input("Enter your movie query for Basic Prompting:  ")
print("Output:")
print(get_rag_recommendations_few_shot(user_query))

Output:
Recommendation:  "I would recommend 'Malignant'.  While I don't want to spoil the plot, it's a horror movie with a significant and unexpected twist."



 Instruction + Formatting Constraints

In [None]:
def get_rag_recommendations_instructional(user_query, n_results=3):
    results = collection.query(query_texts=[user_query], n_results=n_results)
    context = ""
    for movie in results['metadatas'][0]:
        context += f"- Title: {movie.get('Title', 'N/A')}\n  Description: {movie.get('Description', 'N/A')}\n"

    prompt = f"""
    You are a helpful, concise movie recommender.
    Based on the following movie data, provide 3 clear recommendations for the user in bullet points.

    Movie data:
    {context}

    User query: {user_query}

    Please format the recommendations as:
    1. Movie title - brief reason.
    2. ...
    3. ...
    """

    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    response = model.generate_content(prompt)
    return response.text


In [None]:
user_query = input("Enter your movie query for Basic Prompting:  ")
print("Output:")
print(get_rag_recommendations_instructional(user_query))

Output:
* 1. 50/50 -  A comedic drama based on a true story about a man's battle with cancer, touching on themes of friendship and facing life's challenges.
* 2. Candy - Explores a destructive but intensely passionate love story intertwined with addiction, mirroring the life-or-death struggle found in facing cancer.
* 3. The English Patient - While not directly about cancer, the film depicts a powerful and enduring love story amidst the backdrop of war and hardship, echoing the perseverance theme found in cancer narratives.

