In [51]:
import os
import pandas as pd
from langchain_chroma import Chroma
from langchain.schema import Document
from dotenv import load_dotenv
from groq import Groq
# Load variables from a local .env file (if any) into the process environment
load_dotenv()
groq_key = os.environ.get("GROQ_API_KEY")

# Movie catalogue used by the later cells
movies = pd.read_csv("cleaned_movies_dataset.csv")

In [5]:
# Build the documents that the vector store will embed.
# Only page_content (the overview) is embedded; title, release date and genres
# are carried as metadata. Metadata values must be simple types
# (int, str, float, bool), so the genres are stored as one comma-separated string.

# Every column that contains nothing but 0/1 values is treated as a genre flag
genre_cols = [col for col in movies.columns if movies[col].isin([0, 1]).all()]
genre_flags = movies[genre_cols].to_numpy()

# Walk the needed columns in parallel instead of iterrows(), which constructs
# a full Series object for every row.
docs = [
    Document(
        page_content=overview,
        metadata={
            "title": title,
            "release_date": date,
            "genre": ", ".join(
                genre for genre, flag in zip(genre_cols, flags) if flag == 1
            ),
        },
    )
    for title, date, overview, flags in zip(
        movies["Title"], movies["Release_Date"], movies["Overview"], genre_flags
    )
]

# Show a small preview rather than dumping every document into the cell output
docs[:3]

[Document(metadata={'title': 'Spider-Man: No Way Home', 'release_date': '2021-12-15', 'genre': 'Action, Adventure, Science Fiction'}, page_content='Peter Parker is unmasked and no longer able to separate his normal life from the high-stakes of being a super-hero. When he asks for help from Doctor Strange the stakes become even more dangerous, forcing him to discover what it truly means to be Spider-Man.'),
 Document(metadata={'title': 'The Batman', 'release_date': '2022-03-01', 'genre': 'Crime, Mystery, Thriller'}, page_content='In his second year of fighting crime, Batman uncovers corruption in Gotham City that connects to his own family while facing a serial killer known as the Riddler.'),
 Document(metadata={'title': 'No Exit', 'release_date': '2022-02-25', 'genre': 'Thriller'}, page_content='Stranded at a rest stop in the mountains during a blizzard, a recovering addict discovers a kidnapped child hidden in a car belonging to one of the people inside the building which sets her on 

In [6]:
from langchain_ollama import OllamaEmbeddings
import uuid
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Use a freshly named collection on every run; reusing a collection name was
# returning stale results from earlier runs.
unique_name = "movies_" + str(uuid.uuid4())

chroma_db_movies = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=unique_name,
)

In [100]:
# Groq client for the LLM step: it ranks the retrieved movies against the user
# query and explains why each one is relevant.
client = Groq(api_key=groq_key)

def _format_movie_list(movies_df):
    """Render the candidates as a ``N. Title - Overview`` list, numbered from 1."""
    return "".join(
        f"{position}. {row.get('Title')} - {row.get('Overview')}\n"
        for position, (_, row) in enumerate(movies_df.iterrows(), start=1)
    )


def _match_ranked_line(line, candidates):
    """Return ``(title, explanation)`` for a ``N. Title - explanation`` line, else ``None``.

    ``candidates`` must be ordered longest title first so that a title which
    itself contains " - " wins over a shorter title that is its prefix.
    """
    number, dot, rest = line.strip().partition(". ")
    if not dot or not number.isdigit():
        return None  # not a numbered list item
    rest = rest.strip()
    for title in candidates:
        # Accept the plain title as well as a markdown-bold one
        for shown in (title, f"**{title}**"):
            lead = f"{shown} - "
            if rest.startswith(lead):
                return title, rest[len(lead):].strip()
    return None


def _parse_ranked_response(response, known_titles):
    """Map each recognised title to its explanation, in the order the LLM ranked them."""
    candidates = sorted(known_titles, key=len, reverse=True)
    ranked = {}
    for line in response.splitlines():
        match = _match_ranked_line(line, candidates)
        # Unknown titles and repeated titles are ignored
        if match is not None and match[0] not in ranked:
            ranked[match[0]] = match[1]
    return ranked


def rerank_and_explain_with_llm(query, movies_df, predicted_genres):
    """Ask the LLM to rank candidate movies for ``query`` and explain each pick.

    Args:
        query: The user's free-text request.
        movies_df: Candidate movies; must have ``Title`` and ``Overview`` columns.
        predicted_genres: Genre names that are interpolated into the system prompt.

    Returns:
        pandas.DataFrame: The candidate rows the LLM ranked, in ranked order,
        with ``Title`` as the first column and an added ``Explanation`` column.
        Titles in the reply that are not among the candidates are ignored,
        candidates the reply does not mention are left out, and only the first
        row is kept for each distinct title.
    """
    if movies_df.empty:
        # Nothing to rank, so skip the API call and return the same column layout
        empty_df = movies_df.set_index("Title").reset_index()
        empty_df["Explanation"] = []
        return empty_df

    # One row per title: the reply identifies movies by title only
    candidates_df = movies_df.drop_duplicates(subset="Title")
    known_titles = [t for t in candidates_df["Title"] if isinstance(t, str)]

    # Number the list 1..n; the frame's own index is not a list position
    movie_list = _format_movie_list(candidates_df)

    prompts = [
        {
            "role": "system",
            "content": f"""
                        You are an intelligent movie assistant. A user gave a movie-related query. A list of movies and their overviews is provided. Based on the user's query and the predicted genres: {', '.join(predicted_genres)}, rank the movies from most to least relevant.

                        Your job:
                        1. Rank the movies below from most to least relevant to the user's query.
                        2. For each movie, explain briefly (1–2 sentences) **why it fits** the query.
                           - If it's not a perfect match, focus on what aspects still connect well with the user's theme — highlight **what they might enjoy** about it.
                           - Use a **friendly, conversational tone**, like you're recommending a close match to a friend who trusts your taste.
                           - Avoid robotic or formal tone. Be warm and helpful!

                        Return in this format:
                        1. <Movie Title> - <1–2 sentence explanation for why this movie matches the query>
                        2. ...
                        Only return the ranked list with explanations. Do not add commentary before or after.
            """
        },
        {
            "role": "user",
            "content": f'''
                    User's query: {query}

                    Movie list:
                    {movie_list}
            '''
        }
    ]

    chat_completion = client.chat.completions.create(
        messages=prompts,
        model="llama-3.3-70b-versatile"
    )

    response = (chat_completion.choices[0].message.content or "").strip()

    # Keep only lines that name a known candidate, so the lookup below cannot
    # raise KeyError and the explanations always line up with the selected rows.
    ranked = _parse_ranked_response(response, known_titles)
    ranked_titles = list(ranked)

    reranked_df = candidates_df.set_index("Title").loc[ranked_titles].reset_index()
    reranked_df["Explanation"] = [ranked[title] for title in ranked_titles]

    return reranked_df

In [62]:
# Let's do zero-shot classification of the query to boost the search results
from transformers import pipeline
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Candidate labels handed to the zero-shot classifier
unique_genres = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
    'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
]

def predict_genres(query, threshold=0.4):
    """Return the genres the zero-shot classifier assigns to ``query``.

    Args:
        query: Free-text description of what the user wants to watch.
        threshold: A genre is kept only when its score is strictly greater
            than this value. Defaults to 0.4, the cut-off that used to be
            hard-coded here.

    Returns:
        list[str]: The labels from ``unique_genres`` whose score cleared the
        threshold, in the order the classifier returned them.
    """
    # multi_label=True is requested so several genres can clear the threshold
    # at once (scores are presumably independent per label — see pipeline docs).
    result = classifier(query, unique_genres, multi_label=True)
    return [
        label
        for label, score in zip(result["labels"], result["scores"])
        if score > threshold
    ]

Device set to use mps:0


['Drama', 'Family', 'Romance', 'Western']

In [101]:
def retrieve_semantic_recommendations(query, top_k=10):
    """Retrieve the movies closest to ``query`` and have the LLM rank and explain them.

    Args:
        query: The user's free-text request.
        top_k: Number of nearest documents to pull from the vector store.

    Returns:
        pandas.DataFrame: Whatever ``rerank_and_explain_with_llm`` returns for
        the matched rows of ``movies``.
    """
    # Predict genres from the query; they are passed on to the reranking prompt
    predicted = predict_genres(query)
    print("Predicted ; ",predicted)

    # Vector similarity search
    results = chroma_db_movies.similarity_search(query, k=top_k)

    # Map hits back to rows by (title, release date). Matching on title alone
    # also pulls in same-titled films (e.g. remakes) that were never retrieved.
    # Both parts go through str() so a missing date compares equal on both sides.
    matched_keys = {
        (str(doc.metadata['title']), str(doc.metadata['release_date']))
        for doc in results
    }
    is_match = [
        (str(title), str(date)) in matched_keys
        for title, date in zip(movies['Title'], movies['Release_Date'])
    ]
    matched_df = movies[is_match].copy()

    # Rerank and attach an explanation for each movie via the LLM
    final_df = rerank_and_explain_with_llm(query, matched_df, predicted)

    return final_df


In [103]:
# Sample query: ask for the two nearest matches to a plot description
retrieve_semantic_recommendations(
    "A soldier returns home to find his family missing and uncovers a secret experiment in a remote village.",
    top_k=2,
)

Predicted ;  ['Adventure', 'Family', 'Mystery', 'Action', 'Science Fiction', 'Horror', 'Drama']


Unnamed: 0,Title,Release_Date,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url,Action,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,Explanation
0,Big Legend,2018-07-01,An ex-soldier ventures into the Pacific Northw...,15.966,113,6.5,en,"Horror, Action, Thriller",https://image.tmdb.org/t/p/original/9D4KB1x4r2...,1,...,1,0,0,0,0,0,1,0,0,This movie is a great fit because it's about a...
1,Dead Man's Shoes,2004-09-29,A soldier returns home to his small town and e...,13.376,401,7.2,en,"Drama, Thriller, Crime",https://image.tmdb.org/t/p/original/lI7dsNWL8U...,0,...,0,0,0,0,0,0,1,0,0,Although it doesn't involve a secret experimen...
