<a href="https://colab.research.google.com/github/nebyathhailu/movie-recommendation-agent/blob/main/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install pandas transformers torch langchain langchain-community nltk chromadb tqdm

Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chr

In [None]:
# Mount Google Drive; the dataset and the Chroma store are read from paths
# under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from typing import List
from tqdm import tqdm
import re
import os
import nltk

# LangChain components
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# nltk for lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

# Transformers for embeddings and LLM
from transformers import AutoTokenizer, AutoModel, pipeline

# ChromaDB for vector storage
import chromadb
from chromadb.config import Settings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the movie dataset from the mounted Google Drive.
dataset_path = '/content/drive/MyDrive/imdb_top_1000.csv'
if not os.path.exists(dataset_path):
    # Fail fast with an actionable message instead of printing a hint and
    # then letting pd.read_csv raise a less helpful error on the next line.
    raise FileNotFoundError(
        f"Dataset not found at {dataset_path}. Please upload it."
    )

movies = pd.read_csv(dataset_path)
# String identifier per movie, derived from the DataFrame index; it is used
# later to build chunk ids and to de-duplicate retrieved chunks.
movies['movie_id'] = movies.index.astype(str)
print(f"Loaded {len(movies)} movies from the dataset.")

Loaded 1000 movies from the dataset.


In [None]:
# One WordNet lemmatizer instance, reused for every overview and query
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lower-case ``text``, tokenize it, and lemmatize each token.

    Args:
        text: The string to normalise, or a missing value.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` when
        ``pd.isna(text)`` is true.
    """
    if pd.isna(text):
        return ""
    words = word_tokenize(text.lower())
    return " ".join(map(lemmatizer.lemmatize, words))

print("\nStep 2: Preprocessing, Lemmatizing, and Chunking Overviews...")

# Fetch the punkt_tab tokenizer data before any overview is tokenized
nltk.download('punkt_tab')

# Lemmatized copy of each plot overview; this is the text that gets chunked
movies['Lemmatized_Overview'] = movies['Overview'].apply(lemmatize_text)

# Text splitter configured with chunk_size=500 and chunk_overlap=50
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Parallel lists: position k in each one describes the same chunk
chunked_texts, chunked_ids, chunked_metadatas = [], [], []

for _, movie in movies.iterrows():
    overview_text = movie['Lemmatized_Overview']
    if not overview_text:
        # No overview text for this movie, so there is nothing to chunk
        continue

    for chunk_no, piece in enumerate(text_splitter.split_text(overview_text)):
        # Human-readable summary built from the original, non-lemmatized
        # columns; it is stored with the chunk for use in explanations.
        full_meta_text = (
            f"Title: {movie['Series_Title']}\n"
            f"Director: {movie['Director']}\n"
            f"Genre: {movie['Genre']}\n"
            f"Plot: {movie['Overview']}\n"
            f"Stars: {movie['Star1']}, {movie['Star2']}\n"
            f"Year: {movie['Released_Year']}\n"
            f"Rating: {movie['IMDB_Rating']}"
        )

        chunked_texts.append(piece)
        chunked_ids.append(f"{movie['movie_id']}_chunk_{chunk_no}")
        chunked_metadatas.append({
            "movie_id": movie['movie_id'],
            "Series_Title": movie['Series_Title'],
            "Director": movie['Director'],
            "Genre": movie['Genre'],
            "IMDB_Rating": movie['IMDB_Rating'],
            "Meta_score": movie['Meta_score'],
            "No_of_Votes": movie['No_of_Votes'],
            "Released_Year": movie['Released_Year'],
            "Full_MetaText": full_meta_text,
        })

print(f"Created {len(chunked_texts)} lemmatized chunks from {len(movies)} movies.")


Step 2: Preprocessing, Lemmatizing, and Chunking Overviews...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Created 1000 lemmatized chunks from 1000 movies.


In [None]:

# --- STEP 3: EMBEDDINGS AND CHROMADB STORAGE ---

# E5 embedding wrapper exposing embed_query / embed_documents / __call__
class E5EmbeddingWrapper:
    """Text embeddings from an E5 checkpoint loaded through ``transformers``.

    By default the wrapper follows the usage described on the E5 model card:
    inputs are prefixed with ``"query: "`` or ``"passage: "``, token states
    are mean-pooled under the attention mask, and the result is L2-normalised.

    NOTE: vectors produced this way are not comparable with vectors produced
    by first-token (CLS) pooling without prefixes. A vector collection that
    was persisted with that older behaviour must either be re-embedded or be
    queried with ``E5EmbeddingWrapper(legacy_cls_pooling=True)``.

    Args:
        model_name: Hugging Face model id to load.
        legacy_cls_pooling: When true, use the first-token vector with no
            prefixes and no normalisation.
    """

    MAX_TOKENS = 512
    QUERY_PREFIX = "query: "
    PASSAGE_PREFIX = "passage: "

    def __init__(self, model_name="intfloat/multilingual-e5-small", legacy_cls_pooling=False):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.legacy_cls_pooling = legacy_cls_pooling
        print(f"Embedding model loaded on device: {self.device}")

    def _encode(self, texts: List[str]) -> List[List[float]]:
        """Embed one batch of already-prefixed texts; returns one vector per text."""
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.MAX_TOKENS,
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state

        if self.legacy_cls_pooling:
            pooled = hidden[:, 0, :]
        else:
            # Average only over real tokens: padded positions are zeroed by
            # the attention mask and excluded from the divisor.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        return pooled.cpu().tolist()

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single search query."""
        prefix = "" if self.legacy_cls_pooling else self.QUERY_PREFIX
        return self._encode([prefix + text])[0]

    def embed_documents(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Return one embedding per document, computed ``batch_size`` texts at a time.

        An empty ``texts`` list yields an empty list.
        """
        prefix = "" if self.legacy_cls_pooling else self.PASSAGE_PREFIX
        prepared = [prefix + text for text in texts]
        step = max(1, batch_size)  # guard against a zero or negative step
        vectors: List[List[float]] = []
        for start in range(0, len(prepared), step):
            vectors.extend(self._encode(prepared[start:start + step]))
        return vectors

    def __call__(self, text: str) -> List[float]:
        """Calling the wrapper directly is the same as ``embed_query``."""
        return self.embed_query(text)

# Build the embedding model used for both indexing and querying
print("\nStep 3: Initializing Embedding Model and Connecting to ChromaDB...")
embeddings = E5EmbeddingWrapper()

# Persistent Chroma collection dedicated to the lemmatized chunks
collection_name = "movie_chunks_lemmatized"
persist_directory = "/content/drive/MyDrive/chroma_db_lemmatized"

try:
    vector_store = Chroma(
        collection_name=collection_name,
        persist_directory=persist_directory,
        embedding_function=embeddings
    )
    existing_count = vector_store._collection.count()
    if existing_count:
        # Collection already holds items: reuse it instead of re-embedding
        print(f"Using existing lemmatized collection with {existing_count} items.")
    else:
        print("Collection is empty. Generating embeddings for lemmatized chunks...")
        # add_texts embeds the texts through the embedding_function given above
        vector_store.add_texts(
            texts=chunked_texts,
            metadatas=chunked_metadatas,
            ids=chunked_ids
        )
        print(f"Successfully added {len(chunked_texts)} lemmatized chunk embeddings to ChromaDB.")
except Exception as e:
    # Report the failure, then re-raise so the cell still stops with the error
    print(f"Error with ChromaDB: {e}")
    raise



Step 3: Initializing Embedding Model and Connecting to ChromaDB...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Embedding model loaded on device: cpu


  vector_store = Chroma(


Using existing lemmatized collection with 1000 items.


In [None]:

# --- STEP 4: SETUP LLM AND PROMPT ---

print("\nStep 4: Setting up LLM and Prompt Template...")
# Initialize LLM
try:
    llm_pipeline = pipeline(
        "text-generation",
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # Generation settings are passed as pipeline arguments. Supplying them
        # through `model_kwargs` sends them to model loading instead, which is
        # why `temperature` was reported as an invalid generation flag.
        do_sample=True,            # temperature only has an effect when sampling
        temperature=0.4,
        max_new_tokens=256,        # bounds the answer, independent of prompt length
        return_full_text=False,    # return only the completion, not prompt + completion
    )
    llm = HuggingFacePipeline(pipeline=llm_pipeline)
    print("TinyLlama LLM initialized.")
except Exception as e:
    # Deliberate best-effort: keep the notebook usable without the real model.
    print(f"Could not load TinyLlama LLM ({e}). Using a placeholder LLM.")
    # `LLM` is the base class whose hook is `_call`; `BaseLLM` expects
    # `_generate` instead, so a `_call`-only subclass of it is incomplete.
    from langchain_core.language_models.llms import LLM

    class MockLLM(LLM):
        """Offline stand-in that returns canned text instead of running a model."""

        def _call(self, prompt: str, stop=None, run_manager=None, **kwargs) -> str:
            # NOTE: the prompt template defined below does not contain the
            # phrase "specifically for", so with that template the generic
            # fallback at the end of this method is what gets returned.
            if "specifically for" in prompt:
                movie_title_match = re.search(r"specifically for '([^']*)'", prompt)
                movie_title = movie_title_match.group(1) if movie_title_match else "a movie"
                return (f"Explanation for '{movie_title}': This movie aligns with your query due to its compelling "
                        f"plot, acclaimed director, and strong performances by its lead actors. It shares thematic "
                        f"elements and a similar narrative style, making it a great fit for your taste.")
            return f"Mock LLM response for: {prompt[:200]}..."

        @property
        def _llm_type(self) -> str:
            return "mock_llm"

    llm = MockLLM()

# Define prompt template: {context} receives the movie's metadata text and
# {query} receives the user's request.
prompt_template = """Analyze this movie recommendation context:
{context}

Based on the user's request: "{query}", generate a personalized recommendation explaining:
1. Genre alignment
2. Director/style connections
3. Star actor relevance
4. Plot similarities
Provide a concise explanation for each point.
"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "query"]
)


Step 4: Setting up LLM and Prompt Template...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Device set to use cpu


TinyLlama LLM initialized.


  llm = HuggingFacePipeline(pipeline=llm_pipeline)


In [None]:

# --- STEP 5: ADVANCED RECOMMENDER CLASS ---

class AdvancedRecommender:
    """Two-stage movie recommender with LLM-written explanations.

    Candidates are retrieved from the vector store by similarity to the
    lemmatized query, de-duplicated per movie, re-ranked by a weighted
    rating score, and finally explained one by one by the LLM.

    Args:
        vector_store: Object providing ``similarity_search(query, k=...)`` that
            returns documents with ``metadata`` and ``page_content``.
        llm: Object providing ``invoke(prompt)``.
        prompt_template: Object providing ``format(context=..., query=...)``.
    """

    # Top of each rating's native scale, used to bring it into the 0-1 range.
    # Meta_score is assumed to use the 0-100 Metacritic scale -- TODO confirm
    # against the dataset.
    _RATING_SCALES = {'IMDB_Rating': 10.0, 'Meta_score': 100.0}

    def __init__(self, vector_store, llm, prompt_template):
        self.store = vector_store
        self.llm = llm
        self.prompt_template = prompt_template
        self.rating_weights = {'IMDB_Rating': 0.6, 'Meta_score': 0.3, 'No_of_Votes': 0.1}

    def _hybrid_score(self, movie_metadata):
        """Return the weighted sum of the normalised rating fields.

        Fields that are absent, missing (NaN/None), or not numeric are
        skipped, so they contribute nothing to the score.
        """
        import math

        score = 0.0
        for col, weight in self.rating_weights.items():
            raw = movie_metadata.get(col)
            if pd.isna(raw):
                continue
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue  # e.g. a non-numeric string stored in the metadata

            if col == 'No_of_Votes':
                # Log-damped so very popular titles do not swamp the score
                normalized_value = math.log1p(max(value, 0.0)) / 10
            else:
                # Divide by the field's own scale; dividing Meta_score by 10
                # would let it outweigh IMDB_Rating despite its lower weight.
                normalized_value = value / self._RATING_SCALES.get(col, 10.0)
            score += normalized_value * weight
        return score

    def recommend(self, query: str, top_n: int = 5):
        """Recommend up to ``top_n`` movies for ``query``.

        Returns:
            A list of dicts with keys ``title``, ``year``, ``rating`` and
            ``explanation``, ordered best first. Empty when ``top_n <= 0``.
        """
        if top_n <= 0:
            return []

        # Lemmatize the user's query before searching
        lemmatized_query = lemmatize_text(query)
        print(f"Original query: '{query}'")
        print(f"Lemmatized query: '{lemmatized_query}'")
        print("Searching for chunk candidates...")

        # Over-fetch so that enough distinct movies remain after de-duplication
        retrieved_chunks = self.store.similarity_search(lemmatized_query, k=top_n * 5)

        # Keep the first chunk seen for every movie
        unique_movies = {}
        for chunk in retrieved_chunks:
            unique_movies.setdefault(chunk.metadata['movie_id'], chunk)

        sorted_movies = sorted(
            unique_movies.values(),
            key=lambda doc: self._hybrid_score(doc.metadata),
            reverse=True
        )[:top_n]

        print(f"Found {len(retrieved_chunks)} relevant chunks, mapping to {len(unique_movies)} unique movies.")
        print(f"Re-ranked and selected top {len(sorted_movies)} movies.")

        explanations = []
        for doc in sorted_movies:
            movie_title = doc.metadata.get('Series_Title', 'Unknown Title')
            try:
                # Use the original, non-lemmatized context for the LLM explanation
                full_context = doc.metadata.get('Full_MetaText', doc.page_content)

                # Use the original query in the prompt for a natural-sounding explanation
                formatted_prompt = self.prompt_template.format(
                    context=full_context,
                    query=query
                )
                explanation = self.llm.invoke(formatted_prompt)
                # Some LLM backends return prompt + completion; keep only the
                # completion so the prompt is not shown as the explanation.
                if isinstance(explanation, str) and explanation.startswith(formatted_prompt):
                    explanation = explanation[len(formatted_prompt):].strip()
            except Exception as e:
                # Best-effort: a failed explanation must not drop the recommendation
                explanation = f"Could not generate detailed explanation: {e}"
                print(f"Warning: Failed to generate explanation for {movie_title}. Error: {e}")

            explanations.append({
                'title': movie_title,
                'year': doc.metadata.get('Released_Year', 'N/A'),
                'rating': doc.metadata.get('IMDB_Rating', 'N/A'),
                'explanation': explanation
            })

        return explanations

In [None]:
# --- FINAL INITIALIZATION AND TESTING ---
# Wire the vector store, the LLM and the prompt into one recommender
print("\nStep 5: Initializing the Final Advanced Recommender...")
recommender = AdvancedRecommender(vector_store, llm, PROMPT)
print("Recommender is ready!")

# --- Example Usage ---
print("\n--- Testing with a query that benefits from lemmatization ---")
user_query = "A movie about a group of friends who go on an adventure"
recommendations = recommender.recommend(user_query, top_n=3)

# Print a numbered list (starting at 1) with each movie's explanation
for rank, rec in enumerate(recommendations, start=1):
    headline = f"\n{rank}. {rec['title']} ({rec['year']}) - IMDB Rating: {rec['rating']}"
    print(headline)
    print(f"   Explanation: {rec['explanation']}")


Step 5: Initializing the Final Advanced Recommender...
Recommender is ready!

--- Testing with a query that benefits from lemmatization ---
Original query: 'A movie about a group of friends who go on an adventure'
Lemmatized query: 'a movie about a group of friend who go on an adventure'
Searching for chunk candidates...
Found 15 relevant chunks, mapping to 15 unique movies.
Re-ranked and selected top 3 movies.

1. 8½ (1963) - IMDB Rating: 8.0
   Explanation: Analyze this movie recommendation context:
Title: 8½
Director: Federico Fellini
Genre: Drama
Plot: A harried movie director retreats into his memories and fantasies.
Stars: Marcello Mastroianni, Anouk Aimée
Year: 1963
Rating: 8.0

Based on the user's request: "A movie about a group of friends who go on an adventure", generate a personalized recommendation explaining:
1. Genre alignment
2. Director/style connections
3. Star actor relevance
4. Plot similarities
Provide a concise explanation for each point.

Title: The Great Gatsby
