In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. LOAD AND PREPARE DATA
def load_and_prepare_data(csv_file="cleaned_movies.csv"):
    movie_tags = pd.read_csv(csv_file)
    
    print(f"Loaded {len(movie_tags)} movies")
    return movie_tags

# 2. CREATE CONTENT FEATURES
def create_content_features(movie_tags):
    movie_tags['content'] = movie_tags['content'].str.strip()
    movie_tags['content'] = movie_tags['content'].replace('', 'unknown')
    return movie_tags

# 3. BUILD SIMILARITY MATRIX
def build_similarity_matrix(movie_tags):
    tfidf = TfidfVectorizer(
        stop_words='english',
        max_features=10000, 
        ngram_range=(1, 2)
    )
    
    tfidf_matrix = tfidf.fit_transform(movie_tags['content'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return tfidf, tfidf_matrix, cosine_sim

# 4. CREATE MAPPING INDICES
def create_indices(movie_tags):
    indices = pd.Series(movie_tags.index, index=movie_tags['title']).drop_duplicates()
    print(f"Created indices for {len(indices)} unique movie titles")
    return indices

# 5. RECOMMENDATION FUNCTIONS
def recommend_by_title(title, movie_tags, cosine_sim, indices, top_n=5):
    if title not in indices:
        available_titles = list(indices.index)[:10]
        return f"Movie '{title}' not found in database.\nTry one of these: {available_titles}"
    
    # Get the index of the movie
    idx = indices[title]
    
    # Get similarity scores for this movie with all others
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Convert to DataFrame for easier manipulation
    sims_df = pd.DataFrame(sim_scores, columns=['index', 'similarity'])
    
    # Remove the input movie itself
    sims_df = sims_df[sims_df['index'] != idx]
    
    # Add movie information
    sims_df = sims_df.merge(movie_tags.reset_index(), left_on='index', right_on='index')
    
    # Sort by: similarity (desc), then rating (desc), then title (asc) for consistency
    sims_df = sims_df.sort_values(
        by=['similarity', 'rating', 'title'],
        ascending=[False, False, True]
    )
    
    return sims_df[['title', 'content', 'rating', 'similarity']].head(top_n)

def recommend_by_text(user_input, movie_tags, tfidf, tfidf_matrix, top_n=5):
    if not user_input.strip():
        return "Please provide some keywords (e.g., 'action adventure', 'romantic comedy')"
    
    # Transform user input using the same TF-IDF vectorizer
    input_vector = tfidf.transform([user_input])
    
    # Calculate similarity between input and all movies
    sim_scores = cosine_similarity(input_vector, tfidf_matrix).flatten()
    
    # Create DataFrame with similarities
    sims_df = pd.DataFrame({
        'index': range(len(sim_scores)),
        'similarity': sim_scores
    })
    
    # Filter out movies with very low similarity (< 0.1)
    sims_df = sims_df[sims_df['similarity'] > 0.1]
    
    if len(sims_df) == 0:
        return f"No movies found matching '{user_input}'. Try different keywords."
    
    # Add movie information
    sims_df = sims_df.merge(movie_tags.reset_index(), left_on='index', right_on='index')
    
    # Sort by similarity (desc), then rating (desc), then title (asc)
    sims_df = sims_df.sort_values(
        by=['similarity', 'rating', 'title'],
        ascending=[False, False, True]
    )
    
    return sims_df[['title', 'content', 'rating', 'similarity']].head(top_n)

def recommend_hybrid(title=None, keywords=None, movie_tags=None, cosine_sim=None, 
                    indices=None, tfidf=None, tfidf_matrix=None, top_n=5):
    recommendations = []
    
    if title and title in indices:
        title_recs = recommend_by_title(title, movie_tags, cosine_sim, indices, top_n)
        if isinstance(title_recs, pd.DataFrame):
            title_recs['source'] = 'title_based'
            recommendations.append(title_recs)
    
    if keywords:
        keyword_recs = recommend_by_text(keywords, movie_tags, tfidf, tfidf_matrix, top_n)
        if isinstance(keyword_recs, pd.DataFrame):
            keyword_recs['source'] = 'keyword_based'
            recommendations.append(keyword_recs)
    
    if not recommendations:
        return "No recommendations found"
    
    # Combine and deduplicate
    combined = pd.concat(recommendations, ignore_index=True)
    combined = combined.drop_duplicates(subset=['title']).head(top_n)
    
    return combined

# 6. MAIN RECOMMENDATION SYSTEM CLASS
class MovieRecommendationSystem:    
    def __init__(self, csv_file="cleaned_movies.csv"):
        print("Initializing Movie Recommendation System...")
        
        # Load and prepare data
        self.movie_tags = load_and_prepare_data(csv_file)
        self.movie_tags = create_content_features(self.movie_tags)
        
        # Build similarity matrix
        self.tfidf, self.tfidf_matrix, self.cosine_sim = build_similarity_matrix(self.movie_tags)
        
        # Create indices
        self.indices = create_indices(self.movie_tags)
        
        print("Recommendation system ready!")
        print(f"Available methods: title-based, keyword-based, hybrid, search-based")
    
    def recommend_by_title(self, title, top_n=5):
        return recommend_by_title(title, self.movie_tags, self.cosine_sim, self.indices, top_n)
    
    def recommend_by_keywords(self, keywords, top_n=5):
        return recommend_by_text(keywords, self.movie_tags, self.tfidf, self.tfidf_matrix, top_n)
    
    def recommend_hybrid(self, title=None, keywords=None, top_n=5):
        return recommend_hybrid(title, keywords, self.movie_tags, self.cosine_sim, 
                              self.indices, self.tfidf, self.tfidf_matrix, top_n)
    
    def get_movie_info(self, title):
        if title not in self.indices:
            return f"Movie '{title}' not found"
        
        idx = self.indices[title]
        movie = self.movie_tags.iloc[idx]
        
        return {
            'title': movie['title'],
            'content': movie['content'],
            'rating': movie['rating'],
            'tags': movie['tag'],
        }
    
    def search_movies(self, query, max_results=10):
        matches = self.movie_tags[
            self.movie_tags['title'].str.contains(query, case=False, na=False)
        ]
        return matches[['title', 'content', 'rating']].head(max_results)

In [2]:
# Initialize once
recommender = MovieRecommendationSystem("cleaned_movies.csv")

# Method 1: By title
recommender.recommend_by_title("Toy Story (1995)")


Initializing Movie Recommendation System...
Loaded 9742 movies
Created indices for 9742 unique movie titles
Recommendation system ready!
Available methods: title-based, keyword-based, hybrid, search-based


Unnamed: 0,title,content,rating,similarity
7759,Asterix and the Vikings (Astérix et les Viking...,Adventure Animation Children Comedy Fantasy,5.0,0.54467
3567,"Monsters, Inc. (2001)",Adventure Animation Children Comedy Fantasy,3.87,0.54467
2999,"Emperor's New Groove, The (2000)",Adventure Animation Children Comedy Fantasy,3.72,0.54467
9429,Moana (2016),Adventure Animation Children Comedy Fantasy,3.45,0.54467
1705,Antz (1998),Adventure Animation Children Comedy Fantasy,3.24,0.54467


In [11]:
# Method 2: By keywords  
recommender.recommend_by_keywords("action")

Unnamed: 0,title,content,rating,similarity
244,Knock Off (1998),Action,5.0,1.0
579,"Big Bird Cage, The (1972)",Action,4.5,1.0
782,Master of the Flying Guillotine (Du bi quan wa...,Action,4.5,1.0
503,Game of Death (1978),Action,4.17,1.0
1280,13 Assassins (Jûsan-nin no shikaku) (2010),Action,4.0,1.0


In [7]:
# Method 3: Hybrid
recommender.recommend_hybrid(title="Toy story", keywords="fun")

Unnamed: 0,title,content,rating,similarity,source
0,Guardians of the Galaxy 2 (2017),Action Adventure Sci-Fi fun,3.93,0.522415,keyword_based
1,Toy Story (1995),Adventure Animation Children Comedy Fantasy fu...,3.92,0.385292,keyword_based
2,Big Hero 6 (2014),Action Animation Comedy animation feel-good fu...,3.85,0.219544,keyword_based
3,The Lego Movie (2014),Action Adventure Animation Children Comedy Fan...,3.87,0.217508,keyword_based
4,"Avengers, The (2012)",Action Adventure Sci-Fi IMAX Captain America f...,3.87,0.188793,keyword_based


In [5]:
# Method 3: Search movies
recommender.search_movies("star wars")

Unnamed: 0,title,content,rating
224,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi EPIC Nerd ROBOTS AND A...,4.23
898,Star Wars: Episode V - The Empire Strikes Back...,Action Adventure Sci-Fi George Lucas Harrison ...,4.22
911,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi darth vader luke skywa...,4.14
1979,Star Wars: Episode I - The Phantom Menace (1999),Action Adventure Sci-Fi prequel the Force,3.11
3832,Star Wars: Episode II - Attack of the Clones (...,Action Adventure Sci-Fi IMAX,3.16
5896,Star Wars: Episode III - Revenge of the Sith (...,Action Adventure Sci-Fi space space opera,3.43
6823,Star Wars: The Clone Wars (2008),Action Adventure Animation Sci-Fi,2.36
7367,Empire of Dreams: The Story of the 'Star Wars'...,Documentary,4.5
8683,Star Wars: Episode VII - The Force Awakens (2015),Action Adventure Fantasy Sci-Fi IMAX,3.85
8908,The Star Wars Holiday Special (1978),Adventure Children Comedy Sci-Fi,0.5
