In [41]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [42]:
# Load data
movies = pd.read_csv('../data/movies.csv')
tags = pd.read_csv('../data/tags.csv')

# 1. Clean tags: drop rows whose tag is missing BEFORE casting to str --
#    astype(str) would otherwise turn NaN into the literal word "nan" and
#    leak it into the per-movie tag string. Then lowercase what remains.
tags = tags.dropna(subset=['tag']).assign(
    tag=lambda df: df['tag'].astype(str).str.lower()
)

# Join all tags for a movie into one space-separated string (one row per movieId)
movie_tags = tags.groupby('movieId')['tag'].agg(' '.join).reset_index()



In [43]:
# 2. Merge tags with movies
# Drop any 'tag' column left over from a previous run of this cell first;
# otherwise the merge would emit tag_x / tag_y and the fillna below would
# raise KeyError. This keeps the cell safe to re-run.
movies = pd.merge(
    movies.drop(columns=['tag'], errors='ignore'),
    movie_tags,
    on='movieId',
    how='left',
)
movies['tag'] = movies['tag'].fillna('')  # Handle movies with no tags



In [44]:
# 3. Create the "Ultimate Soup"
# We combine Title + Genres + User Tags into one lowercase string per movie.
# regex=False makes '|' a literal separator regardless of the pandas
# version's default for str.replace; fillna('') stops a missing title or
# genre from turning the whole soup into NaN.
genres_text = movies['genres'].fillna('').str.replace('|', ' ', regex=False)
movies['metadata'] = (
    movies['title'].fillna('') + " " + genres_text + " " + movies['tag']
).str.lower()

print("Sample metadata for movie 1:")
print(movies.iloc[0]['metadata'])

Sample metadata for movie 1:
toy story (1995) adventure animation children comedy fantasy children disney animation children disney disney pixar animation funny pixar pixar tumey's to see again tumey's vhs adventure classic friendship funny animation computer animation pixar toys adventure computer animation pixar adventure animated animation clever comedy computer animation family fantasy tom hanks pixar tom hanks animation pixar 3d animated children comedy computer animation disney family humorous pixar time travel tom hanks accepting reality emotional friendship funny soundtrack children computer animation disney family pixar computer animation pixar ad for toys children clever forced puns  internal logic fails mixed soundtrack reflection rivalry technological marvel pixar kids and family funny friendship animation cartoon pixar adventure animated animation buddy movie comedy computer animation disney friendship funny humorous pixar tom hanks witty animation pixar animated animation

Setting max_features=5000 tells CountVectorizer to keep only the 5,000 most frequent terms across all movies (after English stop words are removed). Rare, one-off "junk" tags fall outside this vocabulary, so the model focuses on keywords that recur across the catalogue.

In [45]:
# Turn the 'metadata' soup into a bag-of-words count matrix
vectorizer_options = {'stop_words': 'english', 'max_features': 5000}
cv = CountVectorizer(**vectorizer_options)
vector = cv.fit_transform(movies['metadata'])

# Shape check: (number of movies, vocabulary size capped at 5000)
print(vector.shape)

(87585, 5000)


Recommendation Engine (10 Movies)

In [47]:
def get_recommendations(movie_title, top_n=10):
    """Print the movies whose metadata is most similar to ``movie_title``.

    Looks the title up in the module-level ``movies`` frame, compares its row
    of the module-level ``vector`` matrix against every row with cosine
    similarity, and prints the ``top_n`` closest other movies.

    Args:
        movie_title: Exact value to match against ``movies['title']``.
        top_n: How many recommendations to print (default 10).

    Returns:
        None. Results, or a "not found" message, are written to stdout.
    """
    # Positional row numbers of exact title matches. Positions (not index
    # labels) are used because rows of `vector` are addressed by position.
    positions = (movies['title'] == movie_title).to_numpy().nonzero()[0]
    if positions.size == 0:
        print("Movie not found. Check the exact spelling/year.")
        return
    idx = int(positions[0])

    # Similarity of the chosen movie against every movie
    similarities = cosine_similarity(vector[idx], vector).flatten()

    # Rank by descending similarity. The stable sort keeps the original row
    # order among ties.
    ranked = (-similarities).argsort(kind='stable')

    # Remove the query movie by position rather than assuming it sorts first:
    # a movie with identical metadata (or an all-zero query vector) can tie
    # with it and take the first slot.
    top_positions = ranked[ranked != idx][:top_n]

    print(f"Recommendations for '{movie_title}':\n")
    for pos in top_positions:
        row = movies.iloc[pos]
        print(f"🎬 {row['title']} | Genres: {row['genres']}")



In [48]:
# TEST: Now try 'Avengers, The (2012)'
get_recommendations('Avengers, The (2012)')

Recommendations for 'Avengers, The (2012)':

🎬 Iron Man 2 (2010) | Genres: Action|Adventure|Sci-Fi|Thriller|IMAX
🎬 Iron Man (2008) | Genres: Action|Adventure|Sci-Fi
🎬 Avengers: Infinity War - Part I (2018) | Genres: Action|Adventure|Sci-Fi
🎬 Iron Man 3 (2013) | Genres: Action|Sci-Fi|Thriller|IMAX
🎬 Avengers: Age of Ultron (2015) | Genres: Action|Adventure|Sci-Fi
🎬 Captain America: Civil War (2016) | Genres: Action|Sci-Fi|Thriller
🎬 Captain America: The Winter Soldier (2014) | Genres: Action|Adventure|Sci-Fi|IMAX
🎬 Thor: The Dark World (2013) | Genres: Action|Adventure|Fantasy|IMAX
🎬 Avengers: Infinity War - Part II (2019) | Genres: Action|Adventure|Sci-Fi
🎬 Captain America: The First Avenger (2011) | Genres: Action|Adventure|Sci-Fi|Thriller|War


Save the updated files/model


In [49]:
# Persist the movie table and its count matrix. The context managers close
# each file handle as soon as the dump finishes, so no handle is left open
# and the data is flushed to disk.
with open('../model/movies_list.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('../model/vector_matrix.pkl', 'wb') as vector_file:
    pickle.dump(vector, vector_file)