In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Configurations
# NOTE(review): absolute, machine-specific path; point it at the local MovieLens folder.
data_path = "/Users/rainfalls/Downloads/ml-32m/"

# Load Data
movies = pd.read_csv(data_path + "movies.csv")
tags = pd.read_csv(data_path + "tags.csv")

# Clean and prepare Tags
# Missing tags are dropped *before* the string cast: astype(str) would otherwise
# turn NaN into the literal text "nan", which would leak into the descriptions.
# The tags of each movie are then collapsed into one space-separated string.
# dict.fromkeys de-duplicates while keeping first-seen order, so the joined text
# is the same on every run (iterating a set of strings is not order-stable
# across interpreter sessions).
tags = (
    tags.dropna(subset=['tag'])
    .assign(tag=lambda frame: frame['tag'].astype(str))
    .groupby('movieId', sort=False)['tag']
    .agg(lambda movie_tags: ' '.join(dict.fromkeys(movie_tags)))
    .reset_index()
)

# Merge tags with movies (left join: movies without any tag are kept)
movies = movies.merge(tags, on='movieId', how='left')
# Movies that had no tags get an empty string instead of NaN
movies['tag'] = movies['tag'].fillna('')

# Create a new column in movies DataFrame to hold combined descriptions.
# genres is filled as well so a missing value cannot turn the whole description into NaN.
movies['description'] = movies['genres'].fillna('') + " " + movies['tag']

# Print some data to check everything loaded correctly
print(movies.head())

# Initialize Sentence Transformer Model and generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
movie_descriptions = movies['description'].tolist()
embeddings = model.encode(movie_descriptions, normalize_embeddings=True)

# Save embeddings for later use (optional)
np.save(data_path + "movie_embeddings.npy", embeddings)

# Calculate cosine similarity matrix
# NOTE(review): this is a dense (n_movies x n_movies) array, so memory grows
# quadratically with the number of rows in `movies`.
similarity_matrix = cosine_similarity(embeddings)

# Function to find similar movies based on cosine similarity
def get_similar_movies(movie_title, top_n=5):
    """Return the movies whose descriptions are most similar to ``movie_title``.

    Reads the module-level ``movies`` DataFrame and ``similarity_matrix``;
    row ``i`` of the matrix is expected to correspond to row ``i`` (by
    position) of ``movies``.

    Parameters
    ----------
    movie_title : str
        Title to look up. It is compared for exact equality against
        ``movies['title']``; if several rows match, the first one is used.
    top_n : int, default 5
        Maximum number of similar movies to return. Values below zero are
        treated as zero.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the ``title`` and ``genres`` columns of the best
        matches, ordered from most to least similar, or the message
        ``"No movie found with title <movie_title>"`` when no row matches.
    """
    # Resolve the title to a row *position* (not an index label), because both
    # similarity_matrix and .iloc below are addressed positionally.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        return f"No movie found with title {movie_title}"
    movie_idx = int(matches[0])

    scores = np.asarray(similarity_matrix[movie_idx], dtype=float)
    # Stable descending sort: rows with equal scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query movie explicitly. Skipping "the first result" is not
    # enough: movies with identical descriptions tie at the top score, and the
    # query movie is then not guaranteed to be ranked first.
    ranked = ranked[ranked != movie_idx]

    top_similar_indices = ranked[:max(int(top_n), 0)]
    return movies.iloc[top_similar_indices][['title', 'genres']]

# Demo: fetch the five nearest neighbours of a single known title
example_movie = 'Toy Story (1995)'  # swap in any title that exists in the dataset
similar_movies = get_similar_movies(example_movie, top_n=5)
# One print call with a newline separator emits the header line followed by the result
print(f"Movies similar to {example_movie}:", similar_movies, sep="\n")

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                                 tag  \
0  girl want to see again ventilation shaft light...   
1  girl driving a car into a building fish out of...   
2  Minnesota old age moldy Walter Matthau duringc...   
3  divorce chick flick single mother based on nov...   
4  parent child relationship seen more than once ...   

                                         description  
0  Adv

In [None]:
# Prompt user to enter a movie title
# Surrounding whitespace is trimmed because get_similar_movies compares the
# title for exact equality, so a stray space would make a valid title miss.
user_movie_input = input("Enter a movie title to find similar movies: ").strip()
print(f"User entered: {user_movie_input}")

similar_movies = get_similar_movies(user_movie_input, 5)

# get_similar_movies returns a plain message string when the title is unknown;
# in that case print only the message instead of a misleading results header.
if isinstance(similar_movies, str):
    print(similar_movies)
else:
    print(f"Movies similar to '{user_movie_input}':")
    print(similar_movies)