In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import numpy as np


In [2]:
# Folder holding the input CSVs; edit this one line to run the notebook elsewhere.
DATA_DIR = r"C:\Users\Admin\OneDrive\Documents\Desktop\final project ip"

movies = pd.read_csv(rf"{DATA_DIR}\movies.csv")    # Contains movieId, title, genres
ratings = pd.read_csv(rf"{DATA_DIR}\ratings.csv")  # Contains userId, movieId, rating, timestamp

In [3]:
# Attach movie metadata to every rating; the timestamp column is dropped
# before de-duplicating the merged rows.
df = pd.merge(ratings, movies, on="movieId")
df = df.drop(columns=["timestamp"]).drop_duplicates()

# Unique movies for content-based filtering.
# reset_index gives movies_df a contiguous 0..n-1 index so that a row label is
# also the row's position in any matrix built from this frame (without it the
# labels are sparse row numbers inherited from the merged ratings table).
movies_df = (
    df[['movieId', 'title', 'genres']]
    .drop_duplicates(subset='movieId')
    .reset_index(drop=True)
)


In [4]:
# Vectorise each movie's genre string with TF-IDF and compute the pairwise
# cosine similarity between all movies (row/column i <-> i-th row of movies_df).
tfidf = TfidfVectorizer(stop_words='english')
# assign() builds a new frame instead of writing into a possibly derived one.
movies_df = movies_df.assign(genres=movies_df['genres'].fillna(''))
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map title -> row POSITION in tfidf_matrix / cosine_sim (not the DataFrame
# label, which need not be contiguous).
indices = pd.Series(np.arange(len(movies_df)), index=movies_df['title'])
# Keep the first entry per title so a lookup always yields a single integer;
# Series.drop_duplicates() would de-duplicate the values, not the titles.
indices = indices[~indices.index.duplicated(keep='first')]


In [5]:
# Wrap the (userId, movieId, rating) triples in a Surprise dataset using a
# 0.5-5.0 rating scale, then hold out 20% of the ratings.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Seed the factorisation as well as the split; otherwise the learned factors
# (and therefore the recommendations) change on every kernel restart.
svd = SVD(random_state=42)
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a36f406d10>

In [6]:
def hybrid_recommendations(user_id, movie_title, content_weight=0.5, n=10, n_candidates=99):
    """Recommend titles by blending genre similarity with SVD rating predictions.

    Parameters
    ----------
    user_id : raw user id passed to ``svd.predict``.
    movie_title : title (as it appears in ``movies_df['title']``) used as the
        seed for the content-based part.
    content_weight : weight of the cosine-similarity score; the predicted
        rating (divided by 5) receives ``1 - content_weight``.
    n : number of titles to return.
    n_candidates : how many of the most similar movies are re-scored with the
        collaborative model (default 99, the size of the original shortlist).

    Returns
    -------
    list of up to ``n`` titles, best hybrid score first.

    Raises
    ------
    KeyError
        If ``movie_title`` is not present in ``movies_df``.
    """
    # Resolve the title to its row POSITION in movies_df, which is also its
    # row in cosine_sim. DataFrame labels are not used because they are not
    # guaranteed to be contiguous; the first match wins for duplicated titles.
    title_values = movies_df['title'].to_numpy()
    matches = np.flatnonzero((movies_df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise KeyError(movie_title)
    query_pos = int(matches[0])

    # Content-based part: rank all movies by similarity, highest first. The
    # stable sort keeps ties in row order. The seed movie is removed
    # explicitly: it is not necessarily ranked first when other movies tie
    # with it (identical genres) or when its genre vector is empty.
    similarities = np.asarray(cosine_sim[query_pos])
    ranked = np.argsort(-similarities, kind='stable')
    candidates = ranked[ranked != query_pos][:n_candidates]

    # Collaborative part: blend each candidate's similarity with the
    # predicted rating.
    movie_ids = movies_df['movieId'].to_numpy()
    hybrid_scores = []
    for pos in candidates:
        try:
            collab_score = svd.predict(user_id, movie_ids[pos]).est
        except Exception:
            # Best-effort fallback, but without swallowing KeyboardInterrupt /
            # SystemExit the way a bare ``except:`` would.
            collab_score = 3.0
        final_score = (
            content_weight * similarities[pos]
            + (1 - content_weight) * (collab_score / 5)
        )
        hybrid_scores.append((title_values[pos], final_score))

    # Sort and return top N titles
    top_movies = sorted(hybrid_scores, key=lambda pair: pair[1], reverse=True)[:n]
    return [title for title, _ in top_movies]


In [7]:
# Example: recommendations for user 1 seeded with "Toy Story (1995)", weighting
# genre similarity at 0.4 (the collaborative prediction therefore gets 0.6).
hybrid_recommendations(user_id=1, movie_title="Toy Story (1995)", content_weight=0.4)


['Monsters, Inc. (2001)',
 'Toy Story 2 (1999)',
 'Moana (2016)',
 'Inside Out (2015)',
 'Ponyo (Gake no ue no Ponyo) (2008)',
 "Kiki's Delivery Service (Majo no takkyûbin) (1989)",
 "Emperor's New Groove, The (2000)",
 'Turbo (2013)',
 'Shrek (2001)',
 'Antz (1998)']