In [38]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
# Load CSV files
ratings_df = pd.read_csv('Resources/ratings.csv')
movies_df = pd.read_csv('Resources/movies.csv')
tags_df = pd.read_csv('Resources/tags.csv')
links_df = pd.read_csv('Resources/links.csv')
# Merge the ratings and movies dataframes on 'movieId'
merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='inner')
# Display a sample of the data
print(ratings_df.head())
print(movies_df.head())
print(tags_df.head())
print(links_df.head())

# Define the Reader format (rating scale between 1 and 5)
reader = Reader(rating_scale=(1, 5))

# Load the dataset into Surprise format
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre

In [39]:
from surprise.model_selection import train_test_split

# Split data into 80% training and 20% test
trainset, testset = train_test_split(data, test_size=0.2)


In [40]:

from surprise import SVD

# Create the SVD model
svd = SVD()

# Train the model on the training set
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b3f33bb5b0>

In [41]:
# Make predictions on the test set
predictions = svd.test(testset)

# Function to display predictions with movie titles
def display_predictions_with_titles(predictions, merged_df):
    for prediction in predictions:
        user = prediction[0]  # Accessing user ID (first element in tuple)
        movie_id = prediction[1]  # Accessing movie ID (second element in tuple)
        true_rating = prediction[2]  # Accessing true rating (third element in tuple)
        predicted_rating = prediction[3]  # Accessing predicted rating (fourth element in tuple)
        
        # Get the movie title from the merged dataframe
        movie_title = merged_df.loc[merged_df['movieId'] == movie_id, 'title'].values
        if len(movie_title) > 0:
            movie_title = movie_title[0]  # If movie exists in the dataset, get the title
        else:
            movie_title = "Unknown Movie"  # Fallback if no title is found
        
        # Print the prediction result
        print(f"User: {user}, Movie: {movie_title}, True Rating: {true_rating}, Predicted Rating: {predicted_rating}")

# Call the function to display predictions with movie titles
display_predictions_with_titles(predictions, merged_df)


User: 470, Movie: Speed (1994), True Rating: 4.0, Predicted Rating: 3.392624795564247
User: 160, Movie: Nightmare on Elm Street 4: The Dream Master, A (1988), True Rating: 1.0, Predicted Rating: 2.4557376911316777
User: 54, Movie: Get Shorty (1995), True Rating: 3.0, Predicted Rating: 2.7384639863765834
User: 105, Movie: Kin-Dza-Dza! (1986), True Rating: 4.0, Predicted Rating: 3.9561451584992957
User: 448, Movie: Lucy (2014), True Rating: 2.5, Predicted Rating: 2.1969309803959285
User: 128, Movie: Scream (1996), True Rating: 4.0, Predicted Rating: 3.5404113369756764
User: 318, Movie: Contagion (2011), True Rating: 4.0, Predicted Rating: 3.66880142853479
User: 298, Movie: Evolution (2001), True Rating: 2.0, Predicted Rating: 1.8890713695049484
User: 314, Movie: Two Much (1995), True Rating: 3.0, Predicted Rating: 3.015716675124303
User: 28, Movie: Curious Case of Benjamin Button, The (2008), True Rating: 4.5, Predicted Rating: 2.661165323839065
User: 52, Movie: Superbad (2007), True Rat

In [42]:
from surprise import accuracy

# Evaluate the model using RMSE
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

RMSE: 0.8686
Test RMSE: 0.8685883800125248


In [43]:
# Create a mapping from movieId to movie title
movie_title_mapping = pd.Series(movies_df['title'].values, index=movies_df['movieId']).to_dict()
# Function to get top-N recommendations for a given user, with movie names
def get_top_n(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and get the top-N movies
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    # Map movie IDs to titles and return the recommendations with titles
    top_n_with_titles = {}
    for uid, user_ratings in top_n.items():
        top_n_with_titles[uid] = [(movie_title_mapping.get(movie_id, "Unknown Movie"), rating) for movie_id, rating in user_ratings]
    
    return top_n_with_titles

# Get top-10 recommendations for each user
top_n = get_top_n(predictions, n=10)

# Print top-10 recommendations for a specific user (e.g., user 1)
user_id = 91  # You can change this to any user ID you want to see recommendations for
print(f"Top 10 recommendations for user {user_id}:")
for movie_name, rating in top_n[user_id]:
    print(f"Movie: {movie_name}, Predicted Rating: {rating}")


Top 10 recommendations for user 91:
Movie: Godfather, The (1972), Predicted Rating: 4.358391742435108
Movie: Hunt for Red October, The (1990), Predicted Rating: 4.342177563029525
Movie: Star Wars: Episode IV - A New Hope (1977), Predicted Rating: 4.298028378251754
Movie: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Predicted Rating: 4.23302913890424
Movie: L.A. Confidential (1997), Predicted Rating: 4.21680457230235
Movie: Spider-Man (2002), Predicted Rating: 4.180459103688295
Movie: Casablanca (1942), Predicted Rating: 4.178208606544244
Movie: Godfather: Part II, The (1974), Predicted Rating: 4.110170285755558
Movie: One Flew Over the Cuckoo's Nest (1975), Predicted Rating: 4.081555359858418
Movie: Shawshank Redemption, The (1994), Predicted Rating: 4.06971077146553


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# Convert genres into a single string for each movie (TF-IDF)
tfidf = TfidfVectorizer(stop_words='english')
movies_df['genres'] = movies_df['genres'].fillna('')  # Fill missing genres

# Create the TF-IDF matrix for genres
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Compute cosine similarity between movies based on genres
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations based on content similarity
def get_recommendations_content_based(movie_id, top_n=10):
    movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_movies = sim_scores[1:top_n+1]
    recommended_movie_ids = [movies_df['movieId'][i[0]] for i in top_movies]
    
    return recommended_movie_ids

# Example: Get top 10 movie recommendations for a specific movie based on content-based filtering
movie_id = 1
recommended_movies_content = get_recommendations_content_based(movie_id)
print("Content-Based Recommendations:", recommended_movies_content)


Content-Based Recommendations: [2294, 3114, 3754, 4016, 4886, 45074, 53121, 65577, 91355, 103755]


In [48]:
def get_hybrid_recommendations_with_true_ratings(user_id, movie_id, svd_model, ratings_df, top_n=10, weight=0.5):
    # CF-based recommendation (SVD)
    cf_prediction = svd_model.predict(user_id, movie_id).est
    
    # CBF-based recommendation (Content-based)
    recommended_movies_content = get_recommendations_content_based(movie_id, top_n)
    
    hybrid_recommendations = []
    true_ratings = []
    
    # Combine CF and CBF recommendations
    for recommended_movie in recommended_movies_content:
        cf_movie_rating = svd_model.predict(user_id, recommended_movie).est  # Predict using CF model
        hybrid_rating = weight * cf_movie_rating + (1 - weight) * cf_prediction
        
        # Append hybrid recommendation and true rating (from ratings_df)
        true_rating = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['movieId'] == recommended_movie)]['rating'].values
        
        if true_rating.size > 0:
            hybrid_recommendations.append((recommended_movie, hybrid_rating))
            true_ratings.append(true_rating[0])  # If the true rating exists
    
    # Sort by hybrid rating
    hybrid_recommendations.sort(key=lambda x: x[1], reverse=True)
    
    return hybrid_recommendations[:top_n], true_ratings



In [49]:
def get_recommendations_switching(user_id, movie_id, svd_model, threshold=5):
    # If user has rated enough movies, use CF
    if len(ratings_df[ratings_df['userId'] == user_id]) > threshold:
        # Use CF recommendation
        return get_hybrid_recommendations_with_true_ratings(user_id, movie_id, svd_model, ratings_df, top_n=10, weight=0.5)
    else:
        # Use CBF recommendation
        return get_recommendations_content_based(movie_id)


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_rmse_for_hybrid_system(svd_model, ratings_df, top_n=10, weight=0.5):
    all_predictions = []
    all_true_ratings = []
    
    # For each user in the ratings dataset, get hybrid recommendations and true ratings
    for user_id in ratings_df['userId'].unique():
        movie_id = ratings_df[ratings_df['userId'] == user_id]['movieId'].values[0]  # Get any movie ID
        
        # Get hybrid recommendations and true ratings
        hybrid_recommendations, true_ratings = get_hybrid_recommendations_with_true_ratings(
            user_id, movie_id, svd_model, ratings_df, top_n=top_n, weight=weight
        )
        
        # Collect predictions and true ratings
        for recommended_movie, predicted_rating in hybrid_recommendations:
            all_predictions.append(predicted_rating)
        
        all_true_ratings.extend(true_ratings)
    
    # Calculate RMSE between the predicted and actual ratings
    rmse = np.sqrt(mean_squared_error(all_true_ratings, all_predictions))
    return rmse

# Example usage:
rmse = calculate_rmse_for_hybrid_system(svd, ratings_df, top_n=10, weight=0.5)
print(f"Hybrid Model RMSE: {rmse}")


Hybrid Model RMSE: 0.8085992946898131
