In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

In [2]:
data_ratings = pd.read_csv('ratings.csv')
data_movies = pd.read_csv('movies.csv')
data_links = pd.read_csv('links.csv')
data_tags = pd.read_csv('tags.csv')

In [3]:
ratings_matrix = data_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [4]:
ratings_matrix_centered = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)

user_similarity_pearson = cosine_similarity(ratings_matrix_centered)

In [5]:
user_similarity_df_pearson = pd.DataFrame(user_similarity_pearson, index=ratings_matrix.index, columns=ratings_matrix.index)

# Similarly, center the data for item-item similarity
ratings_matrix_centered_items = ratings_matrix.T.sub(ratings_matrix.mean(axis=0), axis=0)

# Calculate the Pearson (centered cosine) similarity between items
item_similarity_pearson = cosine_similarity(ratings_matrix_centered_items)

# Create a DataFrame for item similarity
item_similarity_df_pearson = pd.DataFrame(item_similarity_pearson, index=ratings_matrix.columns, columns=ratings_matrix.columns)


In [6]:
def predict_user_based(user_id, movie_id, user_similarity_df, ratings_matrix, nNeighbours=10):
    # Get the top nNeighbours most similar users to the target user
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:nNeighbours+1]
    
    weighted_ratings = 0
    total_similarity = 0
    
    for similar_user, similarity in zip(similar_users.index, similar_users):
        if ratings_matrix.loc[similar_user, movie_id] > 0:  # Only consider rated movies
            weighted_ratings += similarity * ratings_matrix.loc[similar_user, movie_id]
            total_similarity += similarity
    
    if total_similarity == 0:
        return 0  # Return 0 if no similar users rated the movie
    return weighted_ratings / total_similarity

In [7]:
def predict_item_based(user_id, movie_id, item_similarity_df, ratings_matrix, nNeighbours=10):
    # Get the top nNeighbours most similar items to the target movie
    similar_items = item_similarity_df[movie_id].sort_values(ascending=False)[1:nNeighbours+1]
    
    weighted_ratings = 0
    total_similarity = 0
    
    for similar_item, similarity in zip(similar_items.index, similar_items):
        if ratings_matrix.loc[user_id, similar_item] > 0:  # Only consider rated movies
            weighted_ratings += similarity * ratings_matrix.loc[user_id, similar_item]
            total_similarity += similarity
    
    if total_similarity == 0:
        return 0  # Return 0 if no similar items are rated
    return weighted_ratings / total_similarity

In [8]:
def recommend_user_based(user_id, ratings_matrix, user_similarity_df, nNeighbours=10, topN=10):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).head(nNeighbours)
    predicted_ratings = {}
    
    for movie_id in ratings_matrix.columns:
        if ratings_matrix.loc[user_id, movie_id] > 0:
            continue
        
        similar_users_ratings = ratings_matrix.loc[similar_users.index, movie_id]
        weighted_sum = (similar_users_ratings * similar_users).sum()
        normalization_factor = similar_users.sum()
        
        if normalization_factor > 0:
            predicted_ratings[movie_id] = weighted_sum / normalization_factor
        else:
            predicted_ratings[movie_id] = 0
    
    recommended_movie_ids = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:topN]
    recommended_movie_titles = data_movies[data_movies['movieId'].isin(recommended_movie_ids)]['title'].tolist()
    
    return recommended_movie_titles

In [9]:
def recommend_item_based(user_id, ratings_matrix, item_similarity_df, nNeighbours=10, topN=10):
    rated_movies = ratings_matrix.loc[user_id][ratings_matrix.loc[user_id] > 0].index.tolist()
    predicted_ratings = {}
    
    for movie_id in ratings_matrix.columns:
        if movie_id in rated_movies:
            continue
        
        similar_items = item_similarity_df[movie_id].sort_values(ascending=False).head(nNeighbours)
        weighted_sum = sum(similar_items * ratings_matrix.loc[user_id, similar_items.index])
        normalization_factor = similar_items.sum()
        
        if normalization_factor > 0:
            predicted_ratings[movie_id] = weighted_sum / normalization_factor
        else:
            predicted_ratings[movie_id] = 0
    
    recommended_movie_ids = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:topN]
    recommended_movie_titles = data_movies[data_movies['movieId'].isin(recommended_movie_ids)]['title'].tolist()
    
    return recommended_movie_titles

In [10]:
recommended_movies_user_based = recommend_user_based(user_id=1, ratings_matrix=ratings_matrix, 
                                                    user_similarity_df=user_similarity_df_pearson, nNeighbours=10, topN=5)
print("Recommended Movies (User-Based):")
for movie in recommended_movies_user_based:
    print(movie)

Recommended Movies (User-Based):
Terminator 2: Judgment Day (1991)
Godfather, The (1972)
Aliens (1986)
Hunt for Red October, The (1990)
Sixth Sense, The (1999)


In [11]:
recommended_movies_item_based = recommend_item_based(user_id=1, ratings_matrix=ratings_matrix, 
                                                    item_similarity_df=item_similarity_df_pearson, nNeighbours=10, topN=5)
print("Recommended Movies (Item-Based):")
for movie in recommended_movies_item_based:
    print(movie)

Recommended Movies (Item-Based):
Trainspotting (1996)
Die Hard (1988)
Aliens (1986)
Jaws (1975)
Sixth Sense, The (1999)


In [12]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

# Predict ratings for a user-item pair
user_id = 1  # Example user
movie_ids = ratings_matrix.columns  # List of all movie ids

actual_ratings = ratings_matrix.loc[user_id, movie_ids]  # Actual ratings for this user
predicted_ratings_user_based = np.array([predict_user_based(user_id, movie_id, user_similarity_df_pearson, ratings_matrix) for movie_id in movie_ids])
predicted_ratings_item_based = np.array([predict_item_based(user_id, movie_id, item_similarity_df_pearson, ratings_matrix) for movie_id in movie_ids])

# Calculate MAE
mae_user_based = calculate_mae(predicted_ratings_user_based, actual_ratings)
mae_item_based = calculate_mae(predicted_ratings_item_based, actual_ratings)

print(f"MAE for User-Based Recommendation: {mae_user_based}")
print(f"MAE for Item-Based Recommendation: {mae_item_based}")


MAE for User-Based Recommendation: 0.47286898353299195
MAE for Item-Based Recommendation: 0.23991720171886272


In [13]:
def calculate_rmse(predicted_ratings, actual_ratings):
    return np.sqrt(np.mean((predicted_ratings - actual_ratings) ** 2))

# Calculate RMSE
rmse_user_based = calculate_rmse(predicted_ratings_user_based, actual_ratings)
rmse_item_based = calculate_rmse(predicted_ratings_item_based, actual_ratings)

print(f"RMSE for User-Based Recommendation: {rmse_user_based}")
print(f"RMSE for Item-Based Recommendation: {rmse_item_based}")

RMSE for User-Based Recommendation: 1.302580733172758
RMSE for Item-Based Recommendation: 1.0016021465257543
