In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

In [2]:
data_ratings = pd.read_csv('ratings.csv')
data_movies = pd.read_csv('movies.csv')

In [3]:
ratings_matrix = data_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)


In [4]:
U, Sigma, Vt = np.linalg.svd(ratings_matrix, full_matrices=False)
k = 20  # Number of latent factors
U_k = U[:, :k]
Sigma_k = np.diag(Sigma[:k])
Vt_k = Vt[:k, :]

In [5]:
ratings_matrix_approx = np.dot(np.dot(U_k, Sigma_k), Vt_k)

# Convert the predicted ratings into a pandas DataFrame for convenience
predicted_ratings_svd = pd.DataFrame(ratings_matrix_approx, index=ratings_matrix.index, columns=ratings_matrix.columns)


In [6]:
ratings_matrix_centered = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)
user_similarity_pearson = cosine_similarity(ratings_matrix_centered)
user_similarity_df_pearson = pd.DataFrame(user_similarity_pearson, index=ratings_matrix.index, columns=ratings_matrix.index)

In [7]:
ratings_matrix_centered_items = ratings_matrix.T.sub(ratings_matrix.mean(axis=0), axis=0)
item_similarity_pearson = cosine_similarity(ratings_matrix_centered_items)
item_similarity_df_pearson = pd.DataFrame(item_similarity_pearson, index=ratings_matrix.columns, columns=ratings_matrix.columns)

In [8]:
def content_based_similarity(data_movies):
    # One-hot encode genres
    mlb = MultiLabelBinarizer()
    genres = data_movies['genres'].str.split('|')
    genre_matrix = mlb.fit_transform(genres)
    
    # Calculate the cosine similarity between movies based on their genres
    content_similarity = cosine_similarity(genre_matrix)
    
    return pd.DataFrame(content_similarity, index=data_movies['movieId'], columns=data_movies['movieId'])

# Calculate content-based similarity
content_similarity_df = content_based_similarity(data_movies)

In [9]:
def predict_user_based(user_id, movie_id, user_similarity_df, ratings_matrix, nNeighbours=10):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:nNeighbours+1]
    
    weighted_ratings = 0
    total_similarity = 0
    
    for similar_user, similarity in zip(similar_users.index, similar_users):
        if ratings_matrix.loc[similar_user, movie_id] > 0:  # Only consider rated movies
            weighted_ratings += similarity * ratings_matrix.loc[similar_user, movie_id]
            total_similarity += similarity
    
    if total_similarity == 0:
        return 0  # Return 0 if no similar users rated the movie
    return weighted_ratings / total_similarity

In [10]:
def predict_item_based(user_id, movie_id, item_similarity_df, ratings_matrix, nNeighbours=10):
    similar_items = item_similarity_df[movie_id].sort_values(ascending=False)[1:nNeighbours+1]
    
    weighted_ratings = 0
    total_similarity = 0
    
    for similar_item, similarity in zip(similar_items.index, similar_items):
        if ratings_matrix.loc[user_id, similar_item] > 0:  # Only consider rated movies
            weighted_ratings += similarity * ratings_matrix.loc[user_id, similar_item]
            total_similarity += similarity
    
    if total_similarity == 0:
        return 0  # Return 0 if no similar items are rated
    return weighted_ratings / total_similarity

In [11]:
def predict_svd(user_id, movie_id, predicted_ratings_svd):
    return predicted_ratings_svd.loc[user_id, movie_id]

In [12]:
def hybrid_recommendation(user_id, ratings_matrix, user_similarity_df, item_similarity_df, predicted_ratings_svd, content_similarity_df, topN=10, weights=[1, 1, 1, 1]):
    predicted_ratings = []
    
    for movie_id in ratings_matrix.columns:
        if ratings_matrix.loc[user_id, movie_id] > 0:
            continue
        
        # Get predictions from collaborative filtering methods
        pred_user_based = predict_user_based(user_id, movie_id, user_similarity_df, ratings_matrix)
        pred_item_based = predict_item_based(user_id, movie_id, item_similarity_df, ratings_matrix)
        pred_svd_based = predict_svd(user_id, movie_id, predicted_ratings_svd)
        
        # Get content-based prediction based on content similarity
        pred_content_based = content_similarity_df.loc[movie_id].mean()  # Average similarity of the movie
        
        # Aggregate the predictions with weights
        weighted_prediction = (weights[0] * pred_user_based + weights[1] * pred_item_based + weights[2] * pred_svd_based + weights[3] * pred_content_based) / sum(weights)
        predicted_ratings.append((movie_id, weighted_prediction))
    
    # Sort by weighted prediction and get top N recommendations
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = [data_movies[data_movies['movieId'] == movie_id]['title'].values[0] for movie_id, _ in predicted_ratings[:topN]]
    
    return top_recommendations

In [13]:
user_id = 2  # Example user ID
recommended_movies_hybrid = hybrid_recommendation(user_id, ratings_matrix, user_similarity_df_pearson, item_similarity_df_pearson, predicted_ratings_svd, content_similarity_df, topN=5)

print("Recommended Movies (Hybrid with Content-Based + Collaborative):")
for movie in recommended_movies_hybrid:
    print(movie)

Recommended Movies (Hybrid with Content-Based + Collaborative):
Elite Squad: The Enemy Within (Tropa de Elite 2 - O Inimigo Agora É Outro) (2010)
Deadpool (2016)
Moonrise Kingdom (2012)
John Wick (2014)
Titanic (1997)


In [14]:
user_rated_movies = data_ratings[data_ratings['userId'] == user_id]
print(user_rated_movies.merge(data_movies, on='movieId'))

# Check recommended movies
print(recommended_movies_hybrid)

    userId  movieId  rating   timestamp  \
0        2      318     3.0  1445714835   
1        2      333     4.0  1445715029   
2        2     1704     4.5  1445715228   
3        2     3578     4.0  1445714885   
4        2     6874     4.0  1445714952   
5        2     8798     3.5  1445714960   
6        2    46970     4.0  1445715013   
7        2    48516     4.0  1445715064   
8        2    58559     4.5  1445715141   
9        2    60756     5.0  1445714980   
10       2    68157     4.5  1445715154   
11       2    71535     3.0  1445714974   
12       2    74458     4.0  1445714926   
13       2    77455     3.0  1445714941   
14       2    79132     4.0  1445714841   
15       2    80489     4.5  1445715340   
16       2    80906     5.0  1445715172   
17       2    86345     4.0  1445715166   
18       2    89774     5.0  1445715189   
19       2    91529     3.5  1445714891   
20       2    91658     2.5  1445714938   
21       2    99114     3.5  1445714874   
22       2 

In [17]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

def calculate_rmse(predicted_ratings, actual_ratings):
    return np.sqrt(np.mean((predicted_ratings - actual_ratings) ** 2))

# Compare predictions to actual ratings
actual_ratings = ratings_matrix.loc[user_id, ratings_matrix.columns]
predicted_ratings = [predict_svd(user_id, movie_id, predicted_ratings_svd) for movie_id in ratings_matrix.columns]
print("MAE:", calculate_mae(predicted_ratings, actual_ratings))
print("RMSE:", calculate_rmse(predicted_ratings, actual_ratings))

MAE: 0.044924724476642273
RMSE: 0.20494891446798322
