In [1]:
import pandas as pd
import numpy as np
import math as m
import random as r
from tabulate import tabulate
from scipy.stats import pearsonr
from scipy.stats import spearmanr
links = pd.read_csv('ml-latest-small/links.csv')
links.head(5)
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(5)
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(5)
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head(5)
#dropping the timestamp column
ratings = ratings.drop(['timestamp'], axis=1)
#movie and ratings dataset
movie_ratings = pd.merge(ratings, movies, on='movieId')
movie_ratings.head()
#reshaping the data to table based on column values
user_ptable= ratings.pivot(index='userId', columns='movieId', values='rating')
user_ptable.head()

# Convert user IDs to integers
#user_ptable.index = user_ptable.index.astype(int)

# Reset the index of user_ptable
#user_ptable.reset_index(drop=True, inplace=True)



#pearson correlation coefficient
def pearson_correlation(user_a_ratings,user_b_ratings, user_ptable):
    corr,_ = pearsonr(user_a_ratings,user_b_ratings)
    return corr

def user_collaborative_filtering(target_user,user_ptable,correlationfunction):
    '''
    Gets the most similar users and their correlations to the target user
    Parameters: int target_user -user id in the dataset
                user_ptable - data as a pivot table
                correlationfunction - the correlation function to be used
    Return: dict similar_users -dictionary of users who have rated similar movies as the target user
    with their ratings.
    '''
    similar_users = {}
    #other users who are not the target user
    for user_b in user_ptable.index:
        if user_b != target_user:
            # ratings for the target user and user_b
            target_user_ratings = user_ptable.loc[target_user].dropna()
            user_b_ratings = user_ptable.loc[user_b].dropna()

            # common rated movies
            common_rated_movies = target_user_ratings.index.intersection(user_b_ratings.index)
            #filter for at least 2  common rated movies
            if len(common_rated_movies) >= 2:
                #filter  ratings to include only common rated movies
                target_user_ratings = target_user_ratings[common_rated_movies]
                user_b_ratings = user_b_ratings[common_rated_movies]
                #check if either contains all the same elements as correlation will be 1 regardless of actual rating
                if len(set(target_user_ratings)) == 1 or len(set(user_b_ratings)) == 1:
                    continue
                similar_users[user_b] = correlationfunction(target_user_ratings,user_b_ratings,user_ptable)
                    
    return similar_users      


In [2]:
def user_prediction(user_a,item_p,user_ptable,similarities):
    '''
    Calculates the predicted rating of user `user_a` for item `item_p`.
    Parameters: int user_a - the index of the target user
                int item_p - the index of the unseen movie by target user
                user_ptable - pivot table of data
                similarities - the dictionary of correlations between target user
                  and other users.
    Return: int prediction - rating of user a for item p
    '''
    user_a_ratings = user_ptable.loc[user_a]
    mean_usera_ratings = user_a_ratings.mean()
    unseen_item_ratings = user_ptable.loc[:, item_p].dropna()

    # Get the similarity scores between the target user and other users who have rated the unseen item.
    #relevant_similarities = {}
    predicted_rating = 0
    weighted_difference = 0
    similarity_sum = 0
    for user_b, similarity in similarities.items():
        if user_b != user_a and user_b in unseen_item_ratings.index:
            user_b_ratings = user_ptable.loc[user_b]
            mean_userb_ratings = user_b_ratings.mean()
            rating_difference = unseen_item_ratings.loc[user_b] - mean_userb_ratings
            weighted_difference += (similarity*rating_difference)
            similarity_sum += abs(similarity)

    if similarity_sum != 0:
        # the prediction as the active user's mean plus the weighted rating differences
        predicted_rating = mean_usera_ratings + (weighted_difference / similarity_sum)
    else:
        predicted_rating = mean_usera_ratings

    return np.clip(predicted_rating,0.5,5)
   
   

In [3]:
def get_user_recommendations(user, user_ptable, correlationfunction, prediction_function):
    similar_users = user_collaborative_filtering(user, user_ptable, correlationfunction)
    sorted_similar_users =  sorted(similar_users.items(), key=lambda item: item[1],reverse=True)
    #sorted_similar_users = sorted(similar_users.items(), key=operator.itemgetter(1), reverse=True)
    top_similar_users = sorted_similar_users[:10]
    top_10_similar_users_dict={}
    for user,similarity in top_similar_users:
        top_10_similar_users_dict[user]=similarity
    user_recommendations = {}
    for movie in user_ptable.columns:
        if pd.isna(user_ptable.loc[user, movie]):
            user_recommendations[movie] = prediction_function(user, movie, user_ptable, top_10_similar_users_dict)
    return user_recommendations

In [4]:
def group_recommendations(user_recommendations_dict, aggregation_method, top_n = 10):
  
    movie_ratings = {}
    #user_recommendations_list = [user_recommendations]
    for user,recommendations in user_recommendations_dict.items():
        for movie, rating in recommendations.items():
            if movie not in movie_ratings:
                movie_ratings[movie] = []
            movie_ratings[movie].append(rating)
    aggregated_ratings = {}   
    if aggregation_method == 'average':
        aggregated_ratings = {movie: np.mean(ratings) for movie, ratings in movie_ratings.items()}

    elif aggregation_method == 'least misery':
        aggregated_ratings = {movie: np.min(ratings) for movie, ratings in movie_ratings.items()}
        
    #group recommendations
    sorted_group_recommendations = sorted(aggregated_ratings.items(), key=lambda item: item[1], reverse=True)
    top_group_recommendations = sorted_group_recommendations[:top_n]

    return top_group_recommendations


In [5]:
group_users = np.random.choice(user_ptable.index, size=3, replace=False)
user_recommendations_dict = {}
for user in group_users:
    user_recommendations_dict [user] = get_user_recommendations(user, user_ptable, pearson_correlation, user_prediction)

average_ratings = group_recommendations(user_recommendations_dict,'average')
least_misery_ratings = group_recommendations(user_recommendations_dict,'least misery')

In [6]:
def create_dataframe(data, movies_df):
    # Assuming movies_df is your DataFrame containing movie information
    movie_titles_dict = movies_df.set_index('movieId')['title'].to_dict()

    # Extract movie IDs, ratings, and titles from the list of tuples
    movie_ids, ratings = zip(*data)
    movie_titles = [movie_titles_dict.get(movie_id, 'Unknown') for movie_id in movie_ids]

    movie_df = pd.DataFrame({
        'Movie ID': movie_ids,
        'Title': movie_titles,
        'Rating': ratings
    })

    return movie_df

In [7]:
average_recommendations = create_dataframe(average_ratings,movies)
print(average_recommendations)

   Movie ID                                              Title    Rating
0      4886                              Monsters, Inc. (2001)  4.838680
1       595                        Beauty and the Beast (1991)  4.798325
2       750  Dr. Strangelove or: How I Learned to Stop Worr...  4.752233
3      5816     Harry Potter and the Chamber of Secrets (2002)  4.718750
4     50872                                 Ratatouille (2007)  4.666667
5      1136             Monty Python and the Holy Grail (1975)  4.659758
6      1028                                Mary Poppins (1964)  4.648200
7       457                               Fugitive, The (1993)  4.638520
8      4034                                     Traffic (2000)  4.625498
9        34                                        Babe (1995)  4.585661


In [8]:
least_misery_recommendations = create_dataframe(least_misery_ratings,movies)
print(least_misery_recommendations)

   Movie ID                                              Title    Rating
0       595                        Beauty and the Beast (1991)  4.596649
1       750  Dr. Strangelove or: How I Learned to Stop Worr...  4.537234
2      4886                              Monsters, Inc. (2001)  4.516039
3       110                                  Braveheart (1995)  4.500025
4       356                                Forrest Gump (1994)  4.461171
5      5816     Harry Potter and the Chamber of Secrets (2002)  4.437500
6       260          Star Wars: Episode IV - A New Hope (1977)  4.311251
7      1210  Star Wars: Episode VI - Return of the Jedi (1983)  4.301601
8       457                               Fugitive, The (1993)  4.277041
9       541                                Blade Runner (1982)  4.260000


Part b


Counting disagreements between users in a group involves measuring how different their preferences are. In the context of collaborative filtering for recommendation systems, users' preferences are often represented as vectors in a high-dimensional space, where each dimension corresponds to a different movie. A way is to use a similarity metric, such as cosine similarity. Cosine similarity measures the cosine of the angle between two vectors, providing a value between -1 and 1. A higher cosine similarity indicates more similar preferences, while a lower cosine similarity suggests greater dissimilarity. To quantify disagreements, we can use the complement of this similarity, which gives us a measure of dissimilarity. This value can be interpreted as a disagreement score between users.

Cosine Similarity:

Given two users, each represented by their preference vector (ratings for items), calculate the cosine similarity.
A cosine similarity of 1 indicates identical preferences, 0 indicates orthogonal (uncorrelated) preferences, and -1 indicates opposite preferences.

Disagreement Score:

To quantify disagreements, use the complement of the cosine similarity as a disagreement score.
Disagreement Score = 1 - Cosine Similarity
This score ranges from 0 (indicating no disagreement) to 2 (indicating maximum disagreement).
Method for Computing Group Suggestions with Disagreements:

When computing suggestions for a group, consider the disagreement scores between all pairs of users in the group.
Adjust the predicted ratings for each item based on the disagreement scores.
The idea is to penalize predictions more when users have higher disagreements.

Adjusting Predicted Ratings:

The adjustment aims to provide more personalized and relevant recommendations by considering the diversity of preferences within the group
Adjust the predicted ratings based on the disagreement scores between users. The idea is to penalize predictions in situations where users have high disagreements. This can be done by multiplying the predicted ratings by the disagreement scores.
For each item:
If users have higher disagreement scores (indicating greater dissimilarity), you decrease the predicted rating for that item.
If users have lower disagreement scores (indicating more similarity), you may give more weight to the predicted rating.

In [13]:
def cosine_similarity(user1, user2, user_ptable):
    common_movies = user_ptable.columns[pd.notna(user_ptable.loc[user1]) & pd.notna(user_ptable.loc[user2])]
    if len(common_movies) < 2:
        return 0.0
    vector1 = user_ptable.loc[user1, common_movies]
    vector2 = user_ptable.loc[user2, common_movies]
    similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    return similarity


In [14]:
def get_user_recommendations_with_disagreement(user, user_ptable, correlationfunction, prediction_function):

    
    similar_users = user_collaborative_filtering(user, user_ptable, correlationfunction)
    sorted_similar_users = sorted(similar_users.items(), key=lambda item: item[1], reverse=True)
    top_similar_users = sorted_similar_users[:10]
    top_10_similar_users_dict = {user: similarity for user, similarity in top_similar_users}

    user_recommendations = {}
    for movie in user_ptable.columns:
        if pd.isna(user_ptable.loc[user, movie]):
            user_recommendations[movie] = prediction_function(user, movie, user_ptable, top_10_similar_users_dict)
    return user_recommendations

In [15]:
def group_recommendations_with_disagreement(user_recommendations_dict, cosine_similarity, user_ptable, top_n=10):
    movie_ratings = {}
    for user, recommendations in user_recommendations_dict.items():
        for movie, rating in recommendations.items():
            if movie not in movie_ratings:
                movie_ratings[movie] = []
            movie_ratings[movie].append(rating)

    aggregated_ratings = {movie: np.mean(ratings) for movie, ratings in movie_ratings.items()}


    # Consider disagreements using cosine similarity
    for user1 in user_recommendations_dict:
        for user2 in user_recommendations_dict:
            if user1 != user2:
                disagreement_score = 1 - cosine_similarity(user1, user2, user_ptable)
                for movie in user_ptable.columns:
                    if pd.isna(user_ptable.loc[user1, movie]) and pd.isna(user_ptable.loc[user2, movie]):
                        # Adjust the aggregated rating based on the disagreement score
                        aggregated_ratings[movie] *= disagreement_score

    sorted_group_recommendations = sorted(aggregated_ratings.items(), key=lambda item: item[1], reverse=True)
    top_group_recommendations = sorted_group_recommendations[:top_n]

    return top_group_recommendations
    

In [16]:
user_recommendations_dict = {}
for user in group_users:
    user_recommendations_dict[user] = get_user_recommendations_with_disagreement(user, user_ptable, cosine_similarity, user_prediction)

# Assume you have a DataFrame named user_ptable representing the user-item matrix
disagreement_aware_ratings = group_recommendations_with_disagreement(user_recommendations_dict, cosine_similarity, user_ptable)
disagreement_aware_recommendations = create_dataframe(disagreement_aware_ratings, movies)
print(disagreement_aware_recommendations)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed