In [52]:
import pandas as pd
import numpy as np
import math as m
import random as r
from tabulate import tabulate
from scipy.stats import pearsonr
from scipy.stats import spearmanr
links = pd.read_csv('ml-latest-small/links.csv')
links.head(5)
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(5)
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(5)
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head(5)
#dropping the timestamp column
ratings = ratings.drop(['timestamp'], axis=1)
#movie and ratings dataset
movie_ratings = pd.merge(ratings, movies, on='movieId')
movie_ratings.head()
#reshaping the data to table based on column values
user_ptable= ratings.pivot(index='userId', columns='movieId', values='rating')
user_ptable.head()
#pearson correlation coefficient
def pearson_correlation(user_a_ratings,user_b_ratings):
    corr,_ = pearsonr(user_a_ratings,user_b_ratings)
    return corr

def user_collaborative_filtering(target_user,p_table,correlationfunction):
    '''
    Gets the most similar users and their correlations to the target user
    Parameters: int target_user -user id in the dataset
                p_table - data as a pivot table
                correlationfunction - the correlation function to be used
    Return: dict similar_users -dictionary of users who have rated similar movies as the target user
    with their ratings.
    '''
    similar_users = {}
    #other users who are not the target user
    for user_b in p_table.index:
        if user_b != target_user:
            # ratings for the target user and user_b
            target_user_ratings = p_table.loc[target_user].dropna()
            user_b_ratings = p_table.loc[user_b].dropna()

            # common rated movies
            common_rated_movies = target_user_ratings.index.intersection(user_b_ratings.index)
            #filter for at least 2  common rated movies
            if len(common_rated_movies) >= 2:
                #filter  ratings to include only common rated movies
                target_user_ratings = target_user_ratings[common_rated_movies]
                user_b_ratings = user_b_ratings[common_rated_movies]
                #check if either contains all the same elements as correlation will be 1 regardless of actual rating
                if len(set(target_user_ratings)) == 1 or len(set(user_b_ratings)) == 1:
                    continue
                similar_users[user_b] = correlationfunction(target_user_ratings,user_b_ratings)
                    
    return similar_users                                                                                                                                                                            

In [53]:
def user_prediction(user_a,item_p,p_table,similarities):
    '''
    Calculates the predicted rating of user `user_a` for item `item_p`.
    Parameters: int user_a - the index of the target user
                int item_p - the index of the unseen movie by target user
                p_table - pivot table of data
                similarities - the dictionary of correlations between target user
                  and other users.
    Return: int prediction - rating of user a for item p
    '''
    user_a_ratings = p_table.loc[user_a]
    mean_usera_ratings = user_a_ratings.mean()
    unseen_item_ratings = p_table.loc[:, item_p].dropna()

    # Get the similarity scores between the target user and other users who have rated the unseen item.
    #relevant_similarities = {}
    predicted_rating = 0
    weighted_difference = 0
    similarity_sum = 0
    for user_b, similarity in similarities.items():
        if user_b != user_a and user_b in unseen_item_ratings.index:
            user_b_ratings = p_table.loc[user_b]
            mean_userb_ratings = user_b_ratings.mean()
            rating_difference = unseen_item_ratings.loc[user_b] - mean_userb_ratings
            weighted_difference += (similarity*rating_difference)
            similarity_sum += abs(similarity)

    if similarity_sum != 0:
        # the prediction as the active user's mean plus the weighted rating differences
        predicted_rating = mean_usera_ratings + (weighted_difference / similarity_sum)
    else:
        predicted_rating = mean_usera_ratings

    return np.clip(predicted_rating,0.5,5)
   
   

In [54]:
def get_user_recommendations(user, p_table, correlation_function, prediction_function):
    similar_users = user_collaborative_filtering(user, p_table, correlation_function)
    sorted_similar_users =  sorted(similar_users.items(), key=lambda item: item[1],reverse=True)
    #sorted_similar_users = sorted(similar_users.items(), key=operator.itemgetter(1), reverse=True)
    top_similar_users = sorted_similar_users[:10]
    top_10_similar_users_dict={}
    for user,similarity in top_similar_users:
        top_10_similar_users_dict[user]=similarity
    user_recommendations = {}
    for movie in p_table.columns:
        if pd.isna(p_table.loc[user, movie]):
            user_recommendations[movie] = prediction_function(user, movie, p_table, top_10_similar_users_dict)
    return user_recommendations

In [55]:
def group_recommendations(user_recommendations_dict, aggregation_method, top_n = 10):
  
    movie_ratings = {}
    #user_recommendations_list = [user_recommendations]
    for user,recommendations in user_recommendations_dict.items():
        for movie, rating in recommendations.items():
            if movie not in movie_ratings:
                movie_ratings[movie] = []
            movie_ratings[movie].append(rating)
    aggregated_ratings = {}   
    if aggregation_method == 'average':
        aggregated_ratings = {movie: np.mean(ratings) for movie, ratings in movie_ratings.items()}

    elif aggregation_method == 'least misery':
        aggregated_ratings = {movie: np.min(ratings) for movie, ratings in movie_ratings.items()}
        
    #group recommendations
    sorted_group_recommendations = sorted(aggregated_ratings.items(), key=lambda item: item[1], reverse=True)
    top_group_recommendations = sorted_group_recommendations[:top_n]

    return top_group_recommendations


In [56]:
group_users = np.random.choice(user_ptable.index, size=3, replace=False)
user_recommendations_dict = {}
for user in group_users:
    user_recommendations_dict [user] = get_user_recommendations(user, user_ptable, pearson_correlation, user_prediction)

average_ratings = group_recommendations(user_recommendations_dict,'average')
least_misery_ratings = group_recommendations(user_recommendations_dict,'least misery')

In [57]:
def create_dataframe(data, movies_df):
    # Assuming movies_df is your DataFrame containing movie information
    movie_titles_dict = movies_df.set_index('movieId')['title'].to_dict()

    # Extract movie IDs, ratings, and titles from the list of tuples
    movie_ids, ratings = zip(*data)
    movie_titles = [movie_titles_dict.get(movie_id, 'Unknown') for movie_id in movie_ids]

    movie_df = pd.DataFrame({
        'Movie ID': movie_ids,
        'Title': movie_titles,
        'Rating': ratings
    })

    return movie_df

In [58]:
average_recommendations = create_dataframe(average_ratings,movies)
print(average_recommendations)

   Movie ID                                              Title    Rating
0       318                   Shawshank Redemption, The (1994)  4.902840
1       590                          Dances with Wolves (1990)  4.707161
2       356                                Forrest Gump (1994)  4.511185
3      3578                                   Gladiator (2000)  4.409091
4      3996  Crouching Tiger, Hidden Dragon (Wo hu cang lon...  4.397059
5      1272                                      Patton (1970)  4.353177
6      6502                               28 Days Later (2002)  4.341323
7       541                                Blade Runner (1982)  4.339087
8      1204                          Lawrence of Arabia (1962)  4.328664
9       339                     While You Were Sleeping (1995)  4.327889


In [59]:
least_misery_recommendations = create_dataframe(least_misery_ratings,movies)
print(least_misery_recommendations)

   Movie ID                             Title    Rating
0       318  Shawshank Redemption, The (1994)  4.805680
1       590         Dances with Wolves (1990)  4.707161
2      6502              28 Days Later (2002)  4.258403
3       150                  Apollo 13 (1995)  4.188371
4       593  Silence of the Lambs, The (1991)  4.173669
5      1961                   Rain Man (1988)  4.141944
6      1212             Third Man, The (1949)  4.111191
7      1214                      Alien (1979)  4.011924
8       339    While You Were Sleeping (1995)  3.948617
9      3363          American Graffiti (1973)  3.860535
