In [23]:
import pandas as pd
import numpy as np
import math as m
import random as r
from tabulate import tabulate
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import itertools

links = pd.read_csv('ml-latest-small/links.csv')
links.head(5)
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(5)
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(5)
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head(5)
#dropping the timestamp column
ratings = ratings.drop(['timestamp'], axis=1)
#movie and ratings dataset
movie_ratings = pd.merge(ratings, movies, on='movieId')
movie_ratings.head()
#reshaping the data to table based on column values
user_ptable= ratings.pivot(index='userId', columns='movieId', values='rating')
user_ptable.head()


#pearson correlation coefficient
def pearson_correlation(user_a_ratings,user_b_ratings):
    corr,_ = pearsonr(user_a_ratings,user_b_ratings)
    return corr

def user_collaborative_filtering(target_user,p_table,correlationfunction):
    '''
    Gets the most similar users and their correlations to the target user
    Parameters: int target_user -user id in the dataset
                p_table - data as a pivot table
                correlationfunction - the correlation function to be used
    Return: dict similar_users -dictionary of users who have rated similar movies as the target user
    with their ratings.
    '''
    similar_users = {}
    #other users who are not the target user
    for user_b in p_table.index:
        if user_b != target_user:
            # ratings for the target user and user_b
            target_user_ratings = p_table.loc[target_user].dropna()
            user_b_ratings = p_table.loc[user_b].dropna()

            # common rated movies
            common_rated_movies = target_user_ratings.index.intersection(user_b_ratings.index)
            #filter for at least 2  common rated movies
            if len(common_rated_movies) >= 2:
                #filter  ratings to include only common rated movies
                target_user_ratings = target_user_ratings[common_rated_movies]
                user_b_ratings = user_b_ratings[common_rated_movies]
                #check if either contains all the same elements as correlation will be 1 regardless of actual rating
                if len(set(target_user_ratings)) == 1 or len(set(user_b_ratings)) == 1:
                    continue
                similar_users[user_b] = correlationfunction(target_user_ratings,user_b_ratings)
                    
    return similar_users
     
def user_prediction(user_a,item_p,p_table,similarities):
    '''
    Calculates the predicted rating of user `user_a` for item `item_p`.
    Parameters: int user_a - the index of the target user
                int item_p - the index of the unseen movie by target user
                p_table - pivot table of data
                similarities - the dictionary of correlations between target user
                  and other users.
    Return: int prediction - rating of user a for item p
    '''
    user_a_ratings = p_table.loc[user_a]
    mean_usera_ratings = user_a_ratings.mean()
    unseen_item_ratings = p_table.loc[:, item_p].dropna()

    # Get the similarity scores between the target user and other users who have rated the unseen item.
    #relevant_similarities = {}
    predicted_rating = 0
    weighted_difference = 0
    similarity_sum = 0
    for user_b, similarity in similarities.items():
        if user_b != user_a and user_b in unseen_item_ratings.index:
            user_b_ratings = p_table.loc[user_b]
            mean_userb_ratings = user_b_ratings.mean()
            rating_difference = unseen_item_ratings.loc[user_b] - mean_userb_ratings
            weighted_difference += (similarity*rating_difference)
            similarity_sum += abs(similarity)

    if similarity_sum != 0:
        # the prediction as the active user's mean plus the weighted rating differences
        predicted_rating = mean_usera_ratings + (weighted_difference / similarity_sum)
    else:
        predicted_rating = mean_usera_ratings

    return np.clip(predicted_rating,0.5,5)

def get_user_recommendations(user, p_table, correlation_function, prediction_function,top_n = 10):
    '''Function gets the user _recommendations for a particular user using the prediction function
    Returns a dictionary of the movie (key) and the predicted rating(value).
    p table in this case is the data in the said iteration
    '''
    similar_users = user_collaborative_filtering(user, p_table, correlation_function)
    sorted_similar_users =  sorted(similar_users.items(), key=lambda item: item[1],reverse=True)
    #sorted_similar_users = sorted(similar_users.items(), key=operator.itemgetter(1), reverse=True)
    top_similar_users = sorted_similar_users[:top_n]
    top_10_similar_users_dict={}
    for user,similarity in top_similar_users:
        top_10_similar_users_dict[user]=similarity
    user_recommendations = {}
    for movie in p_table.columns:
        if pd.isna(p_table.loc[user, movie]):
            user_recommendations[movie] = prediction_function(user, movie, p_table, top_10_similar_users_dict)
    sorted_user_recommendations =  sorted(user_recommendations.items(), key=lambda item: item[1],reverse=True)
    top_10_user_recommendations = sorted_user_recommendations[:top_n]
    
    return user_recommendations

def group_recommendations(user_recommendations_dict, aggregation_method, top_n = 10):
    '''Function calculates the group_recommendation based on the given aggregation method.
    The aggregation methods are average and least misery method.
    Returns :list(tuple) of the movies and predicted rating based on selected method'''
  
    movie_ratings = {}
    #user_recommendations_list = [user_recommendations]
    for user,recommendations in user_recommendations_dict.items():
        for movie, rating in recommendations.items():
            if movie not in movie_ratings:
                movie_ratings[movie] = []
            movie_ratings[movie].append(rating)
    aggregated_ratings = {}   
    if aggregation_method == 'average':
        aggregated_ratings = {movie: np.mean(ratings) for movie, ratings in movie_ratings.items()}

    elif aggregation_method == 'least misery':
        aggregated_ratings = {movie: np.min(ratings) for movie, ratings in movie_ratings.items()}
        
        
    #group recommendations
    sorted_group_recommendations = sorted(aggregated_ratings.items(), key=lambda item: item[1], reverse=True)
    top_group_recommendations = sorted_group_recommendations[:top_n]
    

    return top_group_recommendations

def create_dataframe(data, movies_df):
    '''Creates a data frame...
     Takes a list(tuple) as parameter and a data frame
     Returns: data frame
     '''
    # DataFrame containing movie information
    movie_titles_dict = movies_df.set_index('movieId')['title'].to_dict()

    # Extract movie IDs, ratings, and titles from the list of tuples
    movie_ids, ratings = zip(*data)
    movie_titles = [movie_titles_dict.get(movie_id, 'Unknown') for movie_id in movie_ids]

    movie_df = pd.DataFrame({
        'Movie ID': movie_ids,
        'Title': movie_titles,
        'Rating': ratings
    })

    return movie_df

def select_users():
    userIds = ratings.userId.unique()
    print(f"Dataset UserIds: {min(userIds)}-{max(userIds)}")
    group_users = [int(x) for x in input("Enter User Ids seperated by space: ").split(" ")]

    print("Getting group recommendations...")
    user_top_preferences = {uid: get_user_recommendations(uid, user_ptable, pearson_correlation, user_prediction,top_n = None) for uid in group_users}
    group_top_preferences = group_recommendations(user_top_preferences, 'average', top_n = None)
    groups_recommendation = create_dataframe(group_top_preferences,movies)
    top_10 = groups_recommendation[:10]

    print(f"Top 10 Recommendations for Users {group_users}:")
    
    
    print(top_10)

    return user_top_preferences, groups_recommendation #group_top_preferences

                                                                                                                                                                             

In [3]:
user_top_preferences, groups_recommendation = select_users()


Dataset UserIds: 1-610
Making Predictions, may take a few minutes...
Top 10 Recommendations for Users [1, 2, 3]:
   Movie ID                                           Title  Rating
0       129                           Pie in the Sky (1996)     5.0
1      4863                           Female Trouble (1975)     5.0
2     25947                       Unfaithfully Yours (1948)     5.0
3     46865                               Little Man (2006)     5.0
4     52279                         Are We Done Yet? (2007)     5.0
5     70946                                  Troll 2 (1990)     5.0
6     72696                                 Old Dogs (2009)     5.0
7     73042  Alvin and the Chipmunks: The Squeakquel (2009)     5.0
8     91671     Alvin and the Chipmunks: Chipwrecked (2011)     5.0
9     93139               Mega Shark vs. Crocosaurus (2010)     5.0


In [13]:
user_top_preferences

{1: {1: 4.000517308651994,
  2: 3.6847616199996134,
  3: 3.62922593499792,
  4: 3.265030780986958,
  5: 3.6385740079717976,
  6: 4.055572518875397,
  7: 3.767522336088764,
  8: 3.50322149228407,
  9: 3.251143720014859,
  10: 3.762940042069131,
  11: 3.9147613260556455,
  12: 3.5279787250748553,
  13: 3.757418913907033,
  14: 4.430825634897643,
  15: 4.128796994520881,
  16: 4.100917437658739,
  17: 4.145058214407336,
  18: 3.718797944916132,
  19: 3.6863178460036066,
  20: 4.144440512798778,
  21: 3.9365555923107656,
  22: 3.5983386827892483,
  23: 3.5690781010675847,
  24: 3.810960044573679,
  25: 4.064704369792487,
  26: 3.8633328042476256,
  27: 3.5387458075960274,
  28: 4.305292777985403,
  30: 4.74196699670242,
  31: 3.893631696326978,
  32: 4.17459901961741,
  34: 3.9485977140424104,
  36: 4.007116221432249,
  38: 3.3344041865418035,
  39: 3.6443297286746237,
  40: 3.524577689709935,
  41: 4.197695358697999,
  42: 3.345036051857035,
  43: 3.8041947097318376,
  44: 3.5323659994063

In [14]:
96	1236	Trust (1990)	4.312483

SyntaxError: invalid syntax (1840643334.py, line 1)

In [None]:
groups_recommendation[:50]

Unnamed: 0,Movie ID,Title,Rating
0,129,Pie in the Sky (1996),5.0
1,4863,Female Trouble (1975),5.0
2,25947,Unfaithfully Yours (1948),5.0
3,46865,Little Man (2006),5.0
4,52279,Are We Done Yet? (2007),5.0
5,70946,Troll 2 (1990),5.0
6,72696,Old Dogs (2009),5.0
7,73042,Alvin and the Chipmunks: The Squeakquel (2009),5.0
8,91671,Alvin and the Chipmunks: Chipwrecked (2011),5.0
9,93139,Mega Shark vs. Crocosaurus (2010),5.0


In [None]:
user_preferences = pd.DataFrame(user_top_preferences).T.fillna(0)
user_preferences



Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1378,1653,1748,1960,2359,2707,3174,3363,3386,3471
1,4.000517,3.684762,3.629226,3.265031,3.638574,4.055573,3.767522,3.503221,3.251144,3.76294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.622591,3.375701,2.016472,3.481769,0.0,3.000386,3.720904,4.759033,3.842991,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.931656,3.311053,2.89704,3.432619,3.78459,3.254855,3.234695,3.162706,2.992756,3.121538,...,3.054118,2.887768,2.903644,2.991524,2.700291,3.519512,2.748822,3.214938,2.838998,2.924468


In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
movie_name = movies.loc[movies['movieId'] == 1310, 'title'].values
movie_name

array(['Hype! (1996)'], dtype=object)

In [None]:
movie_id = 3
user_preferences = pd.DataFrame(user_top_preferences).T.fillna(0)
if movie_id not in user_preferences.columns:
        print(f"Because the movie {movie_id} was not recommended to any of the users!")
else:
    print('n')

n


In [None]:
group_recommendation_genres = pd.merge(groups_recommendation, movies, left_on='Movie ID', right_on='movieId')
           
#get all genres
unique_genres = movies['genres'].str.split('|').explode().unique()
recommended_genres = group_recommendation_genres[:10]['genres'].str.split('|').explode().unique()
recommended_genres

array(['Comedy', 'Romance', 'Crime', 'Fantasy', 'Horror', 'Animation',
       'Children', 'Musical', 'Action', 'Adventure'], dtype=object)

In [19]:
requested_movie_rating = groups_recommendation[groups_recommendation['Title'] == 'Toy Story (1995)']['Rating'].iloc[0]
recommended_nth_movie = groups_recommendation.iloc[5 - 1]['Title']
recommended_nth_movie_rating = groups_recommendation.iloc[5- 1]['Rating']

In [22]:
if recommended_nth_movie_rating >= requested_movie_rating:
    how_much_is_better = (recommended_nth_movie_rating - requested_movie_rating) / requested_movie_rating
    print(f"Because our th suggestion, {recommended_nth_movie}, is "
            f"{(100 * how_much_is_better):.2f}% better than.")
else:
    how_much_is_better = (requested_movie_rating - recommended_nth_movie_rating) / recommended_nth_movie_rating
    print(f"Because rating of nth  is {requested_movie_rating:.2f} and it is "
            f"{(100 * how_much_is_better):.2f}% better than {recommended_nth_movie} with rating "
            f"{recommended_nth_movie_rating:.2f}.")

Because our th suggestion, Are We Done Yet? (2007), is 44.25% better than.


In [20]:
requested_movie_rating

3.466086779327387

In [24]:
def why_not_questions(user_top_preferences=None, groups_recommendation=None):
    
    while True:
        if user_top_preferences is None or groups_recommendation is None:
            user_top_preferences, groups_recommendation= select_users()

        top_10 = groups_recommendation[:10]

        print("\n \n ------ ")
        print("Select Question")
        print("1. Why Not a specific movie?")
        print("2. Why Not a specific genre?")
        print("3. Why Not a specific movie is not n-th?")
        print("4. Quit")
        user_input = input()
        print(f'Your choice: {user_input}')

        if user_input == "1":

            movie_name = input("Enter the name of the movie: ")

            # check if it is not included in dataset
            if movies[movies.title == movie_name].empty:
                print(f"Because the movie {movie_name} is not included in our dataset!")
                continue

            index = groups_recommendation.index[groups_recommendation['Title'] ==movie_name].tolist()

            # Get the rating value of the specified title
            movie_rating = groups_recommendation.loc[index[0], 'Rating']
            movie_rank = index[0]+1

            # check if it has low rank in group recommendations
            if movie_rank <= 10:
                print(f"The movie is already in the top 10 recommendations!")
                continue

            if movie_rank > 10 and movie_rank < 20:
                print(f"Because the movie {movie_name} is ranked as {movie_rank}th item in the recommendations "
                      f"with a rating {movie_rating:.2f}!")
                continue

            # check if it is in the top preferences of the users 
            filtered_movies = movies[movies['title'] == movie_name]
            movie_id = filtered_movies.iloc[0]['movieId']#id of the movie

            #top 50 recommended movies to the users
            top_100_movies = []
            for user_ratings in user_top_preferences.values():
                sorted_movies = sorted(user_ratings.items(), key=lambda x: x[1], reverse=True)
                top_100_movies.extend([movie_id for movie_id, _ in sorted_movies[:100]])

            if movie_id not in top_100_movies:
               print(f"Because the movie {movie_name} is not a top recommendation to any of the users")
               continue

            # if in the top 100 recommendations its rating is compared to check why not in top 10
            recommended_max_rate = top_10['Rating'].max()
            recommended_min_rate = top_10.rating.min()
            print(f"Because movie {movie_name} has a low rating: {movie_rating:.2f}, "
                  f"but rating range of  top recommended movies is between "
                  f"[{recommended_min_rate:.2f}, {recommended_max_rate:.2f}].")
            continue

        if user_input == "2":
            group_recommendation_genres = pd.merge(groups_recommendation, movies, left_on='Movie ID', right_on='movieId')
           
            #get all genres
            unique_genres = movies['genres'].str.split('|').explode().unique()
            recommended_genres = group_recommendation_genres[:10]['genres'].str.split('|').explode().unique()
            genre_name = input("Enter Movie Genre: ")

            # check if it is not a genre in the dataset
            if genre_name not in unique_genres:
                print(f"Because there is no movie of the genre {genre_name} in our movie database yet!")
                continue

            # check if no movie of the said genre in recommendations
            if genre_name in recommended_genres:
                print(f"There is a movie of the genre {genre_name} in the recommended movies!")
                continue

            # filter movies of the interest genre
            total_movies = group_recommendation_genres['Movie ID'].nunique()
            genre_movies_count = group_recommendation_genres[group_recommendation_genres['genres'].str.contains(genre_name)]['Movie ID'].nunique()

            # calculate the percentage interest for the interest_genre
            percentage_interest = (genre_movies_count / total_movies) * 100

            print(f"Because interest rate of the group for the {genre_name} genre is {percentage_interest:.2f}%")

            continue

        if user_input == "3":
            movie_name = input("Enter Movie Name: ")
            # check if requested rank is wrong
            movie_index= int(input(f"Why {movie_name}is not in which rank? "))

            #check if not in top 10 recommendations
            if movie_name not in top_10['Title'].values:
                print(f"The movie {movie_name} is not included in the top 10 recommended movies!")
                continue

            # check if requested rank is wrong
            if movie_index > 10:
                print(f"Wrong! our recommendation list size is 10 and you requested for {movie_index}th item!")
             
            requested_movie_rating = groups_recommendation[groups_recommendation['Title'] == movie_name]['Rating'].iloc[0]
            recommended_nth_movie = groups_recommendation.iloc[movie_index - 1]['Title']
            recommended_nth_movie_rating = groups_recommendation.iloc[movie_index - 1]['Rating']

            if recommended_nth_movie_rating >= requested_movie_rating:
                percentage_difference = ((recommended_nth_movie_rating - requested_movie_rating) / requested_movie_rating)*100
                print(f"Because the {movie_index}th recommended movie , '{recommended_nth_movie}', has a higher rating "
                      f" than '{movie_name}' by {percentage_difference:.2f}%.")
            else:
                percentage_difference = ((requested_movie_rating - recommended_nth_movie_rating) / recommended_nth_movie_rating)*100
                print(f"Because the {movie_index}th movie suggestion, '{recommended_nth_movie}' is rated lower "
                      f"than {movie_name} by by {percentage_difference:.2f}%. ")
            continue
     
        if user_input == "4":
            return


In [25]:
why_not_questions()

Dataset UserIds: 1-610
Getting group recommendations...
Top 10 Recommendations for Users [1, 2, 3]:
   Movie ID                                           Title  Rating
0       129                           Pie in the Sky (1996)     5.0
1      4863                           Female Trouble (1975)     5.0
2     25947                       Unfaithfully Yours (1948)     5.0
3     46865                               Little Man (2006)     5.0
4     52279                         Are We Done Yet? (2007)     5.0
5     70946                                  Troll 2 (1990)     5.0
6     72696                                 Old Dogs (2009)     5.0
7     73042  Alvin and the Chipmunks: The Squeakquel (2009)     5.0
8     91671     Alvin and the Chipmunks: Chipwrecked (2011)     5.0
9     93139               Mega Shark vs. Crocosaurus (2010)     5.0

 
 ------ 
Select Question
1. Why Not a specific movie?
2. Why Not a specific genre?
3. Why Not a specific movie is not n-th?
4. Quit
Your choice: 1
Be