In [1]:
# importing libraries
import numpy as np
import pandas as pd
import math
import statistics as stat
from scipy.stats import pearsonr, kendalltau
import matplotlib.pyplot as plt

# Reading the Datasets

In [2]:
# Load the ratings and the movie metadata tables from the local
# "ml-latest-small" folder (paths are relative to the working directory).
ratings = pd.read_csv(r"./ml-latest-small/ratings.csv")
movies = pd.read_csv(r"./ml-latest-small/movies.csv")

In [3]:
# Drop the column(s) that are irrelevant for the recommender.
# errors="ignore" keeps this cell idempotent: re-running it after "timestamp"
# has already been removed no longer raises a KeyError.
ratings = ratings.drop(columns=["timestamp"], errors="ignore")
print(ratings.head(), "\n", movies.head())

   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0 
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


# Finding Similar Users & Recommended movies with Scores

In [4]:
# Build the rating matrix of one user from the notebook-level ratings table
def create_userX_matrix(userId):
    """Return the rows of the global ``ratings`` frame whose ``userId`` matches.

    Parameters
    ----------
    userId
        Identifier compared against the ``userId`` column of ``ratings``.

    Returns
    -------
    pandas.DataFrame
        A new frame (``ratings`` itself is left untouched) holding the
        matching rows.
    """
    # `@userId` makes query() read the local Python variable.
    return ratings.query('userId == @userId')


# Build, for one user, the per-user groups of ratings on the movies that user has rated
def user_subset_common_movie(userId):
    """Group everybody's ratings on the movies rated by ``userId``.

    Parameters
    ----------
    userId
        Identifier of the user whose rated movies define the subset.

    Returns
    -------
    tuple
        ``(userSubsetGroup, userX_matrix)`` where ``userSubsetGroup`` is a
        list of ``(userId, ratings_of_that_user)`` pairs ordered from the
        most to the fewest movies in common, and ``userX_matrix`` is the
        rating frame of ``userId`` itself.
    """
    userX_matrix = create_userX_matrix(userId)
    commonRatings = ratings[ratings['movieId'].isin(userX_matrix['movieId'].tolist())]

    # Group by the plain column name rather than a one-element list: with a
    # list, recent pandas versions yield 1-tuples such as (15,) as group keys,
    # which would later end up as (unusable) user ids in the similarity dict.
    groupedByUser = commonRatings.groupby('userId')

    # Order by the number of common movies. The previous key
    # (`len(g) and len(g) > 50`) evaluated to a boolean, so it only separated
    # "more than 50" from the rest; sorting by the size itself keeps that
    # separation and additionally orders the users inside each class.
    userSubsetGroup = sorted(groupedByUser, key=lambda pair: len(pair[1]), reverse=True)
    return userSubsetGroup, userX_matrix

# Pearson correlation between the selected user and every candidate user,
# used to find the most similar users
def pearsonCorr(inputMovies, similarUsersGroup):
    """Score each candidate user by Pearson correlation with the selected user.

    Parameters
    ----------
    inputMovies : pandas.DataFrame
        Ratings of the selected user; the ``movieId`` and ``rating`` columns
        are read.
    similarUsersGroup : iterable of (name, pandas.DataFrame)
        One entry per candidate user; each frame needs ``movieId`` and
        ``rating`` columns.

    Returns
    -------
    dict
        ``name -> correlation`` ordered from the highest to the lowest score.
        The score is 0 when the correlation is undefined (no movie in common,
        or one of the two rating vectors has zero variance).
    """
    # Loop-invariant: the selected user's ratings are prepared once.
    selectedRatings = inputMovies[['movieId', 'rating']].sort_values(by='movieId')

    pearsonCorrelationDict = {}
    for name, group in similarUsersGroup:
        # Pair the two users' ratings by movieId. Pairing by position (zip of
        # two separately filtered lists) silently mis-pairs ratings as soon as
        # the group holds a movie the selected user has not rated.
        paired = selectedRatings.merge(
            group[['movieId', 'rating']],
            on='movieId',
            how='inner',
            suffixes=('_selected', '_similar'),
        )
        if paired.empty:
            pearsonCorrelationDict[name] = 0
            continue

        selected = paired['rating_selected'].to_numpy(dtype=float)
        similar = paired['rating_similar'].to_numpy(dtype=float)
        selectedDev = selected - selected.mean()
        similarDev = similar - similar.mean()

        simXX = float(np.dot(selectedDev, selectedDev))
        simYY = float(np.dot(similarDev, similarDev))
        simXY = float(np.dot(selectedDev, similarDev))

        if simXX != 0 and simYY != 0:
            pearsonCorrelationDict[name] = simXY / np.sqrt(simXX * simYY)
        else:
            # Zero variance on either side: the correlation is undefined.
            pearsonCorrelationDict[name] = 0

    maxSimilarUser = dict(sorted(pearsonCorrelationDict.items(), key=lambda item: item[1], reverse=True))
    return maxSimilarUser

In [5]:
# Turn the similarity scores into scored movie recommendations for one user
def produceRec (inputMovies, pearsonCorrelationDict):
    """Score every movie rated by the users in ``pearsonCorrelationDict``.

    Each rating of a listed user is mean-centred on that user's average
    rating, weighted by the user's similarity score and summed per movie.
    The sum is divided by the sum of all similarity scores and shifted by the
    mean rating of ``inputMovies``.

    NOTE(review): the divisor is the total of every similarity score in the
    dict (negative ones included), not only of the users who rated the movie
    in question -- confirm this is the intended normalisation.

    Parameters
    ----------
    inputMovies : pandas.DataFrame
        Ratings of the target user; only the ``rating`` column is read.
    pearsonCorrelationDict : dict
        ``userId -> similarity score``.

    Returns
    -------
    pandas.DataFrame
        Columns ``scores`` and ``movieId``, ordered by descending score and
        indexed 1..n. Ratings are taken from the global ``ratings`` frame.
    """
    # One row per listed user: similarity score + userId on a fresh 0..n-1 index.
    neighbours = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    neighbours.columns = ['similarityScore']
    neighbours['userId'] = neighbours.index
    neighbours = neighbours.reset_index(drop=True)
    topSimilarUsers = neighbours.sort_values(by='similarityScore', ascending=False)

    # Attach every rating given by those users.
    neighbourRatings = topSimilarUsers.merge(ratings, on='userId', how='inner')

    # Average rating of each listed user, brought back onto the rating rows.
    perUserMean = neighbourRatings.groupby('userId')[['rating']].mean()
    perUserMean = perUserMean.rename(columns={'rating': 'avgRating'})
    perUserMean['userId'] = perUserMean.index
    perUserMean = perUserMean.reset_index(drop=True)
    neighbourRatings = neighbourRatings.merge(perUserMean, on='userId', how='inner')

    # Similarity-weighted, mean-centred rating, summed per movie.
    centredRating = neighbourRatings['rating'] - neighbourRatings['avgRating']
    neighbourRatings['weightedRatingScore'] = neighbourRatings['similarityScore'] * centredRating
    perMovie = neighbourRatings.groupby('movieId')[['weightedRatingScore']].sum()
    perMovie.columns = ['sum_weightedRatingScore']

    meanRa = inputMovies['rating'].mean()
    similarityTotal = topSimilarUsers['similarityScore'].sum()
    predicted = meanRa + (perMovie['sum_weightedRatingScore'] / similarityTotal)

    recommendation_data = predicted.to_frame(name='scores')
    recommendation_data['movieId'] = recommendation_data.index
    recommendation_data = recommendation_data.sort_values(by='scores', ascending=False)
    recommendation_data.index = list(range(1, len(recommendation_data) + 1))
    return recommendation_data

In [6]:
# Take an example user A (userId = 15) and run the whole pipeline:
# candidate groups -> similarity scores -> scored recommendations with movie metadata
userA = 15
userSubsetGroupA, userAMatrix = user_subset_common_movie(userA)
pearsonCorrelationDictA = pearsonCorr(userAMatrix, userSubsetGroupA)
recMovieA = produceRec(userAMatrix, pearsonCorrelationDictA).merge(movies, on="movieId", how="left")
pearsonCorrelationDictA.keys()

dict_keys([15, 92, 138, 162, 188, 257, 278, 281, 394, 485, 501, 544, 545, 568, 529, 218, 333, 566, 473, 192, 384, 44, 90, 74, 134, 130, 283, 345, 358, 422, 271, 250, 583, 423, 446, 32, 120, 353, 108, 117, 467, 342, 179, 410, 170, 337, 11, 181, 284, 455, 121, 297, 546, 102, 598, 13, 450, 191, 94, 602, 276, 436, 54, 371, 592, 579, 412, 323, 39, 513, 470, 404, 48, 314, 176, 373, 155, 225, 106, 440, 82, 290, 421, 591, 38, 267, 385, 183, 17, 58, 491, 477, 604, 173, 386, 244, 321, 565, 519, 555, 304, 235, 81, 541, 262, 310, 507, 285, 508, 178, 540, 157, 234, 57, 409, 472, 109, 4, 476, 366, 411, 133, 486, 379, 226, 372, 93, 27, 609, 350, 402, 233, 464, 25, 339, 33, 208, 368, 510, 480, 349, 294, 572, 300, 514, 40, 454, 554, 302, 107, 374, 418, 222, 593, 47, 195, 115, 99, 559, 475, 378, 312, 419, 597, 552, 608, 326, 255, 512, 396, 599, 84, 414, 511, 96, 217, 408, 590, 503, 303, 420, 469, 287, 351, 272, 119, 76, 184, 65, 190, 551, 201, 204, 193, 365, 95, 242, 465, 433, 164, 185, 588, 140, 376, 5

In [7]:
# Now take two more users, User B (userId = 134) and User C (userId = 384),
# which have comparatively high similarity scores against User A
def _recommend_for(user_id):
    """Run groups -> similarity -> recommendations (with movie metadata) for one user."""
    subset_group, user_matrix = user_subset_common_movie(user_id)
    correlations = pearsonCorr(user_matrix, subset_group)
    recommendations = produceRec(user_matrix, correlations).merge(movies, on="movieId", how="left")
    return subset_group, user_matrix, correlations, recommendations


userB = 134
userSubsetGroupB, userBMatrix, pearsonCorrelationDictB, recMovieB = _recommend_for(userB)

userC = 384
userSubsetGroupC, userCMatrix, pearsonCorrelationDictC, recMovieC = _recommend_for(userC)

In [8]:
# Inner-join the recommendation lists of User A and User C on movieId, i.e. keep
# only the movies present in both lists (the *_x columns come from User A,
# the *_y columns from User C)
mergedInner_UserA_UserC = recMovieA.merge(recMovieC, on="movieId", how="inner")
mergedInner_UserA_UserC.head()

Unnamed: 0,scores_x,movieId,title_x,genres_x,scores_y,title_y,genres_y
0,4.25591,318,"Shawshank Redemption, The (1994)",Crime|Drama,3.700839,"Shawshank Redemption, The (1994)",Crime|Drama
1,4.200233,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3.509482,Forrest Gump (1994),Comedy|Drama|Romance|War
2,4.100066,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,3.654379,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
3,4.086499,527,Schindler's List (1993),Drama|War,3.455397,Schindler's List (1993),Drama|War
4,3.943317,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,3.466697,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi


In [9]:
# Average-aggregation group recommendation for the group (user A, user B, user C)
def produce_rec_average_agg(recMovieA, recMovieB, recMovieC, movieInfo=None):
    """Average the three users' scores over the movies recommended to all of them.

    Parameters
    ----------
    recMovieA, recMovieB, recMovieC : pandas.DataFrame
        Per-user recommendation lists; the ``movieId`` and ``scores`` columns
        are read.
    movieInfo : pandas.DataFrame, optional
        Movie metadata keyed by ``movieId`` that is joined onto the result.
        Defaults to the notebook-level ``movies`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per common movie with ``movieId``, the mean ``scores`` and the
        metadata columns, ordered by descending score on a 0..n-1 index.
    """
    if movieInfo is None:
        movieInfo = movies

    memberRecs = [recMovieA, recMovieB, recMovieC]

    # Movies that appear in every member's recommendation list.
    commonRecMov = set.intersection(*(set(rec['movieId'].tolist()) for rec in memberRecs))

    # Keep only the numeric columns needed for the average: taking the mean of
    # the text columns (title, genres) raises on recent pandas versions.
    stacked = pd.concat(
        [rec.loc[rec['movieId'].isin(commonRecMov), ['movieId', 'scores']] for rec in memberRecs]
    )
    GrAv = (
        stacked.groupby('movieId', as_index=False)['scores']
        .mean()
        .sort_values(by='scores', ascending=False)
    )

    # The merge hands back a fresh 0..n-1 index, so no index is set beforehand.
    groupAv = GrAv.merge(movieInfo, on='movieId', how='left')
    return groupAv

In [10]:
# Build the group recommendation with the average-aggregation function
# and show its first 20 rows
groupRec = produce_rec_average_agg(recMovieA, recMovieB, recMovieC)
groupRec.head(20)

Unnamed: 0,movieId,scores,title,genres
0,296,4.042479,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,318,4.024687,"Shawshank Redemption, The (1994)",Crime|Drama
2,356,3.923002,Forrest Gump (1994),Comedy|Drama|Romance|War
3,593,3.914601,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,260,3.888213,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
5,527,3.842689,Schindler's List (1993),Drama|War
6,110,3.835782,Braveheart (1995),Action|Drama|War
7,1196,3.762667,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,1210,3.757107,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
9,50,3.724515,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [11]:
# Collect the movieIds of the top 20 recommendations of each user (and of the group) as sets
a = set(recMovieA["movieId"].head(20).tolist()) # user A
b = set(recMovieB["movieId"].head(20).tolist()) # user B
c = set(recMovieC["movieId"].head(20).tolist()) # user C
g = set(groupRec["movieId"].head(20).tolist()) # group recommendation

# movies in user A's top 20 that are not available in the group's top 20
x = a - g 
# movies in user B's top 20 that are not available in the group's top 20
y = b - g
# movies in user C's top 20 that are not available in the group's top 20
z = c - g
print("Movies only recommended in User A list: ", x, "\nMovies only recommended in User B list: ", y, "\nMovies only recommended in User C list: ", z)

Movies only recommended in User A list:  {480, 3578, 1270} 
Movies only recommended in User B list:  {1200, 3949, 590, 47} 
Movies only recommended in User C list:  {4226, 3996, 1197}


In [12]:
# Return all genre occurrences and the unique genres of a recommended movie list
def get_genres(rec):
    """Split the ``genres`` column of ``rec`` into individual genre names.

    Parameters
    ----------
    rec : pandas.DataFrame
        Recommendation list with a ``genres`` column of ``"|"``-separated
        genre names.

    Returns
    -------
    tuple of (list, list)
        ``all_genres`` holds one entry per genre occurrence, in row order;
        ``unique_genres`` holds each genre once, in order of first appearance.
    """
    all_genres = [
        genre
        for genre_field in rec["genres"].tolist()
        for genre in genre_field.split("|")
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order. The order of
    # list(set(...)) changes between interpreter runs for strings, which made
    # every downstream tie-break differ from one kernel restart to the next.
    unique_genres = list(dict.fromkeys(all_genres))
    return all_genres, unique_genres

In [13]:
# Share of each unique genre among all genre occurrences of a recommended movie list
def genres_proportion(all_genres, unique_g):
    """Return ``genre -> share`` ordered from the largest to the smallest share.

    Parameters
    ----------
    all_genres : list
        Every genre occurrence (repetitions included).
    unique_g : iterable
        The genres to report on.

    Returns
    -------
    dict
        For each genre in ``unique_g``, its count in ``all_genres`` divided by
        ``len(all_genres)``, rounded to 3 decimals; keys are ordered by
        descending share.
    """
    total = len(all_genres)
    shares = {genre: round(all_genres.count(genre) / total, 3) for genre in unique_g}
    by_share_desc = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
    return dict(by_share_desc)

In [14]:
# Build the dataframe of genres with their average score over a recommended movie list
def genres_rank(rec):
    """Average the (rounded) movie scores per genre.

    Parameters
    ----------
    rec : pandas.DataFrame
        Recommendation list; the ``genres`` (``"|"``-separated names) and
        ``scores`` columns are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``genres`` and ``average_score``, ordered by descending
        average; genres with equal averages keep their order of first
        appearance in ``rec``.
    """
    # Gather the scores in a single pass over the rows. Looking every movie up
    # again with a boolean mask on the frame made the function quadratic in the
    # number of movies, and converting the resulting one-element Series with
    # float() is deprecated in recent pandas.
    scores_by_genre = {}
    for genre_field, score in zip(rec["genres"].tolist(), rec["scores"].tolist()):
        rounded_score = round(float(score), 4)
        for genre in genre_field.split("|"):
            scores_by_genre.setdefault(genre, []).append(rounded_score)

    averages = {genre: np.average(values) for genre, values in scores_by_genre.items()}
    ranked = sorted(averages.items(), key=lambda item: item[1], reverse=True)
    df = pd.DataFrame(ranked, columns = ['genres','average_score'])
    return df

In [15]:
# Genre statistics for the group recommendation:
#  - dictAgregate: share of each genre among the top 20 group recommendations
#  - genresRankGR: average score per genre, computed on the whole group list
top20_total_g, unique_g_top20 = get_genres(groupRec.head(20))
dictAgregate = genres_proportion(top20_total_g, unique_g_top20 ) # ignores the prediction score of the movie
genresRankGR = genres_rank(groupRec) # takes the prediction score of the movie into account
genresRankGR # the dataframe of each unique genre with its score

# NOTE: this cell may take some time to generate the result

Unnamed: 0,genres,average_score
0,War,3.379477
1,Film-Noir,3.379034
2,Crime,3.373836
3,Drama,3.372346
4,Mystery,3.372011
5,Western,3.370355
6,Musical,3.370298
7,Documentary,3.370214
8,Animation,3.369878
9,IMAX,3.3696


In [16]:
# Genre shares (dictAGen) and average score per genre (userA_genre_score)
# over the top 20 recommended movies of user A
userAGen, uniqueUserAGen = get_genres(recMovieA.head(20))
dictAGen = genres_proportion(userAGen, uniqueUserAGen )
userA_genre_score = genres_rank(recMovieA.head(20))
userA_genre_score

Unnamed: 0,genres,average_score
0,Romance,4.2002
1,War,4.052
2,Drama,3.900844
3,Comedy,3.868875
4,Sci-Fi,3.844686
5,Action,3.8309
6,Crime,3.826414
7,Adventure,3.816386
8,Thriller,3.762271
9,Horror,3.7614


In [17]:
# Genre shares (dictBGen) and average score per genre (userB_genre_score)
# over the top 20 recommended movies of user B
userBGen, uniqueUserBGen = get_genres(recMovieB.head(20))
dictBGen = genres_proportion(userBGen, uniqueUserBGen )
userB_genre_score = genres_rank(recMovieB.head(20))
userB_genre_score

Unnamed: 0,genres,average_score
0,Horror,4.1137
1,War,4.0678
2,Romance,4.0593
3,Crime,4.025929
4,Comedy,4.019
5,Drama,4.000409
6,Thriller,3.985229
7,Action,3.938956
8,Western,3.9
9,Sci-Fi,3.89184


In [18]:
# Genre shares (dictCGen) and average score per genre (userC_genre_score)
# over the top 20 recommended movies of user C
userCGen, uniqueUserCGen = get_genres(recMovieC.head(20))
dictCGen = genres_proportion(userCGen, uniqueUserCGen )
userC_genre_score = genres_rank(recMovieC.head(20))
userC_genre_score

Unnamed: 0,genres,average_score
0,Horror,3.6344
1,Comedy,3.60115
2,Crime,3.57415
3,Thriller,3.532662
4,Drama,3.530567
5,Sci-Fi,3.49716
6,Adventure,3.4882
7,Action,3.43978
8,War,3.437667
9,Romance,3.425167


# Granularity-Atomic Case (Question & Explanation)
# Q1. Why is "Memento" (movieId: 4226) not in the recommended list?

In [19]:
# Explain an atomic "why not" case, e.g. movieId: 4226 & title: Memento
def atomicCase(mId, recMovieA, recMovieB, recMovieC, groupRec, k=20):
    """Explain why movie ``mId`` is missing from the group's top ``k``.

    Parameters
    ----------
    mId
        movieId of the movie in question.
    recMovieA, recMovieB, recMovieC : pandas.DataFrame
        Per-user recommendation lists with ``movieId`` and ``title`` columns,
        ordered from best to worst.
    groupRec : pandas.DataFrame
        Group recommendation list with ``movieId`` and ``title`` columns,
        ordered from best to worst.
    k : int, optional
        Size of the group top list; rows at 0-based position ``k`` or later
        count as outside of it. Defaults to 20.

    Returns
    -------
    list of str
        The explanation sentences (a list is returned on every path).
    """
    def position_of(rec):
        """0-based row position of mId in rec, or None when it is absent."""
        # `mId in rec.movieId` would test the index labels of the Series, not
        # its values, so membership is checked on the values explicitly.
        movie_ids = rec["movieId"].tolist()
        return movie_ids.index(mId) if mId in movie_ids else None

    e = []
    userRecs = (("A", recMovieA), ("B", recMovieB), ("C", recMovieC))
    userPositions = [(label, position_of(rec)) for label, rec in userRecs]
    groupPosition = position_of(groupRec)

    if groupPosition is not None:
        movieName = groupRec["title"].iloc[groupPosition]
        if groupPosition >= k:
            e.append(f"The Rank for the Movie {movieName} has a quite low rankings")
            e.append(f"The Reason for the k Value Low because the recommendation made for the {movieName}")
            # Report where the movie sits in each user's own list.
            for label, position in userPositions:
                if position is not None:
                    e.append(f"{movieName} is present in the {position} index position of user {label} list")
        else:
            e.append(f"{movieName} is already within the top {k} of the group recommendation")
        return e

    # Not in the group list: take the title from any user list that has the
    # movie, falling back to the id itself when nobody has it.
    movieName = str(mId)
    for (label, rec), (_, position) in zip(userRecs, userPositions):
        if position is not None:
            movieName = rec["title"].iloc[position]
            break

    boolA, boolB, boolC = (position is not None for _, position in userPositions)
    if boolA and boolB and boolC:
        e.append(f'{movieName} is recommended to every user but is missing from the group list')
    elif boolA and boolB:
        e.append(f'{movieName} is not recommended by User C')
    elif boolB and boolC:
        e.append(f'{movieName} is not recommended by User A')
    elif boolA and boolC:
        e.append(f'{movieName} is not recommended by User B')
    elif boolA:
        e.append(f'{movieName} is not recommended by User B and User C')
    elif boolB:
        e.append(f'{movieName} is not recommended by User A and User C')
    elif boolC:
        e.append(f'{movieName} is not recommended by User A and User B')
    else:
        e.append(f"Movie {movieName} does not exist in any user's recommendation list")
    return e
    

In [20]:
# Explanation of why "Memento" (movieId: 4226) is not in the recommended list
granularAtomic = atomicCase(4226,  recMovieA, recMovieB, recMovieC, groupRec)
granularAtomic

['The Rank for the Movie Memento (2000) has a quite low rankings',
 'The Reason for the k Value Low because the recommendation made for the Memento (2000)',
 'Memento (2000) is present in the 9686 index position of user A list',
 'Memento (2000) is present in the 66 index position of user B list',
 'Memento (2000) is present in the 17 index position of user B list']

# Granularity-Group Case (Question & Explanation)
# Q2. Why not Musical movies?

In [21]:
# Helpers for explaining group cases (e.g. genre "Musical"):
# genre ranking of a single recommendation list
def general_genres_rating(rec):
    """Return the per-genre average score table of ``rec``.

    Thin wrapper around ``genres_rank``. The genre list and genre proportions
    that used to be computed here were never used, so they are no longer
    calculated.

    Parameters
    ----------
    rec : pandas.DataFrame
        Recommendation list passed through to ``genres_rank``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``genres_rank(rec)`` returns.
    """
    return genres_rank(rec)

def top_k(grec, k, *userRecs):
    """Genre ranking of the group list and of every user list, cut at ``k``.

    Parameters
    ----------
    grec : pandas.DataFrame
        Group recommendation list.
    k : int or "all"
        Number of leading rows to keep from every list; the string ``"all"``
        keeps the lists unchanged.
    *userRecs : pandas.DataFrame
        Any number of per-user recommendation lists. They are labelled
        ``userA``, ``userB``, ``userC``, ... in the order given.

    Returns
    -------
    dict
        ``"group"`` / ``"user<letter>"`` -> result of
        ``general_genres_rating`` for that (possibly truncated) list.
    """
    def trim(rec):
        return rec if k == "all" else rec.head(k)

    def label_for(position):
        # Letters for the first 26 users, 1-based numbers afterwards.
        return chr(ord("A") + position) if position < 26 else str(position + 1)

    allRecs = {"group": trim(grec)}
    for position, userRec in enumerate(userRecs):
        allRecs[f"user{label_for(position)}"] = trim(userRec)

    # produce the genre rating for each recommendation list
    listOfGenreAvg = {key: general_genres_rating(val) for key, val in allRecs.items()}
    return listOfGenreAvg

def wn_group(genres, grec, *userRecs):
    """Explain the absence of a genre from the top 20 and top 40 lists.

    Parameters
    ----------
    genres : str
        The genre in question.
    grec : pandas.DataFrame
        Group recommendation list.
    *userRecs : pandas.DataFrame
        Per-user recommendation lists, forwarded to ``top_k``.

    Returns
    -------
    list of str
        One sentence per list (group and users) whose genre table, as produced
        by ``top_k``, does not contain ``genres``; first for the top 20, then
        for the top 40.
    """
    explanationsList = []
    for k in (20, 40):
        genreTables = top_k(grec, k, *userRecs)
        # keep only the lists in which the genre in question does not appear
        explanationsList.extend(
            f'{genres} is not in {key} top{k} reccomendation '
            for key, table in genreTables.items()
            if genres not in table["genres"].tolist()
        )
    return explanationsList

In [22]:
# Explanation of why there are no Musical movies: list every top-20 / top-40
# recommendation list (group and users) in which the genre does not appear
granularGroup = wn_group("Musical", groupRec, recMovieA, recMovieB, recMovieC )
granularGroup

['Musical is not in group top20 reccomendation ',
 'Musical is not in userA top20 reccomendation ',
 'Musical is not in userB top20 reccomendation ',
 'Musical is not in userC top20 reccomendation ',
 'Musical is not in group top40 reccomendation ',
 'Musical is not in userA top40 reccomendation ',
 'Musical is not in userC top40 reccomendation ']

# Position Absenteeism (Question & Explanation)
# Q3. Why is "Matrix, The (1999)" not ranked first?

In [23]:
# Position absenteeism, e.g. for movieId: 2571 & title: Matrix, The (1999):
# average rank position of a movie's genres in the genre ranking table
def avgGenresRanking(m_Id,groupRec,genresRankGR):
    """Average the rank positions of a movie's genres.

    Parameters
    ----------
    m_Id
        movieId looked up in ``groupRec``.
    groupRec : pandas.DataFrame
        Recommendation list with ``movieId`` and ``genres``
        (``"|"``-separated names) columns.
    genresRankGR : pandas.DataFrame
        Genre table with a ``genres`` column; the index label of a genre's row
        is used as that genre's position.

    Returns
    -------
    tuple
        ``(mean position of the movie's genres, number of genres of the movie)``.

    Raises
    ------
    IndexError
        If ``m_Id`` is not in ``groupRec`` or one of its genres is not in
        ``genresRankGR``.
    """
    # Read the genres by column name. Picking element [3] of the row's values
    # only worked while the columns happened to be in one particular order.
    genresOfMovie = groupRec.loc[groupRec["movieId"] == m_Id, "genres"]
    genresSeperateList = genresOfMovie.iloc[0].split("|")

    genrePositions = [
        genresRankGR[genresRankGR["genres"] == genre].index.values[0]
        for genre in genresSeperateList
    ]
    meanAvgGenresScore = np.mean(genrePositions)
    noOfGenresPerMovie = len(genrePositions)
    return meanAvgGenresScore, noOfGenresPerMovie

In [24]:
# For every movie ranked above movieId 2571 in the group list, collect the
# average position of its genres in genresRankGR and its number of genres
m_Id = 2571
indexOfMId = groupRec[groupRec["movieId"] == m_Id].index.values[0]
avgScoreRatingsGenres, qtyScore = [], []
for i in range(indexOfMId):
    movieIdTemp = groupRec.loc[i, "movieId"]
    tempVal1, lenGenres = avgGenresRanking(movieIdTemp, groupRec, genresRankGR)
    avgScoreRatingsGenres.append(tempVal1)
    qtyScore.append(lenGenres)
print("Average of the position of the genres: \n", avgScoreRatingsGenres)
print("\nNumber of genres are present for the top movies:\n", qtyScore)

Average of the position of the genres: 
 [8.25, 2.5, 7.5, 10.666666666666666, 15.0, 1.5, 6.666666666666667, 15.0, 15.0, 6.0, 2.5, 16.0]

Number of genres are present for the top movies:
 [4, 2, 4, 3, 3, 2, 3, 3, 3, 3, 2, 2]
