In [1]:
#
# ESADE - Master in Business Analytics and Big Data
# Recommendation Systems
# Building a Non Personalized Recommendation Engine
#
# Nensi Hakobjanyan
#

In [2]:
# imports the libraries
import csv
import numpy as np
import pandas as pd

In [4]:
# Loads the movie ratings CSV into a pandas DataFrame.
# The functions in this notebook rely on a "User" column (topPerc melts on it)
# and treat the remaining columns as per-movie ratings.
# NOTE(review): presumably one row per user with NaN for unrated movies - confirm
# against the CSV.
input_file = pd.read_csv("../../DataSets/movieratings.csv")

In [5]:
# Algorithms

# Mean Rating: Calculate the mean rating for each movie, order with the 
# highest rating listed first, and submit the top n.
#
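# Output example (illustrative values and layout only; topMean below returns a
# pandas Series of mean ratings indexed by movie title, not a list of tuples):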
# [ (4.2000000000000001, '222: The Matrix'), 
#   (3.5999999966666622, '233: Alien'), 
#   (3.3343434322222223, '3434: The Silence of the Lambs'), 
#   (3.2222222222222222, '111: The Water Divider'), 
#   (3.0022332223330625, '333: , Robocop')]
def topMean(prefs,n=5):
    """Return the n movies with the highest mean rating.

    Parameters
    ----------
    prefs : pandas.DataFrame
        Ratings table whose numeric columns hold the ratings of one movie
        each. Missing ratings (NaN) are skipped by the mean. Non-numeric
        columns, such as a user-name column, are ignored.
    n : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.Series
        Mean rating per movie, indexed by column label, sorted from the
        highest mean to the lowest and cut to the first n entries.
    """
    # numeric_only=True keeps text columns out of the average. Without it,
    # pandas >= 2.0 raises TypeError as soon as the frame contains a string
    # column; it also matches what topCount already does.
    mean_ratings = prefs.mean(numeric_only=True)
    return mean_ratings.sort_values(ascending=False).head(n)

In [6]:
# Show the five movies with the highest mean rating.
topMean(input_file, n=5)

318: Shawshank Redemption, The (1994)             3.600000
260: Star Wars: Episode IV - A New Hope (1977)    3.266667
541: Blade Runner (1982)                          3.222222
1265: Groundhog Day (1993)                        3.166667
593: Silence of the Lambs, The (1991)             3.062500
dtype: float64

In [9]:
# % of ratings r+: Calculate the percentage of ratings for each movie 
# that are r or higher. Order with the highest percentage first, and submit the top n.

def topPerc(prefs,r=3,n=5):
    """Return the n movies with the largest share of ratings that are >= r.

    Parameters
    ----------
    prefs : pandas.DataFrame
        Ratings table with a "User" column; every other column is treated
        as the ratings of one movie, with NaN meaning "not rated".
    r : number, default 3
        Rating threshold; a rating counts as a hit when it is >= r.
    n : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.Series
        Fraction (0.0 - 1.0) of each movie's ratings that are >= r, indexed
        by movie title (index name "title"), sorted from highest to lowest
        and cut to the first n entries. Movies that have ratings but none
        >= r score 0.0. Movies without any rating are left out because
        their percentage is undefined.

    Raises
    ------
    KeyError
        If prefs has no "User" column.
    """
    # Work on the rating columns only; "User" identifies the rater.
    ratings = prefs.drop(columns=["User"])
    # count() ignores NaN, so this is the number of actual ratings per movie.
    total = ratings.count()
    # NaN >= r evaluates to False, so unrated cells never count as hits.
    at_least_r = ratings.ge(r).sum()
    # Skip never-rated movies to avoid a 0/0 division.
    rated = total > 0
    share = at_least_r[rated] / total[rated]
    return share.sort_values(ascending=False).rename_axis("title").head(n)

In [10]:
# Show the top movies by share of ratings at or above the default threshold (r=3, n=5).
topPerc(input_file)

title
318: Shawshank Redemption, The (1994)                      0.700000
541: Blade Runner (1982)                                   0.666667
260: Star Wars: Episode IV - A New Hope (1977)             0.666667
1210: Star Wars: Episode VI - Return of the Jedi (1983)    0.642857
1198: Raiders of the Lost Ark (1981)                       0.636364
dtype: float64

In [11]:
# Rating Count: Count the number of ratings for each movie, order with 
# the highest number of ratings first, and submit the top n.

def topCount(prefs,n=5):
    """Rank the numeric (rating) columns by how many ratings they hold.

    Returns a pandas Series of non-null counts indexed by column label,
    ordered from the most-rated column down and limited to the first n.
    """
    # Non-numeric columns are excluded from the tally.
    ratings_per_movie = prefs.count(numeric_only=True)
    most_rated_first = ratings_per_movie.sort_values(ascending=False)
    return most_rated_first.iloc[:n]

In [12]:
# Show the most-rated movies (default n=5).
topCount(input_file)

1: Toy Story (1995)                                        17
593: Silence of the Lambs, The (1991)                      16
260: Star Wars: Episode IV - A New Hope (1977)             15
1210: Star Wars: Episode VI - Return of the Jedi (1983)    14
780: Independence Day (ID4) (1996)                         13
dtype: int64

In [13]:
# Top 5 Movies related: Calculate the movies that most often occur together with 
# another movie x (Star Wars: Episode IV - A New Hope (1977) by default)
# using the (x+y)/x method. In other words, for each movie y, calculate the 
# percentage of the raters of x who also rated y. Order with 
# the highest percentage first, and submit the top 5.

def topOccur(prefs,x='260: Star Wars: Episode IV - A New Hope (1977)',n=5):
    """Return the n movies most often rated by the users who rated movie x.

    For every other movie y the score is
    (number of users who rated both x and y) / (number of users who rated x).

    Parameters
    ----------
    prefs : pandas.DataFrame
        Ratings table. The first column is skipped (it is treated as the
        user identifier); every other column is one movie, with NaN meaning
        "not rated".
    x : str
        Column label of the reference movie.
    n : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.Series
        Co-occurrence fraction per movie, indexed by movie title, sorted
        from highest to lowest and cut to the first n entries. If nobody
        rated x the fractions are NaN (0/0).

    Raises
    ------
    KeyError
        If x is not a column of prefs.
    """
    # Keep only the users who rated the reference movie.
    raters = prefs[prefs[x].notna()]
    n_raters = len(raters)
    # Candidates are all of prefs' own columns after the first one, minus x.
    # The bound comes from prefs itself rather than from a global frame or
    # the row count, so no movie is skipped.
    others = [movie for movie in prefs.columns[1:] if movie != x]
    # Per candidate, count how many of x's raters also rated it.
    co_counts = raters[others].notna().sum()
    scores = (co_counts / n_raters).sort_values(ascending=False)
    return scores.head(n)


In [15]:
# Show the five movies most often rated by the users who rated Star Wars: Episode IV.
topOccur(input_file,x='260: Star Wars: Episode IV - A New Hope (1977)',n=5)

1: Toy Story (1995)                                        0.933333
1210: Star Wars: Episode VI - Return of the Jedi (1983)    0.866667
593: Silence of the Lambs, The (1991)                      0.800000
780: Independence Day (ID4) (1996)                         0.733333
2916: Total Recall (1990)                                  0.666667
Name: 0, dtype: float64