In [1]:
import pandas as pd
import numpy as np
import heapq
from operator import itemgetter

##############################################################################
#      MOVIE RECOMMENDATION SYSTEM - SMALL SUBSET OF NETFLIX PRIZE DATA      #
##############################################################################

# Load the training and testing rating files into dataframes; the column names
# are supplied explicitly and shared by both files.
cols = ['movie_id', 'user_id', 'rating']
traindf, testdf = (
    pd.read_csv(ratings_path, names=cols)
    for ratings_path in ('netflix_subset/TrainingRatings.txt',
                         'netflix_subset/TestingRatings.txt')
)
traindf.head()

Unnamed: 0,movie_id,user_id,rating
0,8,1744889,1.0
1,8,1395430,2.0
2,8,1205593,4.0
3,8,1488844,4.0
4,8,1447354,1.0


In [2]:
# Report the overall size of each dataset first ...
print('train df: ', traindf.shape, '| test df: ', testdf.shape)
# ... then count the distinct users and distinct movies in the training data.
num_users = len(traindf['user_id'].unique())
num_movies = len(traindf['movie_id'].unique())

print('\nnumber of users: ', num_users, '| number of movies: ', num_movies)

train df:  (3255352, 3) | test df:  (100478, 3)

number of users:  28978 | number of movies:  1821


In [3]:
import csv 

titlecols = ['movie_id', 'year', 'title']

# Build a dictionary that maps movie-ids to their titles.
#
# Some titles contain unquoted commas ('The Godfather, Part II'), so the raw
# file has a varying number of fields per line. Rewrite it first: keep the
# first two fields (id, year) and re-join everything after them into a single
# title field, which csv.writer quotes when it contains a comma.
#
# Both files are opened as latin1: the rewritten file is read back as latin1
# below, so writing it with the platform default encoding could garble
# non-ASCII titles. newline='' is what the csv module asks for on the file
# objects handed to csv.reader / csv.writer.
with open("netflix_subset/movietitles.csv", 'rt', encoding='latin1', newline='') as infile, \
        open('netflix_subset/formatted_titles.csv', 'wt', encoding='latin1', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for line in reader:
        fixed_row = line[:2] + [','.join(line[2:])]
        writer.writerow(fixed_row)


# pandas reads the year column as float because of missing values; the year is
# not needed afterwards, so it is left as is.
titles = pd.read_csv('netflix_subset/formatted_titles.csv', names=titlecols, encoding='latin1')

tdict = pd.Series(titles.title.values, index=titles.movie_id).to_dict()

# spot-check a title that contains a comma
titles.loc[titles.movie_id == 3290]

Unnamed: 0,movie_id,year,title
3289,3290,1974.0,"The Godfather, Part II"


In [4]:
# Neither the user ids nor the movie ids are contiguous, so keep each set as an
# ascending list; an id's position in its list is used as its row / column in
# the user-item matrix.
userIDs = sorted(traindf['user_id'].unique())
movieIDs = sorted(traindf['movie_id'].unique())


In [5]:
# before getting into the collab filtering, let's find the top rated movies across all users
# using the bayesian avg
from collections import defaultdict


# movie_id -> the ratings of that movie. get_reviews() assigns every entry
# explicitly, so the `list` default factory is only used for ids that are
# looked up without ever having been filled.
reviews = defaultdict(list)
# Collect all of the ratings for one movie-id and store them in the dictionary.
def get_reviews(mov_id, df, revdict):
    """Store the 'rating' values of the rows of `df` whose 'movie_id' equals
    `mov_id` under `revdict[mov_id]` (an empty selection if no row matches)."""
    is_target_movie = df['movie_id'] == mov_id
    revdict[mov_id] = df.loc[is_target_movie, 'rating']

# Fill the dictionary with the ratings of every movie-id in the list.
def create_rev_dict(id_list, df, revdict):
    """Populate `revdict` with the ratings of each movie in `id_list`.

    Parameters
    ----------
    id_list : iterable of movie ids to fill in.
    df : dataframe with 'movie_id' and 'rating' columns.
    revdict : dict-like, updated in place; for every id in `id_list`,
        `revdict[id]` becomes the 'rating' values of that movie's rows
        (an empty selection when the id has no rows in `df`).

    The dataframe is grouped once instead of being rescanned for every id,
    so the cost no longer grows with len(id_list) * len(df).
    """
    ratings_by_movie = {
        movie_id: ratings
        for movie_id, ratings in df.groupby('movie_id', sort=False)['rating']
    }
    # same result a non-matching boolean mask would give: no rows, same dtype
    no_ratings = df['rating'].iloc[0:0]
    for mid in id_list:
        revdict[mid] = ratings_by_movie.get(mid, no_ratings)
       

In [6]:
###################################################

# Populate `reviews` with the training-set ratings of every movie id in movieIDs.
create_rev_dict(movieIDs, traindf, reviews)

###################################################

In [7]:

# in order to get the bayes avg we need to compute priors 

# yields the movie-id, avg-rating(1-5), and the number of ratings
def calc_avgs(id_list, revdict):
    """Lazily yield one `(movie_id, mean_rating, num_ratings)` tuple per id in
    `id_list`, reading each movie's ratings from `revdict[movie_id]`."""
    for movie_id in id_list:
        ratings = revdict[movie_id]
        n_ratings = len(ratings)
        yield (movie_id, sum(ratings) / float(n_ratings), n_ratings)

        
# Collect the per-movie averages in a dataframe (so the priors are simple
# column means) and attach each movie's title.
averages = pd.DataFrame(
    list(calc_avgs(movieIDs, reviews)),
    columns=['m_id', 'avg_rating', 'num_reviews'],
).assign(mtitle=lambda frame: frame['m_id'].map(tdict))

# prior c: the average number of reviews a movie receives
print("average number of netflix reviews per movie: ", round(averages['num_reviews'].mean(), 3))
# prior m: the mean of all the movies' average ratings
print("average rating for netflix movies: ", round(averages['avg_rating'].mean(), 3))

# this gives a naive approach to ranking the movies by their avg rating bc it doesn't take number of reviews into account
# averages.sort_values(['avg_rating'], ascending=False).head(10)

average number of netflix reviews per movie:  1787.673
average rating for netflix movies:  3.058


In [8]:
# c and m are the priors - c: avg num reviews/movie, m: avg rating
def bayes_avg(c, m, id_list, revdict):
    """Lazily yield `(movie_id, bayesian_avg, num_ratings)` for each id in
    `id_list`, where bayesian_avg = (c*m + sum(ratings)) / (c + num_ratings)."""
    for movie_id in id_list:
        ratings = revdict[movie_id]
        count = len(ratings)
        weighted_total = (c * m) + sum(ratings)
        yield (movie_id, weighted_total / float(c + count), count)

def bayes_top_n(n, idlist, revdict, c, m):
    """Return the `n` `(movie_id, bayesian_avg, num_ratings)` tuples with the
    largest bayesian average, highest first."""
    scored = bayes_avg(c, m, idlist, revdict)
    return heapq.nlargest(n, scored, key=lambda entry: entry[1])


# Rank the movies by bayesian average and list the ten best. The priors passed
# here (c=1787, m=3) are hard-coded next to the per-movie means printed above.
print ('########################  TOP RATED NETFLIX MOVIES - BAYESIAN AVG ############################')
top_ten = bayes_top_n(10, idlist=movieIDs, revdict=reviews, c=1787, m=3)
for movid, avg, num in top_ten:
    print("%0.3f average rating (%i reviews) %s" % (avg, num, tdict[movid]))

########################  TOP RATED NETFLIX MOVIES - BAYESIAN AVG ############################
4.348 average rating (20691 reviews) The Godfather
4.235 average rating (21209 reviews) The Incredibles
4.215 average rating (10430 reviews) The Sopranos: Season 1
4.214 average rating (17292 reviews) The Godfather, Part II
4.189 average rating (9579 reviews) The Sopranos: Season 3
4.052 average rating (13771 reviews) Aladdin: Platinum Edition
4.028 average rating (15086 reviews) Million Dollar Baby
4.024 average rating (23005 reviews) Seven
4.013 average rating (5941 reviews) Six Feet Under: Season 2
4.002 average rating (16170 reviews) Alien: Collector's Edition


In [9]:
# Build the user-item matrices (one row per user, one column per movie, 0 where
# there is no rating) so that cosine similarities can be computed.
#
# Each id is translated to its position in userIDs / movieIDs with a single
# vectorised lookup. Calling list.index() once per rating row rescanned the id
# lists for every row of the dataframe, which is what made this cell so slow.
def ratings_to_matrix(df, user_ids, movie_ids):
    """Return a dense array of shape (len(user_ids), len(movie_ids)) holding the ratings in `df`.

    Parameters
    ----------
    df : dataframe with 'user_id', 'movie_id' and 'rating' columns.
    user_ids, movie_ids : lists of unique ids; an id's position in its list is
        its row / column in the result.

    Raises
    ------
    ValueError
        If `df` mentions a user or movie id that is missing from the id lists
        (list.index() raised the same exception type for such rows).
    """
    # If a (user, movie) pair occurs more than once the last row wins, exactly
    # as it did when rows were written into the matrix one after another.
    latest = df.drop_duplicates(subset=['user_id', 'movie_id'], keep='last')
    row_pos = pd.Index(user_ids).get_indexer(latest['user_id'])
    col_pos = pd.Index(movie_ids).get_indexer(latest['movie_id'])
    # get_indexer marks unknown ids with -1, which would otherwise silently
    # address the last row / column
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise ValueError('ratings refer to a user_id or movie_id that is not in the id lists')
    matrix = np.zeros((len(user_ids), len(movie_ids)))
    matrix[row_pos, col_pos] = latest['rating'].values
    return matrix


traintrix = ratings_to_matrix(traindf, userIDs, movieIDs)

# the test matrix uses the training id lists so both matrices line up cell for cell
testtrix = ratings_to_matrix(testdf, userIDs, movieIDs)
    
 

In [11]:
########## testing ##########
print(traintrix.shape)
# first ten columns (the ten smallest movie-ids) of the first row - 0 for null values
print(traintrix[0, :10])
# all of the ratings by user 7 in the raw data -- 8 & 28 are the first two movie-ids
traindf[traindf['user_id'] == 7].head()

##############################

(28978, 1821)
[ 5.  4.  0.  0.  0.  0.  0.  0.  0.  0.]


Unnamed: 0,movie_id,user_id,rating
1429,8,7,5.0
8911,28,7,4.0
33639,185,7,4.0
82974,636,7,4.0
129845,1046,7,3.0


In [12]:
# compute cosine similarity between users (or items)

def compute_sim(ui_matrix, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows or the columns of a user-item matrix.

    Parameters
    ----------
    ui_matrix : 2-D array; rows are treated as users and columns as items.
    kind : 'user' compares rows with each other, 'item' compares columns.
    epsilon : small constant added to every dot product so that an all-zero
        row / column does not produce a division by zero.

    Returns
    -------
    Square array whose [i, j] entry is the similarity of row (or column) i
    and row (or column) j.

    Raises
    ------
    ValueError
        If `kind` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ui_matrix.dot(ui_matrix.T) + epsilon
    elif kind == 'item':
        sim = ui_matrix.T.dot(ui_matrix) + epsilon
    else:
        # previously fell through and failed later on the unbound `sim`
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # the diagonal holds each vector's squared length (plus epsilon)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

# Similarity between every pair of rows (users) of the training matrix.
user_similarity = compute_sim(traintrix, kind='user')
#print(user_similarity[:4, :4])

In [13]:
# now we make predictions! 
# let's see the difference between normalizing by the mean or not(predict2)
def predict(ui_matrix, similarity, kind='user'):
    """Predict every cell of the user-item matrix from a similarity matrix.

    Parameters
    ----------
    ui_matrix : 2-D array of ratings, users in rows and items in columns.
    similarity : square similarity matrix (user-user for kind='user',
        item-item for kind='item').
    kind : 'user' or 'item'.

    Returns
    -------
    Array with the same shape as `ui_matrix`.
      * 'user': each user's mean rating plus the similarity-weighted average
        of all users' deviations from their own mean.
      * 'item': similarity-weighted average of the user's ratings, with no
        mean-centering.

    Raises
    ------
    ValueError
        If `kind` is neither 'user' nor 'item'.

    Note: the per-user mean is taken over every column of `ui_matrix`, so
    cells holding 0 are included in it.
    """
    if kind == 'user':
        mean_user_rating = ui_matrix.mean(axis=1)
        ratings_diff = (ui_matrix - mean_user_rating[:, np.newaxis])
        # column vector of per-user weight totals, used to normalise each row
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        # row vector of per-item weight totals, used to normalise each column
        pred = ui_matrix.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        # previously fell through and failed on the unbound `pred`
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    return pred

# time a full user-based prediction over the training matrix
%timeit predict(traintrix, user_similarity, kind='user')
# incredibly slow

1 loop, best of 3: 1min 19s per loop


In [14]:
def predict2(ui_matrix, similarity, kind='user'):
    """Predict every cell of the user-item matrix without mean-centering.

    Parameters
    ----------
    ui_matrix : 2-D array of ratings, users in rows and items in columns.
    similarity : square similarity matrix (user-user for kind='user',
        item-item for kind='item').
    kind : 'user' or 'item'.

    Returns
    -------
    Array with the same shape as `ui_matrix`: the similarity-weighted sum of
    the ratings divided by the total absolute similarity.

    Raises
    ------
    ValueError
        If `kind` is neither 'user' nor 'item' (this used to return None
        silently).
    """
    if kind == 'user':
        return similarity.dot(ui_matrix) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ui_matrix.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    
# time the same user-based prediction without the mean-centering step
%timeit predict2(traintrix, user_similarity, kind='user')
# only a lil faster

1 loop, best of 3: 1min 12s per loop


In [15]:
# keep both sets of user-based predictions so their errors can be compared
usr_pred = predict(traintrix, user_similarity, kind='user')
usr_prediction2 = predict2(traintrix, user_similarity, kind='user')

#print(rmse(usr_prediction2, testtrix))  # rmse() is only defined in a later cell

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# now let's evaluate accuracy
def rmse(prediction, ground_truth):
    """Root mean squared error between `prediction` and `ground_truth`,
    computed only over the cells where `ground_truth` is non-zero."""
    rated_cells = ground_truth.nonzero()
    predicted_vals = prediction[rated_cells].flatten()
    actual_vals = ground_truth[rated_cells].flatten()
    return sqrt(mean_squared_error(predicted_vals, actual_vals))

# compare the user-based errors with and without mean normalization on the test ratings
print(
    'Root Mean Squared Error w Normalization: ',
    round(rmse(usr_pred, testtrix), 3),
    '| Root Mean Squared Error w.o Normalization: ',
    round(rmse(usr_prediction2, testtrix), 3),
)

Root Mean Squared Error w Normalization:  2.512 | Root Mean Squared Error w.o Normalization:  2.545


In [19]:
# let's see how it works for item-based collaborative filtering
# (similarity between every pair of columns, i.e. movies, of the training matrix)

item_similarity = compute_sim(traintrix, kind='item')

In [27]:
# item-based predictions for every cell of the training matrix, then time the same call
item_pred = predict(traintrix, item_similarity, kind='item')
%timeit predict(traintrix, item_similarity, kind='item')
# a lot faster, predictably (way fewer items than users) 
# 'item' is the same in both predict() functions

1 loop, best of 3: 2.78 s per loop


In [28]:
# error of the item-based predictions on the test ratings (higher than both user-based results)
print('Root Mean Squared Error (item-based): ', round(rmse(item_pred, testtrix), 3))
# we lose a bit of accuracy but there's obviously a time trade-off

Root Mean Squared Error (item-based):  2.967
