In [1]:
import numpy as np
import pandas as pd
import math as mt
from sparsesvd import sparsesvd
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import *

# Load datasets 

In [2]:
movies_data = pd.read_csv('movies.csv')
ratings_data = pd.read_csv('ratings.csv')


In [3]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# Join it into the dataframe

In [5]:
df = movies_data.join(ratings_data, rsuffix = '_r')

In [6]:
df.head()

Unnamed: 0,movieId,title,genres,userId,movieId_r,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,2,3.5,1112486027
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,29,3.5,1112484676
2,3,Grumpier Old Men (1995),Comedy|Romance,1,32,3.5,1112484819
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1,47,3.5,1112484727
4,5,Father of the Bride Part II (1995),Comedy,1,50,3.5,1112484580


In [7]:
table = df.pivot_table(columns = 'movieId_r', index = 'userId', values = 'rating').fillna(0)

In [8]:
table.head()

movieId_r,1,2,3,4,5,6,7,8,9,10,...,112623,112852,113453,114180,115617,116797,117511,117590,118696,125916
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
matrix = table.values

In [10]:
matrix

array([[0. , 3.5, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 4. , ..., 0. , 0. , 0. ],
       [4. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

# Sparsity Check 

In [36]:
n_users = df.userId.unique().shape[0]
n_movies = df.movieId_r.unique().shape[0]
print 'Users: %d | Movies: %d' %(n_users,n_movies)

Users: 211 | Movies: 5258


In [37]:
sparsity = round(1.0 - len(df)/float(n_users * n_movies),3)
print 'The Sparsity level of the dataset is: ', str(sparsity * 100), '%'

The Sparsity level of the dataset is:  97.5 %


# SVD Calculation

In [11]:
MAX_Rows , MAX_Columns = matrix.shape

In [33]:
def computeSVD(urm,K):
    U, s, Vt = sparsesvd(urm, K)
    dim = (len(s),len(s))
    S = np.zeros(dim,dtype = np.float32)
    for i in xrange(0,len(s)):
        S[i,i] = mt.sqrt(s[i])
        
    U = csc_matrix(np.transpose(U),dtype = np.float32)
    S = csc_matrix(S,dtype = np.float32)
    Vt = csc_matrix(Vt, dtype = np.float32)
    return U , S ,Vt

def computeEstimatedRatings(urm, U, S, Vt, uTest, K, test):
    rightTerm = S*Vt 

    estimatedRatings = np.zeros(shape=(MAX_Rows,MAX_Columns), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        #we convert the vector to dense format in order to get the indices 
        #of the movies with the best estimated ratings 
        estimatedRatings[userTest, :] = prod.todense()
        recom = (-estimatedRatings[userTest, :]).argsort()[:10]
    return recom


In [34]:
K = 25
matrix_modified = csc_matrix(matrix , dtype = np.float32)
U, S, Vt = computeSVD(matrix_modified,K)
uTest = [4]
print "Recommendation for User id: " , uTest[0]
print "Predicted MovieIds: "
uTest_recommended_items = computeEstimatedRatings(matrix_modified,U,S,Vt,uTest,K,True)
print uTest_recommended_items
print uTest_recommended_items.shape
for i in uTest_recommended_items:
    #print i
    print movies_data[movies_data['movieId'] == i]

Recommendation for User id:  4
Predicted MovieIds: 
[222 902 306 412 890 612 591 503 328 502]
(10,)
     movieId                     title         genres
219      222  Circle of Friends (1995)  Drama|Romance
     movieId                          title         genres
885      902  Breakfast at Tiffany's (1961)  Drama|Romance
     movieId                                             title genres
303      306  Three Colors: Red (Trois couleurs: Rouge) (1994)  Drama
     movieId                         title genres
408      412  Age of Innocence, The (1993)  Drama
     movieId                             title    genres
873      890  Baton Rouge (Bâton rouge) (1988)  Thriller
     movieId                   title  genres
606      612  Pallbearer, The (1996)  Comedy
     movieId                    title                 genres
585      591  Tough and Deadly (1995)  Action|Drama|Thriller
     movieId                title genres
499      503  New Age, The (1994)  Drama
     movieId              