In [16]:
# Read the movies database
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

ratings_df = pd.read_csv('data/u.data', sep='\t', index_col=False, names=['userId', 'movieId', 'rating'])
movies_df = pd.read_csv('data/u.item', sep='|', header=0, index_col=False, encoding='ISO-8859-1', names= ['movieId', 'title', 'releaseDate','videoReleaseDate','IMDbURL','unknown','Action','Adventure','Animation',"Children'",'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'])

# Add a column with the centered ratings
average_df = ratings_df[['userId', 'rating']].groupby(['userId'], as_index=False).mean().rename(columns={'rating': 'average'})
ratings_df = pd.merge(ratings_df, average_df, on='userId', how='left')
ratings_df['rating_centered'] = ratings_df['rating'] - ratings_df['average']
ratings_df

  ratings_df = pd.read_csv('data/u.data', sep='\t', index_col=False, names=['userId', 'movieId', 'rating'])


Unnamed: 0,userId,movieId,rating,average,rating_centered
0,196,242,3,3.615385,-0.615385
1,186,302,3,3.413043,-0.413043
2,22,377,1,3.351562,-2.351562
3,244,51,2,3.651261,-1.651261
4,166,346,1,3.550000,-2.550000
...,...,...,...,...,...
99995,880,476,3,3.426630,-0.426630
99996,716,204,5,3.888476,1.111524
99997,276,1090,1,3.465251,-2.465251
99998,13,225,2,3.097484,-1.097484


In [17]:
# Build the matrix of user's scores to movies
# Remember: The rows are the users and the movies the columns
# We can use the ID's as the indexes of the array but the first column and row will be 0
nUsers = ratings_df.userId.unique().size
nMovies = ratings_df.movieId.unique().size

ratingsMatrix = np.zeros((nUsers+1, nMovies+1))
for index, rating in ratings_df.iterrows():
    ratingsMatrix[int(rating.userId), int(rating.movieId)] = rating.rating_centered

ratingsMatrix
    
    

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.38970588, -0.61029412, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.29032258,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.95454545,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.58928571, ...,  0.        ,
         0.        ,  0.        ]])

In [18]:
# Collaborative filtering - Memory based
# --------------------------------------

# Item view: I will recommend movies based on user choice

# Let's say that user's chooses movie 102 (check 50/500/449)
# What movies can we recommend him?

# Get the distances between movies 
# Because we are dealing with the columns (movies) we have to transpose the matrix
distances = cosine_distances(ratingsMatrix.T)

distances


array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 1.05836654, ..., 1.        , 0.94662278,
        1.03320282],
       [1.        , 1.05836654, 0.        , ..., 1.        , 0.99807373,
        0.96354762],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 0.94662278, 0.99807373, ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.03320282, 0.96354762, ..., 1.        , 1.        ,
        0.        ]])

In [19]:
myMovie = 50
print("My movie is: ", movies_df[movies_df.movieId == myMovie].title.iloc[0])
print()

# ###################################################################
# Insert here the code to get the 10 movies closer to my movie
# ###################################################################


# tf = TfidfVectorizer (analyzer='word', stop_words='english')
# fqMatrix = tf.fit_transform (movies_df['summary'])
# cosSim = cosine_similarity(fqMatrix, fqMatrix)

My movie is:  Star Wars (1977)



In [20]:
# Collaborative filtering - Model based
# --------------------------------------
# First calculate the latent factors matrix using the Alternate Least Squares algorithm
# Then make recommendations based on similarity

# Metaparameters
k = 100        # number of latent factors
l = 0.1        # lambda. The same value for x and y
accuracy = 0.999

# X and Y initialization
np.random.seed(42)
X = np.random.normal(size=(ratingsMatrix.shape[0], k))
Y = np.random.normal(size=(k, ratingsMatrix.shape[1]))

# Alternate Least Squares algorithm
converged = False
pL = np.Inf
while not converged:
    y = Y.T
    inv = np.linalg.inv(y.T.dot(y) + l*np.eye(k))
    for u in range(0, X.shape[0]):
        X[u] = ratingsMatrix[u,:].dot(y).dot(inv)
    
    inv = np.linalg.inv(X.T.dot(X) + l*np.eye(k))    
    for i in range(0, Y.shape[1]):
        Y[:,i] = ratingsMatrix[:,i].dot(X).dot(inv)
        
    L = np.square(ratingsMatrix - X.dot(Y)).sum()
    L = L + l * (np.square(np.linalg.norm(X)) + np.square(np.linalg.norm(Y)))
                     
    # Improvement stop criteria
    converged = (L / pL) > accuracy
    
    pL = L
    

In [21]:
# Let's make predictions
# Get the similarity matrix with the items latent factors
myMovie = 50
print("My movie is: ", movies_df[movies_df.movieId == myMovie].title.iloc[0])
print()

distances = cosine_distances(Y.T)

# #####################################################################
# Insert here the code to print the 10 movies closest to my movie
# #####################################################################

My movie is:  Star Wars (1977)

