In [1]:
import numpy as np
import pandas as pd

In [3]:
rating_df = pd.read_csv('data/ratings_small.csv')
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,2,33,3.0,1260759179
2,3,32,3.0,1260759182
3,1,45,2.0,1260759185
4,3,31,4.0,1260759205
5,4,45,2.0,1260759151
6,2,31,2.0,1260759187
7,3,55,2.0,1260759148
8,4,55,3.5,1260759125
9,1,32,2.0,1260759131


In [4]:
rating_df.shape

(10, 4)

In [5]:
rating_df.rating.value_counts()

2.0    5
3.0    2
3.5    1
4.0    1
2.5    1
Name: rating, dtype: int64

In [6]:
rating_df.nunique()

userId        4
movieId       5
rating        5
timestamp    10
dtype: int64

In [7]:
# sort data frame by timestemp for splitting
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
8,4,55,3.5,1260759125
9,1,32,2.0,1260759131
0,1,31,2.5,1260759144
7,3,55,2.0,1260759148
5,4,45,2.0,1260759151


In [8]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of movies)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [9]:
user_ids

array([3, 0, 0, 2, 3, 1, 2, 0, 1, 2], dtype=int64)

In [10]:
movie_ids

array([4, 1, 0, 4, 3, 2, 1, 3, 0, 0], dtype=int64)

In [11]:
#training/ val
num_train = int(len(user_ids) * 0.8)
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]

In [12]:
# set up user-movie matrix
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1

#2-d matrix
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings

In [13]:
print("num_users:",num_users)
print("num_movies:",num_movies)
print("train_user_ids:",train_user_ids)
print("train_movie_ids:",train_movie_ids)
print("train_ratings:",train_ratings)
user2movie

num_users: 4
num_movies: 5
train_user_ids: [3 0 0 2 3 1 2 0]
train_movie_ids: [4 1 0 4 3 2 1 3]
train_ratings: [3.5 2.  2.5 2.  2.  3.  3.  2. ]


array([[2.5, 2. , 0. , 2. , 0. ],
       [0. , 0. , 3. , 0. , 0. ],
       [0. , 3. , 0. , 0. , 2. ],
       [0. , 0. , 0. , 2. , 3.5]])

In [14]:
# compute similarity
def pearson_correlation(x,y):
    '''
    Compute the pearson correlation for x and y.
    args:
        x: np.array
        y: np.array
    '''
    
    # set up filters for entries where x and y are both non zero
    filt = (x != 0) * (y != 0)
    
    # compute mean for x and y
    x_mean = x.sum() / x[x!=0].shape[0]
    y_mean = y.sum() / y[y!=0].shape[0]
    
    # consider only entries based on the previous filter
    x = x[filt]
    y = y[filt]
    
    # compute correlation
    np.seterr(invalid='ignore')
    corr = np.sum((x - x_mean) * (y-y_mean)) / (np.sum((y-y_mean) ** 2) * np.sum((x-x_mean) ** 2) ) ** 0.5
    
    return corr

def compute_user_similarity_matrix(user2movie):
    '''
    Compute user similarity matrix
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        similarity_matrix: np.array, user 2 user similarity
    
    '''
    # initialize similarity matrix
    similarity_matrix = np.zeros([num_users, num_users])
    
    for i in range(len(user2movie)):
        for j in range(i, len(user2movie)):
            
            # compute correlation
            corr = pearson_correlation(user2movie[i],user2movie[j])

            # store correlation in matrix
            similarity_matrix[i,j] = corr
            similarity_matrix[j,i] = corr
            
    return similarity_matrix

similarity_matrix = compute_user_similarity_matrix(user2movie)
similarity_matrix

array([[ 1., nan, -1.,  1.],
       [nan, nan, nan, nan],
       [-1., nan,  1., -1.],
       [ 1., nan, -1.,  1.]])

In [15]:
#try
#similarity_matrix[np.arange(num_users), np.arange(num_users)] = 99
#similarity_matrix

In [16]:
# set similarity with itself to 0 (set diagonal =0)
similarity_matrix[np.arange(num_users), np.arange(num_users)] = 0
similarity_matrix[np.isnan(similarity_matrix)] = 0
similarity_matrix

array([[ 0.,  0., -1.,  1.],
       [ 0.,  0.,  0.,  0.],
       [-1.,  0.,  0., -1.],
       [ 1.,  0., -1.,  0.]])

In [17]:
user2movie

array([[2.5, 2. , 0. , 2. , 0. ],
       [0. , 0. , 3. , 0. , 0. ],
       [0. , 3. , 0. , 0. , 2. ],
       [0. , 0. , 0. , 2. , 3.5]])

In [18]:
#user based cf
def compute_ucf(user2movie, similarity_matrix):
    '''
    Compute prediction scores for all movies
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        scores: np.array, predicted scores of each video for the target user
    '''

    # compute mean rating yk, ignoring zero entries, shape:(num_users)
    mean_ratings = np.sum(user2movie, axis=1) / (user2movie != 0 ).sum(axis=1)
    
    # compute ykj - yk, shape:(num_users, num_movies)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, 1)
    
    # compute sum of similarities Σsimik, (num_users,)
    sim_sum = np.sum(np.abs(similarity_matrix), axis=1)
        
    # don't sum the unknown entries, set them to 0        
    user2movie_diff[np.where(user2movie == 0) ] = 0

    # compute weighted sum of rating diff (num_users, num_movies)
    weighted_sum = np.matmul(similarity_matrix, user2movie_diff) / np.expand_dims(sim_sum, 1)

    # add weighted sum to mean ratings
    scores =  weighted_sum + np.expand_dims(mean_ratings, 1)
    
    return  scores

predictions = compute_ucf(user2movie, similarity_matrix) 
predictions

array([[2.16666667, 1.91666667, 2.16666667, 1.79166667, 2.79166667],
       [       nan,        nan,        nan,        nan,        nan],
       [2.33333333, 2.58333333, 2.5       , 2.95833333, 2.125     ],
       [2.91666667, 2.41666667, 2.75      , 2.66666667, 3.        ]])

In [19]:
def compute_item_similarity_matrix(user2movie):
    '''
    Compute item similarity matrix
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        similarity_matrix: np.array, item-item similarity
    
    '''
    # compute mean for each user
    x_mean = user2movie.sum(axis=0) / (user2movie!=0).sum(axis=0)
    
    # set up filter for zero entries
    filt = (user2movie==0)
    
    # compute rating difference
    rating_diff = user2movie - np.expand_dims(x_mean, axis=0)
    rating_diff[filt] = 0

    # compute similarity
    similarity_matrix = np.matmul(rating_diff.T, rating_diff) / (np.matmul(rating_diff.T  ** 2, (rating_diff != 0)) * np.matmul(rating_diff.T  ** 2, (rating_diff != 0)).T) ** 0.5

    return similarity_matrix

similarity_matrix = compute_item_similarity_matrix(user2movie)
similarity_matrix

array([[nan, nan, nan, nan, nan],
       [nan,  1., nan, nan, -1.],
       [nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan],
       [nan, -1., nan, nan,  1.]])

In [36]:
# set similarity with itself to 0
similarity_matrix[np.arange(num_movies), np.arange(num_movies)] = 0
similarity_matrix[np.isnan(similarity_matrix)] = 0
similarity_matrix

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0., -1.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0., -1.,  0.,  0.,  0.]])

In [37]:
# item based cf
def compute_icf(user2movie, similarity_matrix):
    '''
    Compute prediction scores for all movies with item-based CF.
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        scores: np.array, predicted scores of each video for the target user
    '''

    # compute mean rating yk, ignoring zero entries, shape:(num_movies)
    mean_ratings = np.sum(user2movie, axis=0) / (user2movie != 0 ).sum(axis=0)
    
    # compute ykj - yk, shape:(num_users, num_movies)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis=0)
    
    # compute sum of similarities Σsimik, (num_movies,)
    sim_sum = np.sum(np.abs(similarity_matrix), axis=1)
    
    # don't sum the unknown entries, set them to 0
    user2movie_diff[np.where(user2movie == 0) ] = 0
    
    # compute weighted sum of rating diff (num_users, num_movies)
    weighted_sum = np.matmul(user2movie_diff, similarity_matrix) / np.expand_dims(sim_sum, axis=0)

    # add weighted sum to mean ratings
    scores =  weighted_sum + np.expand_dims(mean_ratings, 0)
    
    return  scores

predictions = compute_icf(user2movie, similarity_matrix) 
predictions

array([[ nan, 2.5 ,  nan,  nan, 3.25],
       [ nan, 2.5 ,  nan,  nan, 2.75],
       [ nan, 3.25,  nan,  nan, 2.25],
       [ nan, 1.75,  nan,  nan, 2.75]])