In [2]:
import numpy as np
import pandas as pd

In [3]:
rating_df = pd.read_csv('data/ratings.csv')
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [25]:
rating_df.shape

(100004, 4)

In [26]:
rating_df.rating.value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [27]:
rating_df.nunique()

userId         671
movieId       9066
rating          10
timestamp    78141
dtype: int64

In [28]:
# sort data frame by timestemp for splitting
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [29]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of movies)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [30]:
user_ids

array([382, 382, 382, ..., 250, 250, 250], dtype=int64)

In [31]:
movie_ids

array([  20,   45,  870, ..., 7648, 4604, 1352], dtype=int64)

In [32]:
#training/ val
num_train = int(len(user_ids) * 0.8)
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]

In [33]:
# set up user-movie matrix
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1

#2-d matrix
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings

In [34]:
print("num_users:",num_users)
print("num_movies:",num_movies)
print("train_user_ids:",train_user_ids)
print("train_movie_ids:",train_movie_ids)
print("train_ratings:",train_ratings)
user2movie

num_users: 671
num_movies: 9066
train_user_ids: [382 382 382 ... 358 358 358]
train_movie_ids: [  20   45  870 ... 2540 3259  819]
train_ratings: [3.  5.  3.  ... 4.5 0.5 2. ]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [35]:
# compute similarity
def pearson_correlation(x,y):
    '''
    Compute the pearson correlation for x and y.
    args:
        x: np.array
        y: np.array
    '''
    
    # set up filters for entries where x and y are both non zero
    filt = (x != 0) * (y != 0)
    
    # compute mean for x and y
    x_mean = x.sum() / x[x!=0].shape[0]
    y_mean = y.sum() / y[y!=0].shape[0]
    
    # consider only entries based on the previous filter
    x = x[filt]
    y = y[filt]
    
    # compute correlation
    np.seterr(invalid='ignore')
    corr = np.sum((x - x_mean) * (y-y_mean)) / (np.sum((y-y_mean) ** 2) * np.sum((x-x_mean) ** 2) ) ** 0.5
    
    return corr

def compute_user_similarity_matrix(user2movie):
    '''
    Compute user similarity matrix
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        similarity_matrix: np.array, user 2 user similarity
    
    '''
    # initialize similarity matrix
    similarity_matrix = np.zeros([num_users, num_users])
    
    for i in range(len(user2movie)):
        for j in range(i, len(user2movie)):
            
            # compute correlation
            corr = pearson_correlation(user2movie[i],user2movie[j])

            # store correlation in matrix
            similarity_matrix[i,j] = corr
            similarity_matrix[j,i] = corr
            
    return similarity_matrix

similarity_matrix = compute_user_similarity_matrix(user2movie)
similarity_matrix

array([[ 1.        ,         nan,         nan, ...,  1.        ,
                nan, -1.        ],
       [        nan,  1.        , -0.01606095, ..., -0.95740201,
        -0.49446359,  0.57048961],
       [        nan, -0.01606095,  1.        , ...,  0.97327676,
        -0.08336072,  0.52261992],
       ...,
       [ 1.        , -0.95740201,  0.97327676, ...,  1.        ,
         1.        ,  0.74128209],
       [        nan, -0.49446359, -0.08336072, ...,  1.        ,
         1.        ,  0.26704067],
       [-1.        ,  0.57048961,  0.52261992, ...,  0.74128209,
         0.26704067,  1.        ]])

In [36]:
#try
#similarity_matrix[np.arange(num_users), np.arange(num_users)] = 99
#similarity_matrix

In [37]:
# set similarity with itself to 0 (set diagonal =0)
similarity_matrix[np.arange(num_users), np.arange(num_users)] = 0
similarity_matrix[np.isnan(similarity_matrix)] = 0
similarity_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        , -1.        ],
       [ 0.        ,  0.        , -0.01606095, ..., -0.95740201,
        -0.49446359,  0.57048961],
       [ 0.        , -0.01606095,  0.        , ...,  0.97327676,
        -0.08336072,  0.52261992],
       ...,
       [ 1.        , -0.95740201,  0.97327676, ...,  0.        ,
         1.        ,  0.74128209],
       [ 0.        , -0.49446359, -0.08336072, ...,  1.        ,
         0.        ,  0.26704067],
       [-1.        ,  0.57048961,  0.52261992, ...,  0.74128209,
         0.26704067,  0.        ]])

In [39]:
def compute_ucf(user2movie, similarity_matrix):
    '''
    Compute prediction scores for all movies
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        scores: np.array, predicted scores of each video for the target user
    '''

    # compute mean rating yk, ignoring zero entries, shape:(num_users)
    mean_ratings = np.sum(user2movie, axis=1) / (user2movie != 0 ).sum(axis=1)
    
    # compute ykj - yk, shape:(num_users, num_movies)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, 1)
    
    # compute sum of similarities Σsimik, (num_users,)
    sim_sum = np.sum(np.abs(similarity_matrix), axis=1)
        
    # don't sum the unknown entries, set them to 0        
    user2movie_diff[np.where(user2movie == 0) ] = 0

    # compute weighted sum of rating diff (num_users, num_movies)
    weighted_sum = np.matmul(similarity_matrix, user2movie_diff) / np.expand_dims(sim_sum, 1)

    # add weighted sum to mean ratings
    scores =  weighted_sum + np.expand_dims(mean_ratings, 1)
    
    return  scores

predictions = compute_ucf(user2movie, similarity_matrix) 
predictions

array([[2.60394488, 2.52142328, 2.53811983, ..., 2.55      , 2.55      ,
        2.55      ],
       [3.51715353, 3.4744639 , 3.4901902 , ..., 3.48684211, 3.48684211,
        3.48684211],
       [3.56477871, 3.56355235, 3.55132752, ..., 3.56862745, 3.56862745,
        3.56862745],
       ...,
       [3.42098868, 3.31653474, 3.328678  , ..., 3.35135135, 3.35135135,
        3.35135135],
       [3.89694577, 3.80882204, 3.78673663, ..., 3.80645161, 3.80645161,
        3.80645161],
       [4.11243842, 3.90790131, 3.88788021, ..., 3.9173913 , 3.9173913 ,
        3.9173913 ]])

In [40]:
def dcg_at_k(r, k):
    '''
    Compute DCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed dcg
        
    '''
    r = r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    return dcg


In [41]:
def ndcg_at_k(r, k, method=0):
    '''
    Compute NDCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed ndcg
        
    '''
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)

    return dcg_at_k(r, k) / dcg_max

In [42]:
# compute average ndcg for all users
def evaluate_prediction(predictions):
    '''
    Return the average ndcg for each users
    args:
        predictions: np.array user-item predictions
    returns:
        ndcg: float, computed NDCG
    '''
    ndcgs = []
    # iterate
    for target_user in np.unique(val_user_ids):
        # get movie ids and ratings associated with the target user.
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user] 
        target_val_ratings = val_ratings[val_user_ids == target_user] 
        
        # compute ndcg for this user
        ndcg = ndcg_at_k(target_val_ratings[np.argsort(-predictions[target_user][target_val_movie_ids])], k=30)
        ndcgs.append(ndcg)
    ndcg = np.mean(ndcgs)
    return ndcg
evaluate_prediction(predictions)

0.8326887437634239

In [43]:
def compute_item_similarity_matrix(user2movie):
    '''
    Compute item similarity matrix
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        similarity_matrix: np.array, item-item similarity
    
    '''
    # compute mean for each user
    x_mean = user2movie.sum(axis=0) / (user2movie!=0).sum(axis=0)
    
    # set up filter for zero entries
    filt = (user2movie==0)
    
    # compute rating difference
    rating_diff = user2movie - np.expand_dims(x_mean, axis=0)
    rating_diff[filt] = 0

    # compute similarity
    similarity_matrix = np.matmul(rating_diff.T, rating_diff) / (np.matmul(rating_diff.T  ** 2, (rating_diff != 0)) * np.matmul(rating_diff.T  ** 2, (rating_diff != 0)).T) ** 0.5

    return similarity_matrix

similarity_matrix = compute_item_similarity_matrix(user2movie)
similarity_matrix

array([[1.        , 0.32949699, 0.14883085, ...,        nan,        nan,
               nan],
       [0.32949699, 1.        , 0.11771157, ...,        nan,        nan,
               nan],
       [0.14883085, 0.11771157, 1.        , ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [44]:
# set similarity with itself to 0
similarity_matrix[np.arange(num_movies), np.arange(num_movies)] = 0
similarity_matrix[np.isnan(similarity_matrix)] = 0
similarity_matrix[:10]

array([[ 0.        ,  0.32949699,  0.14883085, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.32949699,  0.        ,  0.11771157, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.14883085,  0.11771157,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.75689798,  0.79614979,  0.71287669, ...,  0.        ,
         0.        ,  0.        ],
       [-0.21912884,  0.5874723 ,  0.34578862, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02433877,  0.43192313, -0.08695837, ...,  0.        ,
         0.        ,  0.        ]])

In [156]:
def compute_icf(user2movie, similarity_matrix):
    '''
    Compute prediction scores for all movies with item-based CF.
    args:
        user2movie: np.array, user-movie rating matrix
    returns:
        scores: np.array, predicted scores of each video for the target user
    '''

    # compute mean rating yk, ignoring zero entries, shape:(num_movies)
    mean_ratings = np.sum(user2movie, axis=0) / (user2movie != 0 ).sum(axis=0)
    
    # compute ykj - yk, shape:(num_users, num_movies)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis=0)
    
    # compute sum of similarities Σsimik, (num_movies,)
    sim_sum = np.sum(np.abs(similarity_matrix), axis=1)
    
    # don't sum the unknown entries, set them to 0
    user2movie_diff[np.where(user2movie == 0) ] = 0
    
    # compute weighted sum of rating diff (num_users, num_movies)
    weighted_sum = np.matmul(user2movie_diff, similarity_matrix) / np.expand_dims(sim_sum, axis=0)

    # add weighted sum to mean ratings
    scores =  weighted_sum + np.expand_dims(mean_ratings, 0)
    
    return  scores

predictions = compute_icf(user2movie, similarity_matrix) 
predictions

array([[ nan, 2.5 ,  nan,  nan, 3.25],
       [ nan, 2.5 ,  nan,  nan, 2.75],
       [ nan, 3.25,  nan,  nan, 2.25],
       [ nan, 1.75,  nan,  nan, 2.75]])

In [157]:
evaluate_prediction(predictions)

1.0