# Install and load necesary packages

In [1]:
import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# obtain top 500 users and top 500 items
user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(500).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(500).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467


# Split dataset

## Randomly select one rating from each user as test set

In [3]:
# remap user and item ID
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
avg_num = df.groupby('user_id').size().mean()
density = df.shape[0] / (n_users * n_items)
min_ratings = df.rating.min()
max_ratings = df.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [5]:
# Convert the format of datasets to matrices
# Train dataset
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [6]:
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Function for calculating user's similarity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))

    # Compute Pearson Correlation Coefficient of All Pairs of Users between active set and training dataset
    for i, user_i_vec in enumerate(imputed_train_ds):
        for j, user_j_vec in enumerate(imputed_train_ds):

            # ratings corated by the current pair od users
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # corrated item index, skip if there are no corrated ratings
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
            if len(corrated_index) == 0:
                continue

            # average value of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

            # compute pearson corr
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predicted_ds = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            
            mask_rateditem_user = imputed_train_ds[i] != 0
            num_rated_items = mask_rateditem_user.astype(np.float32)
            user_mean = np.sum(imputed_train_ds[i, mask_rateditem_user]) / (num_rated_items.sum() + EPSILON)

            mask_nei_rated_items = sim_users != 0
            num_rated_per_user = mask_nei_rated_items.astype(np.float32)
            num_per_user = num_rated_per_user.sum(axis=1)

            sum_per_user = sim_users.sum(axis=1)
            sim_user_mean = sum_per_user / (num_per_user + EPSILON)
            
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predicted_ds[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predicted_ds

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [7]:
user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [8]:
MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# user-KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient utilising item popularity)

In [9]:
EPSILON = 1e-9

def user_corr2(imputed_train_ds):
    '''
    Function for calculating user's similarity utilising item popularity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))
    # Step A: Compute item popularity for every items
    popularity_vec = np.count_nonzero(imputed_train_ds, axis=0)
    
    # Step B: Compute Pearson Correlation Coefficient of All Pairs of Users between active sets in training dataset
    
    # make a copy of the training dataset
    train_cur = imputed_train_ds.copy()
    
    # for each pairs of active users
    for i, user_i_vec in enumerate(train_cur):
        for j, user_j_vec in enumerate(train_cur):

            # a) Rating data making up
            # a) Step 1: find the union set of items voted by user i or user j
            i_rated = np.where(user_i_vec > 0)
            j_rated = np.where(user_j_vec > 0)
            rated_index = np.union1d(i_rated, j_rated)
            
            
            # a) Step 2: fill missing values of the union set with the average rating of corresponding user
             
                # average rating of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)
           
                # find the set of items voted by user i but not voted by user j and vice versa
            j_notrated = np.setdiff1d(i_rated,j_rated)
            i_notrated = np.setdiff1d(j_rated,i_rated)

                # fill missing values with the average rating of corresponding user
            user_i_vec[i_notrated] = mean_user_i
            user_j_vec[j_notrated] = mean_user_j
            
            # corrated item index, skip if there are no corrated ratings
            if len(rated_index) == 0:
                continue
                        
            
            # b) Compute similarity utilising item popularity
            
                # popularity significance weight of item t
            popularity_t = popularity_vec[rated_index]     
            weight_t = np.log(imputed_train_ds.shape[0]/(popularity_t + EPSILON))
            weight_t_sq = np.square(weight_t)
           

                # compute pearson correlation coefficient with item popularity weight
            user_i_sub_mean = user_i_vec[rated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[rated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)


            r_w_ui_sum_sqrt = np.sqrt(np.sum(weight_t_sq*r_ui_sub_r_i_sq))
            r_w_uj_sum_sqrt = np.sqrt(np.sum(weight_t_sq*r_uj_sub_r_j_sq))

            sim = np.sum(weight_t_sq*user_i_sub_mean * user_j_sub_mean) / (r_w_ui_sum_sqrt * r_w_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict2(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predictions = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set, 
            # find top-k most similar users as the current user, remove itself
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            user_mean = np.sum(imputed_train_ds[i]) / (np.sum(np.clip(imputed_train_ds[i], 0, 1)) + EPSILON)
            sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

            # select the users who rated item j           
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(x, u) * (r_uj - mean_u)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predictions[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predictions

def evaluate2(test_ds, predictions):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE (Compute mean absolute error)
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predictions[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE (Compute root mean square error)
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predictions[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE


MAE = 0 # 0 is an intial value
RMSE = 0 # 0 is an intial value


user_pearson_corr = user_corr2(train_ds)
predictions = predict2(test_ds, train_ds, user_pearson_corr, k=20)
MAE, RMSE = evaluate2(test_ds, predictions)





# Recommendation list of top_N items

def recommend_active(prediction, dataforpredict, top_N=5, activeuser = 2):
    '''
    Function for generating the recommendation items list for active user a 
    based on the Top-N highest prediction value.
    '''
    # Make a copy of prediction set
    pred_cur = prediction.copy()
    
    # If the active user i rated item j already, we exclude item j from prediction/recommendations
    # by setting its value to zero
    for (i, j), rating in np.ndenumerate(dataforpredict):
        if rating >0:
            pred_cur[i][j] = 0
            
    # Make top_N recommendations to the active user by sorting the predictions of that users for all items        
    itemSortInd = pred_cur[activeuser,:].argsort()[::-1][:top_N]
    
    reclist = print('The index of Top', top_N, 'recommended items for active user', activeuser, 'are:', itemSortInd)
    
    return reclist


# the recommendation items list for active user 2 based on the Top-5 highest prediction value.
rec_active = recommend_active(predictions, test_ds, top_N=5, activeuser = 2)





def comRV(dataset, top_N=5):
    '''
    Function for listing the items with the Top-N highest composite recommendation values for a new user.
    '''
    EPSILON = 1e-9
    
    # Compute the item popularity for every items
    popularity_vec = np.count_nonzero(dataset, axis=0)
    
    # Compute the sum of rating for every items
    itemRateSum = dataset.sum(axis=0)
    
    # Compute the average of rating for every items
    mean_r_t = itemRateSum/popularity_vec
   
    
    # Compute composite recommendation value for every item
    comRV_t = popularity_vec*(mean_r_t + EPSILON)
    
    # Sort the comRV of all items and get the indexes of top_N items with the highest values
    itemSortInd = comRV_t.argsort()[::-1][:top_N]
    
    # List the Top-N most popular items and which are also rated well.
    reclist = print('The index of Top', top_N, 'recommended items for a new user are:', itemSortInd)
    
    return reclist

# recommendation list of top_5 items for new user 
# utilising composite recommendation values from training set
rec_new = comRV(train_ds, top_N=5)

The index of Top 5 recommended items for active user 2 are: [499 170 157 158 159]
The index of Top 5 recommended items for a new user are: [ 36 128 135  77  75]


## Print the MAE and RMSE of New Implementation

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7655375639836793, RMSE: 0.9874731529656616
