In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [23]:
# Column labels for the ratings file, which is read as tab-separated text without a header row.
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', header=None, names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [25]:
# Count the distinct user and item IDs that appear in the ratings table.
n_users = len(df['user_id'].unique())
n_items = len(df['item_id'].unique())
print(str(n_users) + ' users')
print(str(n_items) + ' items')

943 users
1682 items


In [26]:
# Variable `ratings` holds our user-item rating matrix (similar to our utility matrix):
# one row per user, one column per item; cells that are never written stay 0.
# The IDs in `df` are used as 1-based positions, hence the "- 1" on both indices.

ratings = np.zeros((n_users, n_items))
for record in df.itertuples(index=False):
    ratings[record.user_id - 1, record.item_id - 1] = record.rating
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [28]:
# Percentage of user-item cells that hold a rating: non-zero entries / total cells * 100.
n_rated = len(ratings.nonzero()[0])
n_cells = ratings.shape[0] * ratings.shape[1]
sparsity = float(n_rated) / n_cells * 100
print('Sparsity: {:4.2f}%'.format(sparsity))

# This tells us how sparse our matrix is: only 6.3% of the possible user-item pairs have a rating.

Sparsity: 6.30%


In [30]:
# Function to split our ratings matrix into training and testing data.

def train_test_split(ratings, n_test=10, seed=None):
    """Hold out a fixed number of ratings per user as a test set.

    For every row (user) of ``ratings``, ``n_test`` of the non-zero entries
    are picked at random without replacement, zeroed in the training copy
    and copied into the test matrix. ``ratings`` itself is not modified.

    Parameters
    ----------
    ratings : 2-D numpy array
        User-item matrix in which 0 marks a missing rating.
    n_test : int, optional
        Number of ratings to hold out per user. Defaults to 10, the value
        that was previously hard-coded.
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` drives the
        sampling so the split is reproducible. When ``None`` (default) the
        global ``np.random`` state is used, exactly as before.

    Returns
    -------
    (train, test) : tuple of numpy arrays, each with the shape of ``ratings``.

    Raises
    ------
    ValueError
        If some user has fewer than ``n_test`` non-zero ratings.
    """
    # Both the np.random module and a RandomState instance expose .choice().
    rng = np.random if seed is None else np.random.RandomState(seed)
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        if rated_items.size < n_test:
            # Fail with a message that names the offending row instead of
            # the generic sampling error raised by choice().
            raise ValueError(
                'user row {} has only {} ratings; cannot hold out {}'.format(
                    user, rated_items.size, n_test))
        test_ratings = rng.choice(rated_items,
                                  size=n_test,
                                  replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test

# Seed NumPy's global RNG so the random hold-out, and every metric computed
# from it further down, is the same on each Restart & Run All.
np.random.seed(0)
train, test = train_test_split(ratings)

In [31]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between every pair of users or every pair of items.

    Parameters
    ----------
    ratings : 2-D numpy array
        Matrix whose rows are treated as users and columns as items.
    kind : {'user', 'item'}
        'user' compares rows of ``ratings`` (result is rows x rows);
        'item' compares columns (result is columns x columns).
    epsilon : float, optional
        Small constant added to every entry of the dot-product matrix.

    Returns
    -------
    2-D numpy array of pairwise similarities, symmetric.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    # epsilon -> small number for handling divide-by-zero errors
    # (an all-zero row/column would otherwise have a zero norm).
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously this fell through and failed later on the unbound `sim`.
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared length; dividing by the norms
    # along rows and then columns turns dot products into cosines.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [40]:
# Pairwise similarity matrices, computed from the training ratings only.
item_similarity = fast_similarity(train, 'item')
user_similarity = fast_similarity(train, 'user')
# The full item-item matrix is very large, so only its first 4 rows and columns are printed.
print(item_similarity[:4, :4])

[[1.         0.38003054 0.33412145 0.45924272]
 [0.38003054 1.         0.25924382 0.46065394]
 [0.33412145 0.25924382 1.         0.33497463]
 [0.45924272 0.46065394 0.33497463 1.        ]]


In [34]:
# The matrix printed above is the top-left corner of the item-item similarity matrix:
# entry (i, j) is the similarity between items i and j in our dataset.

def predict_fast_simple(ratings, similarity, kind='user'):
    """Predict every rating as a similarity-weighted average of known ratings.

    Parameters
    ----------
    ratings : 2-D numpy array
        Matrix whose rows are treated as users and columns as items.
    similarity : 2-D numpy array
        Square similarity matrix: rows x rows of ``ratings`` for
        ``kind='user'``, columns x columns for ``kind='item'``.
    kind : {'user', 'item'}
        Which neighbourhood to average over.

    Returns
    -------
    2-D numpy array of predictions with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        # Weight the other users' ratings by similarity, then divide each row
        # by that user's total absolute similarity (column vector).
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        # Same idea over items; the normaliser is a row vector so it
        # broadcasts across the item columns.
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    # Previously an unknown kind silently returned None.
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

In [36]:
def get_mse(pred, actual):
    """Mean squared error over the entries where ``actual`` is non-zero.

    Parameters
    ----------
    pred : numpy array
        Predicted ratings, same shape as ``actual``.
    actual : numpy array
        Reference ratings; zeros are treated as "no rating" and skipped.

    Returns
    -------
    The value returned by ``mean_squared_error`` for the selected entries.
    """
    # Keep only the non-zero terms of `actual`; compute the index tuple once
    # and use it for both arrays so they stay aligned.
    known = actual.nonzero()
    # Indexing an ndarray with the nonzero() tuple already yields 1-D arrays.
    # Arguments follow scikit-learn's (y_true, y_pred) order; MSE is symmetric,
    # so the result is unchanged.
    return mean_squared_error(actual[known], pred[known])

In [38]:
# Predict ratings from the training matrix using user-user similarity
user_prediction = predict_fast_simple(train, user_similarity, 'user')

# Predict ratings from the training matrix using item-item similarity
item_prediction = predict_fast_simple(train, item_similarity, 'item')

# Score both sets of predictions against the held-out test ratings.
print('User-based CF MSE: ' + str(get_mse(user_prediction, test)))
print('Item-based CF MSE: ' + str(get_mse(item_prediction, test)))

User-based CF MSE: 8.454547772457058
Item-based CF MSE: 11.608851716966937


In [None]:
# For user-based collaborative filtering, the user-similarity matrix consists of cosine similarities that measure
# how alike every pair of users is.
# Likewise, the item-similarity matrix measures the similarity between every pair of items.

# Here we are calculating both the matrices and finding the MSE values.