## Movie Ratings Prediction based on Cosine Similarity

In [2]:
#import statements
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



## Reading the data

In [15]:
data_df = pd.read_table('u.data', sep = '\t', names = ['user_id','item_id','rating','timestamp'])

In [16]:
n_users = data_df.user_id.unique().shape[0]
n_items = data_df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 

Number of users = 943 | Number of movies = 1682


## Training and Test Dataset

In [17]:
train_data, test_data = cv.train_test_split(data_df, test_size=0.25)

In [18]:
#Create two user-item matrices, one for training and another for testing

train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

## Cosine Similarity for:
## 1. User-user
## 2. Item-item

In [19]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [20]:
user_similarity = pd.DataFrame(user_similarity)
user_similarity = user_similarity.apply(lambda x: 1-x)
user_similarity = np.array(user_similarity)

## Prediction Algorithm

In [21]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    
    return pred

In [22]:
user_prediction = predict(test_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [23]:
user_prediction

array([[ 0.61568379,  0.24581632,  0.15065831, ...,  0.05416212,
         0.05416212,  0.05416212],
       [ 0.42015   ,  0.0594122 ,  0.05927514, ..., -0.01096003,
        -0.01096003, -0.01096003],
       [ 0.29925932,  0.03985334,  0.02452377, ..., -0.03048723,
        -0.03048723, -0.03048723],
       ..., 
       [ 0.45510883,  0.06938018,  0.05061606, ..., -0.0396485 ,
        -0.0396485 , -0.0396485 ],
       [ 0.49193962,  0.12662389,  0.0471363 , ..., -0.01462906,
        -0.01462906, -0.01462906],
       [ 0.66241485,  0.27272928,  0.12429919, ...,  0.02248365,
         0.02248365,  0.02248365]])

In [68]:
item_prediction

array([[ 0.35329625,  0.35756835,  0.38020126, ...,  0.41463415,
         0.408919  ,  0.40224407],
       [ 0.08691923,  0.10033127,  0.09523011, ...,  0.10291493,
         0.10289459,  0.10294988],
       [ 0.05926121,  0.06107385,  0.0595991 , ...,  0.06008328,
         0.06003164,  0.06069585],
       ..., 
       [ 0.02885882,  0.03674335,  0.0342699 , ...,  0.041047  ,
         0.04069414,  0.04045465],
       [ 0.12234328,  0.13121356,  0.13671204, ...,  0.14277216,
         0.14122162,  0.14296893],
       [ 0.20307179,  0.19592817,  0.21982597, ...,  0.24806663,
         0.24127593,  0.24203833]])

## Error

In [70]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1274046740866255
Item-based CF RMSE: 3.4535177267392787
