## euclidean

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
import nltk

from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import cross_validation as cv
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import re
import string
import operator
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import mean_squared_error



In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
ratings_music_df = pd.read_csv('data/ratings_Digital_Music.csv', names=header)
ratings_music_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,A2EFCYXHNK06IS,5555991584,5.0,978480000
1,A1WR23ER5HMAA9,5555991584,5.0,953424000
2,A2IR4Q0GPAFJKW,5555991584,4.0,1393545600
3,A2V0KUVAB9HSYO,5555991584,4.0,966124800
4,A1J0GL9HCA7ELW,5555991584,5.0,1007683200


In [4]:
ratings_music_df_10000 = ratings_music_df.head(10000)

In [5]:
train_data, test_data = cv.train_test_split(ratings_music_df_10000, test_size=0.25)

In [6]:
train_data_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
test_data_matrix = test_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [14]:
user_similarity = pairwise_distances(train_data_matrix, n_jobs=-1, metric='manhattan')
item_similarity = pairwise_distances(train_data_matrix.T, n_jobs=-1, metric='manhattan')

In [15]:
user_similarity

array([[ 0.,  5.,  6., ...,  6.,  5.,  6.],
       [ 5.,  0.,  9., ...,  9.,  8.,  9.],
       [ 6.,  9.,  0., ..., 10.,  9., 10.],
       ...,
       [ 6.,  9., 10., ...,  0.,  9., 10.],
       [ 5.,  8.,  9., ...,  9.,  0.,  9.],
       [ 6.,  9., 10., ..., 10.,  9.,  0.]])

In [16]:
item_similarity

array([[  0., 451., 508., ..., 678., 479., 475.],
       [451.,   0.,  67., ..., 243.,  38.,  34.],
       [508.,  67.,   0., ..., 300.,  95.,  91.],
       ...,
       [678., 243., 300., ...,   0., 271., 267.],
       [479.,  38.,  95., ..., 271.,   0.,  62.],
       [475.,  34.,  91., ..., 267.,  62.,   0.]])

In [17]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        #ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        ratings_diff = ratings.sub(ratings.mean(axis=1), axis=0)
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity).div(pd.DataFrame(np.array([np.abs(similarity).sum(axis=1)])).iloc[0]).values
    return pred

In [18]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(ground_truth, prediction))

def mse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return mean_squared_error(ground_truth, prediction)

In [20]:
print ('User-based CF MSE: ' , str(mse(user_prediction, test_data_matrix.values)))
print ('Item-based CF MSE: ' , str(mse(item_prediction, test_data_matrix.values)))

User-based CF MSE:  21.14921360917411
Item-based CF MSE:  21.077187303445243
