### Importing required packages

In [61]:
from __future__ import (absolute_import, division, print_function,unicode_literals)                                      
import pickle
import os
import pandas as pd
from surprise import KNNBasic, KNNWithMeans
from surprise import Dataset                                                     
from surprise import Reader                                                      
from surprise import dump
from surprise.accuracy import rmse
from collections import Counter
import plotly.offline as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()

### Alternative way of reading the data
'''
#Define the format
#reader = Reader(line_format='user item rating timestamp', sep='\t')

#Load the data from the file using the reader format#
#data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

#Split data into 5 folds 
#training is done on all folds except one and results scoring is done on the remaining fold
#data.split(n_folds=5)
'''

### Load a dataset whose folds (for cross-validation) are predefined by files.

In [63]:
# The return value is discarded on purpose; presumably this call is only here
# to make sure the ml-100k files are present on disk before the fold files
# are referenced below -- TODO confirm.
Dataset.load_builtin('ml-100k')

# Build the fold paths with os.path.join instead of string concatenation so
# the directory is spelled once and separators are handled by the stdlib.
ml100k_dir = os.path.join(os.path.expanduser('~'), '.surprise_data', 'ml-100k', 'ml-100k')
train_file = os.path.join(ml100k_dir, 'u1.base')
test_file = os.path.join(ml100k_dir, 'u1.test')

# A single predefined (train, test) pair is used as the only fold.
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

### Item-Item Similarity Model: algo

In [32]:
# Item-item model: cosine similarity computed between items (user_based=False).
item_sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(k=40, min_k=1, sim_options=item_sim_options)

### User-User Similarity Model: algo1

In [33]:
# User-user model: cosine similarity computed between users (user_based=True).
user_sim_options = {'name': 'cosine', 'user_based': True}
algo1 = KNNWithMeans(k=40, min_k=1, sim_options=user_sim_options)

In [34]:
# Iterate over the (trainset, testset) pairs yielded by data.folds().
# Only one (train, test) file pair was given to load_from_folds, so this
# presumably runs exactly once -- confirm.
# NOTE: trainset, testset, predictions and predictions1 remain defined after
# the loop; later cells rebind trainset/predictions from the dump files.
for trainset, testset in data.folds(): 
    # Fit both models on the same training fold.
    algo.train(trainset)
    algo1.train(trainset)
    # Predict the held-out ratings with each model.
    predictions = algo.test(testset)
    predictions1 = algo1.test(testset)
    # Report the RMSE of each model: item-item (algo) first, then user-user (algo1).
    rmse(predictions)
    rmse(predictions1)
    
                                                                               
    # Persist predictions, trainset and algorithm of each model to its own dump file.
    dump('./dump_file', predictions, trainset, algo)                           
    dump('./dump_file1', predictions1, trainset, algo1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0491
RMSE: 0.9703
The dump has been saved as file ./dump_file
The dump has been saved as file ./dump_file1


### Loading the predictions from the dump files

In [35]:
# The dumps have been saved, so they can be reloaded whenever needed.
# Load both of them back for the analysis below.
# NOTE(review): pickle.load can execute arbitrary code -- only load dump files
# that were produced by this notebook, never files from an untrusted source.
# The files are opened with `with` so the handles are closed deterministically
# instead of being leaked.
with open('./dump_file', 'rb') as dump_fh:
    dump_obj = pickle.load(dump_fh)
with open('./dump_file1', 'rb') as dump_fh1:
    dump_obj1 = pickle.load(dump_fh1)

In [36]:
# Pull the three stored entries of the first dump (item-item model) back into
# the notebook namespace, then print a one-line summary of the algorithm.
predictions, trainset, algo = (
    dump_obj[field] for field in ('predictions', 'trainset', 'algo')
)
summary_template = 'algo: {0}, k = {1}, min_k = {2}'
print(summary_template.format(algo['name'], algo['k'], algo['min_k']))

algo: KNNBasic, k = 40, min_k = 1


In [37]:
# Same unpacking for the second dump (user-user model).
predictions1, trainset1, algo1 = (
    dump_obj1[field] for field in ('predictions', 'trainset', 'algo')
)

### Creating DataFrames with all predictions

In [38]:
# Let's build a pandas dataframe with all the predictions
### ur : The users ratings. This is a dictionary containing lists of tuples of the form 
### (item_inner_id,rating). The keys are user inner ids.
### ir : The items ratings. This is a dictionary containing lists of tuples of the form 
### (user_inner_id,rating). The keys are item inner ids.
def get_Iu(uid):
    '''
    Count the items rated by a user in the global ``trainset``.

    Args:
        uid: Raw user id, as it appears in the predictions.

    Returns:
        The number of entries in ``trainset.ur`` for that user, or 0 when
        ``trainset.to_inner_uid`` raises ``ValueError`` (user was not part
        of the trainset).
    '''
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_rated = len(trainset.ur[inner_uid])
    except ValueError:
        n_rated = 0
    return n_rated
    
def get_Ui(iid):
    '''
    Count the users that rated an item in the global ``trainset``.

    Args:
        iid: Raw item id, as it appears in the predictions.

    Returns:
        The number of entries in ``trainset.ir`` for that item, or 0 when
        ``trainset.to_inner_iid`` raises ``ValueError`` (item was not part
        of the trainset).
    '''
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_raters = len(trainset.ir[inner_iid])
    except ValueError:
        n_raters = 0
    return n_raters

def _build_prediction_frame(prediction_list):
    '''
    Turn a list of predictions into a DataFrame enriched with rating counts.

    Args:
        prediction_list: Iterable of 5-field prediction records, mapped to the
            columns uid, iid, rui, est and details.

    Returns:
        A DataFrame with the five prediction columns plus:
        ItemWatchedByUser (get_Iu applied to uid), UserGaveRatingToItem
        (get_Ui applied to iid) and err (absolute difference between est
        and rui).
    '''
    frame = pd.DataFrame(prediction_list, columns=['uid', 'iid', 'rui', 'est', 'details'])
    frame['ItemWatchedByUser'] = frame['uid'].apply(get_Iu)
    frame['UserGaveRatingToItem'] = frame['iid'].apply(get_Ui)
    frame['err'] = (frame['est'] - frame['rui']).abs()
    return frame


# df holds the predictions loaded from ./dump_file, df1 those from ./dump_file1.
# Both use get_Iu/get_Ui, which read the global `trainset`.
df = _build_prediction_frame(predictions)
df1 = _build_prediction_frame(predictions1)

In [39]:
# Preview the first rows of the prediction frame built from `predictions`.
df.head()

Unnamed: 0,uid,iid,rui,est,details,ItemWatchedByUser,UserGaveRatingToItem,err
0,1,6,5.0,3.440693,"{'actual_k': 40, 'was_impossible': False}",135,20,1.559307
1,1,10,3.0,3.770565,"{'actual_k': 40, 'was_impossible': False}",135,73,0.770565
2,1,12,5.0,3.94531,"{'actual_k': 40, 'was_impossible': False}",135,211,1.05469
3,1,14,5.0,3.768122,"{'actual_k': 40, 'was_impossible': False}",135,140,1.231878
4,1,17,3.0,3.467036,"{'actual_k': 40, 'was_impossible': False}",135,72,0.467036


In [40]:
# Preview the first rows of the prediction frame built from `predictions1`.
df1.head()

Unnamed: 0,uid,iid,rui,est,details,ItemWatchedByUser,UserGaveRatingToItem,err
0,1,6,5.0,3.707803,"{'actual_k': 20, 'was_impossible': False}",135,20,1.292197
1,1,10,3.0,3.89072,"{'actual_k': 40, 'was_impossible': False}",135,73,0.89072
2,1,12,5.0,4.378248,"{'actual_k': 40, 'was_impossible': False}",135,211,0.621752
3,1,14,5.0,4.194055,"{'actual_k': 40, 'was_impossible': False}",135,140,0.805945
4,1,17,3.0,3.590459,"{'actual_k': 40, 'was_impossible': False}",135,72,0.590459


### Finding the best and worst predictions

In [41]:
# Order each frame by absolute error once, then take both ends:
# the 10 smallest errors are the best predictions, the 10 largest the worst.
df_by_err = df.sort_values(by='err')
best_predictions = df_by_err.head(10)
worst_predictions = df_by_err.tail(10)

df1_by_err = df1.sort_values(by='err')
best_predictions1 = df1_by_err.head(10)
worst_predictions1 = df1_by_err.tail(10)

In [51]:
# Show the ten rows of df1 with the largest absolute error (ascending order).
worst_predictions1

Unnamed: 0,uid,iid,rui,est,details,ItemWatchedByUser,UserGaveRatingToItem,err
9514,212,180,1.0,4.470945,"{'actual_k': 40, 'was_impossible': False}",17,179,3.470945
1087,14,176,1.0,4.476108,"{'actual_k': 40, 'was_impossible': False}",41,226,3.476108
10803,239,286,1.0,4.49455,"{'actual_k': 40, 'was_impossible': False}",86,388,3.49455
15290,312,157,1.0,4.526946,"{'actual_k': 40, 'was_impossible': False}",130,99,3.526946
7861,181,25,5.0,1.375499,"{'actual_k': 40, 'was_impossible': False}",218,231,3.624501
7390,167,169,1.0,4.672927,"{'actual_k': 40, 'was_impossible': False}",38,97,3.672927
13972,295,183,1.0,4.680678,"{'actual_k': 40, 'was_impossible': False}",100,223,3.680678
15306,312,265,1.0,4.730264,"{'actual_k': 40, 'was_impossible': False}",130,169,3.730264
15286,312,144,1.0,4.859269,"{'actual_k': 40, 'was_impossible': False}",130,195,3.859269
19140,405,575,5.0,1.0,"{'actual_k': 36, 'was_impossible': False}",582,36,4.0


### Collecting the ratings given by different users to the movie with the chosen item id

In [53]:
# Raw id of the item whose rating distribution is inspected.
itemid = '180'

# trainset.ir maps an inner item id to (user_inner_id, rating) tuples;
# count how many times each rating value was given to this item.
item_ratings = trainset.ir[trainset.to_inner_iid(itemid)]
counter = Counter(rating for (_user, rating) in item_ratings)

# x: distinct rating values, y: how often each one occurs (same order).
x = list(counter.keys())
y = list(counter.values())

### Plotting a bar graph of the rating distribution for the movie
To look for a possible explanation for the poor prediction

In [54]:
# Bar chart of the rating counts collected in x (rating value) and y (count).
layout = go.Layout(
    title='Frequency of different rating by user for item {}'.format(itemid),
    xaxis={'title': 'Rating value'},
    yaxis={'title': 'Number of users'},
)
data = [go.Bar(x=x, y=y)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='ratingDiffUSer')

### Histogram of errors for Model - item-item

In [55]:
# Histogram of the signed error (estimate minus true rating) stored in df.
signed_errors = df['est'] - df['rui']
layout = go.Layout(
    title='Error Distribution',
    xaxis={'title': 'Error Value'},
)
data = [go.Histogram(x=signed_errors)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-hist')

In [67]:
# Box plot of the signed error (estimate minus true rating) stored in df.
data = [go.Box(y=df.est - df.rui)]

# Build the layout in this cell instead of reusing the `layout` left over from
# the histogram cell: that hidden dependency breaks when cells are run out of
# order, and its 'Error Value' label sits on the x axis, whereas a box trace
# fed through `y` shows the error on the y axis.
layout = go.Layout(
    title='Error Distribution',
    yaxis=dict(
        title='Error Value'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-box')

### Histogram of errors for Model1 - user-user

In [57]:
# Histogram of the signed error (estimate minus true rating) stored in df1.
signed_errors1 = df1['est'] - df1['rui']
layout = go.Layout(
    title='Error Distribution',
    xaxis={'title': 'Error Value'},
)
data = [go.Histogram(x=signed_errors1)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-hist1')

In [68]:
# Box plot of the signed error (estimate minus true rating) stored in df1.
data = [go.Box(y=df1.est - df1.rui)]

# The error values are passed through `y`, so the 'Error Value' label belongs
# on the y axis (the original attached it to the x axis).
layout = go.Layout(
    title='Error Distribution',
    yaxis=dict(
        title='Error Value'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-box1')