In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle
import surprise as srp
import time

time: 328 ms


In [3]:
# path to dataset file
#file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# As we're loading a custom dataset, we need to define a reader. Each line of
# 'data/training.dat' is parsed as 'user item rating timestamp', with the
# fields separated by ',' (see `sep` below -- not tab-separated like the
# stock movielens-100k file referenced in the disabled path above).
reader = srp.Reader(line_format='user item rating timestamp', sep=',')

# Load the ratings file and split it into 3 folds for cross-validation.
data = srp.Dataset.load_from_file('data/training.dat', reader=reader)
data.split(n_folds=3)


time: 1.27 s


In [4]:
# k-NN with baseline ratings, using the 'pearson_baseline' similarity with
# user_based=False. The SVD alternative on the next line is left disabled.
#algo = srp.SVD(n_factors=16)
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = srp.KNNBaseline(sim_options=sim_options)

# Cross-validate over the folds created earlier. NOTE: `algo`, `predictions`
# and `rmse` are rebound on every iteration, so after the loop they hold the
# state of the LAST fold only; later cells query that last-fold model.
for trainset, testset in data.folds():

    # train and test algorithm.
    algo.train(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    rmse = srp.accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8856
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8860
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8881
time: 2min 42s


In [5]:
uid = str(1)  # raw user id (as in the ratings file). They are **strings**!
iid = str(48)  # raw item id (as in the ratings file). They are **strings**!

# Get a prediction for one specific (user, item) pair from the model trained
# on the last fold. r_ui is the reference rating that is echoed in the
# verbose output next to the estimate.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 1          item: 48         r_ui = 4.00   est = 3.34   {u'actual_k': 16, u'was_impossible': False}
time: 2.01 ms


In [6]:
# Inspect the second prediction produced on the last fold's test set.
predictions[1]

Prediction(uid='2909', iid='3811', r_ui=4.0, est=4.5549507356041481, details={u'actual_k': 40, u'was_impossible': False})

time: 77.1 ms


In [7]:
# Same (uid, iid) query with r_ui=5: the reference rating is echoed back in
# the verbose output while the printed estimate stays the same.
pred = algo.predict(uid, iid, r_ui=5, verbose=True)

user: 1          item: 48         r_ui = 5.00   est = 3.34   {u'actual_k': 16, u'was_impossible': False}
time: 161 ms


In [8]:
# Same (uid, iid) query with r_ui=1: again only the echoed reference rating
# differs in the verbose output.
pred = algo.predict(uid, iid, r_ui=1, verbose=True)

user: 1          item: 48         r_ui = 1.00   est = 3.34   {u'actual_k': 16, u'was_impossible': False}
time: 94.6 ms


In [9]:
# Same (uid, iid) query with r_ui=0 as a placeholder reference rating; the
# printed estimate is still the same, which is why the helper functions
# further down pass r_ui=0.
pred = algo.predict(uid, iid, r_ui=0, verbose=True)

user: 1          item: 48         r_ui = 0.00   est = 3.34   {u'actual_k': 16, u'was_impossible': False}
time: 50.7 ms


In [10]:
# print() renders the prediction in the formatted one-line form instead of
# the namedtuple-style repr shown by a bare expression.
print(predictions[1])

user: 2909       item: 3811       r_ui = 4.00   est = 4.55   {u'actual_k': 40, u'was_impossible': False}
time: 63.6 ms


In [11]:
# A prediction can be indexed positionally; position 0 is the raw user id.
print(predictions[1][0])

2909
time: 63.3 ms


In [12]:
# Held-out ratings: comma-separated, no header row, so columns are numbered
# 0..3 (the printed shape shows 4 columns).
test_file = pd.read_table('data/test.csv', sep = ',', header=None, engine='python')
print(test_file.shape)
# Movie catalogue: fields are separated by the two-character token '::',
# no header row (the printed shape shows 3 columns).
movie_file = pd.read_table('ml-1m/movies.dat', sep = '::', header=None, engine='python')
print(movie_file.shape)

(297398, 4)
(3883, 3)
time: 1.54 s


In [13]:
# Two-way mapping between raw ids and dense matrix positions:
#   movies[position]        -> raw movie id
#   movie_indices[raw id]   -> position
# and likewise test_users / test_user_indices for users.
test_users = np.unique(test_file[0])  # sorted unique raw user ids (column 0 of the test set)
movies = np.unique(movie_file[0])  # sorted unique raw movie ids (column 0 of the catalogue)

# Dimensions of the dense user x movie grid used by the later cells.
test_number_of_rows = len(test_users)
number_of_columns = len(movies)

# Reverse lookups: raw id -> row / column position.
movie_indices = {movie_id: position for position, movie_id in enumerate(movies)}
test_user_indices = {user_id: position for position, user_id in enumerate(test_users)}

print(len(movie_indices))

3883
time: 11.1 ms


In [14]:
# Sparse user x movie matrix of the held-out ratings. Each test record's raw
# user id and raw movie id are translated to their row / column positions and
# the rating is stored in that cell.
test_V = sp.lil_matrix((test_number_of_rows, number_of_columns))
for record in test_file.values:
    user_id, movie_id, rating, _timestamp = (int(field) for field in record)
    row = test_user_indices[user_id]
    column = movie_indices[movie_id]
    test_V[row, column] = rating

time: 1.89 s


In [15]:
#P = sp.lil_matrix((test_number_of_rows, number_of_columns))
#for user in range(test_number_of_rows):
#    for movie in np.nonzero(test_V[user,:])[1]:
#        pred = algo.predict(str(test_users[user]), str(movies[movie]), r_ui=0, verbose=False)
#        P[user, movie] = pred[3]
#P = P.todense()
#print(P.shape)

time: 933 µs


In [16]:
def predict(index):
    """Estimate ratings for every movie the user at row `index` has in test_V.

    For each column with a non-zero entry in row `index` of `test_V`, the
    trained `algo` is queried with the raw (string) user and movie ids and
    the estimate is written to the same column of the result.

    Returns a dense 1 x number_of_columns matrix; columns that were not
    queried stay 0.
    """
    raw_user_id = str(test_users[index])
    rated_columns = test_V[index, :].nonzero()[1]

    estimates = sp.lil_matrix((1, number_of_columns))
    for column in rated_columns:
        # r_ui is only a reference value passed along with the query; 0 is a placeholder.
        prediction = algo.predict(raw_user_id, str(movies[column]), r_ui=0, verbose=False)
        estimates[0, column] = prediction.est
    return estimates.todense()

time: 58.2 ms


In [17]:
def recommend(index):
    """Return the raw movie ids predicted for user row `index`, best first.

    Only columns with a non-zero estimate from `predict(index)` are ranked.
    The result is a plain list of entries taken from `movies`, ordered by
    descending estimate; it is empty when there is nothing to rank.
    """
    scores = predict(index)
    candidate_columns = np.nonzero(scores[0, :])[1]
    candidate_scores = np.asarray(scores[0, candidate_columns])

    # argsort is ascending along the single row; reversing the columns turns
    # it into a descending ranking.
    ranking = candidate_scores.argsort()[:, ::-1]

    return [movies[candidate_columns[position]] for position in ranking[0]]

time: 53.6 ms


In [18]:
result = recommend(0)
print(result)

[527, 2762, 260, 150, 1193, 608, 1097, 2797, 1962, 1022, 1907, 2321, 1961, 2340, 48]
time: 56.2 ms


In [19]:
precisionAt = 5
def computeUserAccuracy(index, cutoff=None, max_rating=5):
    """Rank-weighted accuracy of the top recommendations for one user.

    The first k movies returned by `recommend(index)` are weighted
    k, k-1, ..., 1 (best recommendation weighs most). Each weight is
    multiplied by the rating stored for that movie in row `index` of
    `test_V`, and the sum is normalised by the largest attainable value,
    (k * (k + 1) / 2) * max_rating.

    Parameters
    ----------
    index : int
        Row position of the user in `test_V`.
    cutoff : int, optional
        How many recommendations to score. Defaults to the module-level
        `precisionAt`, read at call time. Capped at the number of
        recommendations actually available.
    max_rating : number, optional
        Upper end of the rating scale used for normalisation (default 5).

    Returns
    -------
    float
        The normalised score, or 0.0 when the user has no recommendations
        or the effective cut-off is not positive.
    """
    computedMovies = recommend(index)
    if not computedMovies:
        return 0.0

    if cutoff is None:
        cutoff = precisionAt
    top_k = min(cutoff, len(computedMovies))
    if top_k <= 0:
        # Nothing to score; also avoids dividing by a zero weight sum below.
        return 0.0

    # Sum of the weights top_k + (top_k - 1) + ... + 1.
    sumWeight = (top_k * (top_k + 1)) / 2

    weightedSum = 0
    # Only the first top_k recommendations contribute, so only those are visited.
    for rank, recommendation in enumerate(computedMovies[:top_k]):
        weight = top_k - rank
        weightedSum = weightedSum + test_V[index, movie_indices[recommendation]] * weight

    return float(weightedSum / (sumWeight * max_rating))

time: 87.7 ms


In [20]:
def computeAccuracy():
    """Print per-user accuracies and their mean over all rows of `test_V`.

    Output, in order: one line per user with that user's
    `computeUserAccuracy` value, then the number of users whose value was 0
    (treated as "no score available"), then the mean accuracy over the
    remaining users. When no user has a non-zero score the mean is
    undefined and NaN is printed instead of dividing by zero.
    """
    total_users = test_V.shape[0]
    empty = 0
    sumUserAccuracy = 0.0

    for user in range(total_users):
        userAccuracy = computeUserAccuracy(user)
        # A value of 0 marks a user without a usable score; such users are
        # excluded from the denominator of the mean.
        if userAccuracy == 0:
            empty = empty + 1
        sumUserAccuracy = sumUserAccuracy + userAccuracy
        print(userAccuracy)

    print(empty)

    scored_users = total_users - empty
    if scored_users == 0:
        # No user contributed a score: the average is undefined.
        print(float('nan'))
    else:
        print(float(sumUserAccuracy / scored_users))

time: 109 ms


In [21]:
computeAccuracy()

0.906666666667
0.933333333333
0.906666666667
0.706666666667
0.72
0.893333333333
0.96
0.986666666667
0.946666666667
0.92
0.893333333333
0.68
0.76
0.733333333333
0.8
0.826666666667
0.933333333333
1.0
0.76
0.88
0.84
0.826666666667
0.786666666667
0.92
0.8
0.693333333333
0.96
0.88
0.746666666667
0.786666666667
0.906666666667
0.76
0.92
0.933333333333
0.706666666667
0.893333333333
0.826666666667
0.906666666667
1.0
0.853333333333
0.773333333333
0.786666666667
0.853333333333
0.933333333333
0.533333333333
1.0
0.746666666667
0.826666666667
0.933333333333
0.76
0.906666666667
0.813333333333
1.0
0.8
1.0
0.92
0.866666666667
0.933333333333
0.76
0.386666666667
0.613333333333
0.84
0.933333333333
0.973333333333
0.986666666667
0.88
0.893333333333
0.853333333333
0.986666666667
0.84
0.906666666667
0.706666666667
0.973333333333
1.0
0.866666666667
0.96
0.693333333333
0.8
0.88
0.893333333333
1.0
0.973333333333
0.866666666667
0.906666666667
0.546666666667
0.84
0.84
0.933333333333
0.506666666667
0.906666666667
1

In [22]:
computeUserAccuracy(0)

0.9066666666666666

time: 8.31 ms
