# An ensemble method for top-N recommendations from the SVD

## SVD

In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle
import copy 
import random
import time

time: 343 ms


In [3]:
# Load the training ratings. Each row is later unpacked as
# (user id, movie id, rating, timestamp) when the rating matrix is filled.
data_file = pd.read_table('data/training.csv', sep = ',', header=None, engine='python')
print(data_file.shape)
# Load the '::'-separated movie catalogue; only its first column (the movie id)
# is read further down.
movie_file = pd.read_table('ml-1m/movies.dat', sep = '::', header=None, engine='python')
print(movie_file.shape)

(702811, 4)
(3883, 3)
time: 2.7 s


In [4]:
# `movies` maps a column index to the real movie id (the original note gives
# index 3666 -> id 3952 as an example); `movie_indices` is the inverse lookup
# from real id to column index.
users = np.unique(data_file[0])    # sorted, distinct real user ids of the training set
movies = np.unique(movie_file[0])  # sorted, distinct real movie ids of the catalogue

number_of_rows = len(users)      # one matrix row per user
number_of_columns = len(movies)  # one matrix column per movie

# real movie id -> column index of that movie
movie_indices = {movie_id: column for column, movie_id in enumerate(movies)}

# real user id -> row index of that user
user_indices = {user_id: row for row, user_id in enumerate(users)}

time: 17.3 ms


In [5]:
# Build the sparse user x movie rating matrix from the training rows.
# NOTE(review): u, i, r and t are module-level names here; `r` is read again
# by the cell that fills test_V and `u` is rebound by the svds call.
V = sp.lil_matrix((number_of_rows, number_of_columns))
for line in data_file.values:
    u, i , r , t = map(int,line)
    V[user_indices[u], movie_indices[i]] = r # look up the row/column index of the real user and movie ids and store the rating there

time: 4.67 s


In [6]:
# Truncated SVD of the rating matrix with k = 32 singular triplets:
# u holds one row of factors per user, s the singular values and vt one
# column of factors per movie.
# NOTE(review): the order in which svds returns the singular values is not
# checked anywhere below — confirm against the SciPy documentation.
u,s, vt = svds(V, k = 32)

time: 26.9 s


In [7]:
# Square matrix with the singular values on its main diagonal, zeros elsewhere.
s_diag_matrix = np.diag(s)

time: 2.59 ms


In [8]:
# Low-rank reconstruction of the rating matrix: (u . S) . vt
X_lr = u.dot(s_diag_matrix).dot(vt)

time: 135 ms


In [9]:
# Count the strictly positive entries of the last row of vt and, separately,
# all remaining (zero or negative) entries of that row.
last_item_factor = vt[s.size - 1, :]
poscounter = int(np.count_nonzero(last_item_factor > 0))
negcounter = int(last_item_factor.size) - poscounter
print(negcounter,poscounter)

(109, 3774)
time: 8.36 ms


In [10]:
# Display the dimensions of the reconstructed matrix (users x movies).
X_lr.shape

(6040, 3883)

time: 66.9 ms


##  Model for Top-N Recommendations

In [11]:
class Node:
    """One node of a factor-split tree.

    A new node starts with empty (fsize, 0) factor matrices, empty id lists,
    no split factor, a score of 0 and no children; callers fill the
    attributes in afterwards.
    """
    def __init__(self,fsize):
        # children; both stay None until the node is split
        self.left = self.right = None
        # factor this node is split on, and the score of that split
        self.factor = None
        self.score = 0
        # factor ids this node works with
        self.factors = []
        # real ids of the items / users held by this node
        self.itemList, self.userList = [], []
        # latent factor matrices, initially without any column
        self.itemFactors = np.empty((fsize, 0))
        self.userFactors = np.empty((fsize, 0))

time: 60.9 ms


In [12]:
def fillLists(V):
    """Append every known user id and movie id to a node's id lists.

    V -- node whose userList and itemList are extended in place.

    The ids are appended in the order of their stored index (the values of
    user_indices / movie_indices), not in dict iteration order.  Callers pair
    position i of these lists with row i of the user factors and column i of
    the item factors, so the order must not depend on how the dict happens to
    iterate.
    """
    V.userList.extend(sorted(user_indices, key=user_indices.get))
    V.itemList.extend(sorted(movie_indices, key=movie_indices.get))
def restartV():
    """Return a fresh root node holding all users, all items and all factors.

    The node references the full u and vt matrices, gets its id lists from
    fillLists and lists every factor index 0 .. s.size - 1.
    """
    root = Node(s.size)
    root.userFactors = u
    root.itemFactors = vt
    fillLists(root)
    root.factors = list(range(0, s.size))
    return root
# NOTE(review): this rebinds V, which until now held the sparse rating matrix.
V = restartV()
print(len(V.userList))

6040
time: 107 ms


In [13]:
def printNode(node):
    """Print a labelled summary of one node.

    The two factor matrices are reported by shape only; factor, score, left
    and right are printed as they are.
    """
    for attribute in ("itemFactors", "userFactors"):
        print(attribute + " :")
        print(getattr(node, attribute).shape)
    for attribute in ("factor", "score", "left", "right"):
        print(attribute + " :")
        print(getattr(node, attribute))
def printTree(node):
    """Print `node` and then, depth first, every node below it.

    Each existing child is announced with a "LEFT :" or "RIGHT :" line before
    its own subtree is printed.
    """
    printNode(node)
    for banner, child in (("LEFT :", node.left), ("RIGHT :", node.right)):
        if child is not None:
            print(banner)
            printTree(child)
def countLeaves(node):
    """Count the descendants of `node` that hold at most 300 items.

    A child with at most 300 item columns counts as one leaf; a larger child
    is descended into.  A missing child is reported by printing the node's
    own item count followed by "ERROR" and contributes nothing.
    """
    total = 0
    for child in (node.left, node.right):
        if child is None:
            print(node.itemFactors.shape[1])
            print("ERROR")
        elif child.itemFactors.shape[1] <= 300:
            total += 1
        else:
            total += countLeaves(child)
    return total

time: 60.4 ms


In [14]:
# Display the shape of the root's item factor matrix (factors x items).
V.itemFactors.shape

(32, 3883)

time: 70.8 ms


In [15]:
# Display the shape of the root's user factor matrix (users x factors).
V.userFactors.shape

(6040, 32)

time: 71.4 ms


In [16]:
def splitNode(node, factor):
    """Split a node's items and users in two by the sign of one latent factor.

    Items (columns of node.itemFactors) and users (rows of node.userFactors)
    whose value for `factor` is >= 0 go to node.left, all others to
    node.right; the id lists are split the same way.  Both children share the
    parent's `factors` list object.  Finally node.score is set to
    computePrecision(node) for the new split.

    The partition uses boolean masks instead of growing each child one
    np.vstack at a time, which re-copied the child on every step.  As a
    result a child always receives 2-D matrices: (fsize, n_items) and
    (n_users, fsize), also when it gets exactly one or no item / user.

    node   -- node to split; node.left, node.right and node.score are
              overwritten.
    factor -- factor id to split on; it must be an element of node.factors,
              otherwise list.index raises ValueError.
    """
    factorIndex = node.factors.index(factor)
    fsize = len(node.factors)
    node.left = Node(fsize)
    node.right = Node(fsize)
    node.left.factors = node.factors
    node.right.factors = node.factors

    items = node.itemFactors
    if items.ndim == 1:
        # a lone item stored as a flat vector: treat it as a single column
        items = items.reshape(-1, 1)
    if items.size == 0:
        # nothing to distribute; the children keep their empty (fsize, 0) matrices
        print('itemFactors not available')
    else:
        goesLeft = items[factorIndex, :] >= 0
        node.left.itemFactors = items[:, goesLeft]
        node.right.itemFactors = items[:, ~goesLeft]
        for itemId, isLeft in zip(node.itemList, goesLeft):
            (node.left if isLeft else node.right).itemList.append(itemId)

    # a lone user stored as a flat vector becomes a single row
    users = np.atleast_2d(node.userFactors)
    if users.size == 0:
        print('userFactors not available')
        node.left.userFactors = np.empty(shape = (0, fsize))
        node.right.userFactors = np.empty(shape = (0, fsize))
    else:
        goesLeft = users[:, factorIndex] >= 0
        node.left.userFactors = users[goesLeft, :]
        node.right.userFactors = users[~goesLeft, :]
        for userId, isLeft in zip(node.userList, goesLeft):
            (node.left if isLeft else node.right).userList.append(userId)

    node.score =  computePrecision(node)

time: 148 ms


In [17]:
def findTopN(matrix, N):
    """Return, for every row, the column indices of its N largest values.

    Each output row is ordered from the largest value downwards.  When the
    matrix has fewer than N columns, all of its columns are returned.
    """
    ascending = matrix.argsort(axis = 1)
    return ascending[:, ::-1][:, :N]
    

time: 82.6 ms


In [18]:
# Cut-off N used for every top-N comparison below.
precisionAt = 5
# Reference ranking: per user row, the column indices of the N highest
# values of the low-rank reconstruction X_lr.
mx = findTopN(X_lr, precisionAt)
def _countTopNHits(child, diag_matrix):
    """Count a child's top-N picks that also occur in the user's reference top-N."""
    if len(child.userList) == 0 or len(child.itemList) == 0:
        return 0
    # tolerate a lone user / lone item that was stored as a flat vector
    userFactors = np.atleast_2d(child.userFactors)
    itemFactors = child.itemFactors
    if itemFactors.ndim == 1:
        itemFactors = itemFactors.reshape(-1, 1)
    scores = np.dot(np.dot(userFactors, diag_matrix), itemFactors)
    # findTopN returns fewer than precisionAt columns when the child has fewer items
    top = findTopN(scores, precisionAt)
    hits = 0
    for row, userId in enumerate(child.userList):
        reference = mx[user_indices[userId], :]
        for column in top[row, :]:
            if movie_indices[child.itemList[column]] in reference:
                hits += 1
    return hits


def computePrecision(node):
    """Score the current split of `node` against the reference top-N lists.

    For every user of node.left and node.right the top `precisionAt` items
    are predicted from that child's own factor matrices, using the singular
    values of node.factors.  A predicted item counts as a hit when it is also
    in the user's row of `mx`.  The score is hits / (number of users of
    `node` * precisionAt); it is printed and returned.

    Compared with the first version: a node without users scores 0.0 instead
    of raising ZeroDivisionError, a child holding fewer than `precisionAt`
    items no longer raises IndexError, and exact-position matches (which were
    counted but never used) are no longer computed.

    node -- node whose left and right children are already populated.
    """
    singular = np.array([s_diag_matrix[f, f] for f in node.factors])
    diag_matrix = np.diag(singular)

    inTopN = _countTopNHits(node.left, diag_matrix) + _countTopNHits(node.right, diag_matrix)

    if len(node.userList) == 0:
        ret = 0.0
    else:
        ret = float(inTopN)/(len(node.userList)*precisionAt)
    print(ret)
    return ret
    

time: 1.52 s


In [19]:
def buildTree(node, factors):
    """Recursively split `node` on its best-scoring factor.

    Every factor in `factors` is tried with splitNode; the split with the
    highest score wins (ties go to the later factor).  The winner's children
    are then built recursively with the winning factor removed from the
    candidates.  `node` is returned in all cases.

    Splitting stops, and the node is returned unsplit, when the node holds a
    single flat item vector, no items, at most 300 items, no users, or when
    no candidate factor is left.  Each stop prints a short message.

    Changes from the first version:
      * the empty-item check runs before the threshold check (it could never
        be reached before, since 0 <= 300);
      * a node without users is not split (the original TODO);
      * `factors` is no longer modified, so a caller that also stored the
        list on the node keeps it intact;
      * node.score ends up as the winning split's score rather than the score
        of whichever factor was tried last.

    node    -- node to grow; its factor, score, left and right are set.
    factors -- candidate factor ids, each an element of node.factors.
    """
    print(factors)
    itemShape = node.itemFactors.shape
    if (len(itemShape) <= 1):
        print('one item factor')
        return node
    if (itemShape[1] == 0):
        print('no item factor')
        return node
    if (itemShape[1] <= 300):
        print("Threshold value is reached")
        return node
    if not node.userList:
        print('no users')
        return node
    if not factors:
        print('factors empty')
        return node

    winner = None
    for factor in factors:
        node.factor = factor
        splitNode(node,factor)
        if winner is None or node.score >= winner.score:
            # snapshot the node together with the children of this split
            winner = copy.deepcopy(node)

    remaining = list(factors)
    remaining.remove(winner.factor)
    node.factor = winner.factor
    node.score = winner.score
    node.left = buildTree(winner.left, remaining)
    node.right = buildTree(winner.right, remaining)
    return node
    

time: 23.7 ms


In [20]:
def getFactorGroups(size, p, a):
    """Draw random groups of factor indices out of 0 .. size - 1.

    The procedure is repeated `a` times.  Each repetition builds int(1/p)
    groups; a group is filled with randomly chosen indices until it holds at
    least size * p of them, and no index is used twice within a repetition.
    Every group is returned sorted.

    This version depends only on its arguments (it used to read the length of
    a module-level `factors` list instead of `size`).  When the indices run
    out before a group is full, the group is closed early instead of failing
    inside random.randint; a group that would be empty is left out.

    size -- number of factor indices to draw from.
    p    -- fraction of the indices per group, 0 < p <= 1.
    a    -- number of repetitions.
    Returns a list of sorted lists of ints.
    """
    groups = []
    groupsPerRound = int(1/p)
    for _round in range(0, a):
        used = set()
        for _slot in range(0, groupsPerRound):
            group = []
            while (len(group) < size * p):
                available = [k for k in range(size) if k not in used]
                if not available:
                    break
                factor = available[random.randint(0, len(available)-1)]
                used.add(factor)
                group.append(factor)
            if group:
                group.sort()
                groups.append(group)
    return groups
            

time: 133 ms


In [21]:
def buildForest(factors, p, a, sl):
    """Build one tree per random factor group and return the list of roots.

    Each root holds all users and items (via fillLists), restricted to the
    rows of vt and the columns of u that belong to its factor group.

    factors -- list of all factor ids; only its length is used here.
    p, a    -- passed on to getFactorGroups.
    sl      -- accepted but not used by this function.
    """
    forest = []
    for group in getFactorGroups(len(factors), p, a):
        print(group)
        root = Node(len(group))
        root.itemFactors = vt[group,:]
        root.userFactors = u[:,group]
        fillLists(root)
        root.factors = group
        # buildTree may remove entries from the list it is handed; pass a copy
        # so root.factors keeps one entry per row of root.itemFactors
        buildTree(root, list(group))
        forest.append(root)
    return forest

time: 72.1 ms


In [37]:
# Build either one tree over all factors or a forest over random factor groups.
singleTree = False
factors = list(range(0, s.size))
if singleTree:
    V = restartV()
    forest = [buildTree(V, factors)]
else:
    forest = buildForest(factors, 0.5, 3, 300)

[0, 1, 2, 3, 4, 8, 11, 14, 15, 16, 17, 18, 21, 24, 27, 31]
[0, 1, 2, 3, 4, 8, 11, 14, 15, 16, 17, 18, 21, 24, 27, 31]
0.421688741722
0.427417218543
0.433940397351
0.436655629139
0.433907284768
0.448741721854
0.423311258278
0.443708609272
0.43201986755
0.441026490066
0.434105960265
0.439006622517
0.420927152318
0.430662251656
0.443509933775
0.547052980132
[0, 1, 2, 3, 4, 8, 11, 14, 15, 16, 17, 18, 21, 24, 27]
0.421688741722
0.427417218543
0.433940397351
0.436655629139
0.433907284768
0.448741721854
0.423311258278
0.443708609272
0.43201986755
0.441026490066
0.434105960265
0.439006622517
0.420927152318
0.430662251656
0.443509933775
[0, 1, 2, 3, 4, 11, 14, 15, 16, 17, 18, 21, 24, 27]
0.273116089613
0.265852002716
0.286897488119
0.281534283775
0.266938221317
0.270196877122
0.278479293958
0.263543788187
0.268431771894
0.287304820095
0.297148676171
0.258587915818
0.282892057026
0.257230142566
[0, 1, 2, 3, 4, 11, 14, 15, 16, 17, 21, 24, 27]
0.158781869688
0.152974504249
0.166147308782
0.1715297

In [23]:
# Number of trees that were built.
print(len(forest))

1
time: 2.49 ms


In [24]:
# Per tree, the number of nodes countLeaves treats as leaves (at most 300 items).
for node in forest:
    print(countLeaves(node))

21
time: 80.7 ms


In [25]:
# Dump the first tree, depth first.
printTree(forest[0])

itemFactors :
(32, 3883)
userFactors :
(6040, 32)
factor :
31
score :
1.0
left :
<__main__.Node instance at 0x7f70d44c4b90>
right :
<__main__.Node instance at 0x7f70d44c4dd0>
LEFT :
itemFactors :
(32, 3774)
userFactors :
(6040, 32)
factor :
28
score :
0.681125827815
left :
<__main__.Node instance at 0x7f70d44c4710>
right :
<__main__.Node instance at 0x7f70d44c4950>
LEFT :
itemFactors :
(32, 882)
userFactors :
(4404, 32)
factor :
29
score :
0.669754768392
left :
<__main__.Node instance at 0x7f70d44c44d0>
right :
<__main__.Node instance at 0x7f70d44c4560>
LEFT :
itemFactors :
(32, 508)
userFactors :
(2436, 32)
factor :
22
score :
0.464860426929
left :
<__main__.Node instance at 0x7f70d44c8560>
right :
<__main__.Node instance at 0x7f70d44c8950>
LEFT :
itemFactors :
(32, 250)
userFactors :
(1039, 32)
factor :
None
score :
0
left :
None
right :
None
RIGHT :
itemFactors :
(32, 258)
userFactors :
(1397, 32)
factor :
None
score :
0
left :
None
right :
None
RIGHT :
itemFactors :
(32, 374)
userF

## Testing Model

In [26]:
# Load the held-out ratings; rows are unpacked below as
# (user id, movie id, rating, timestamp), like the training rows.
test_file = pd.read_table('data/test.csv', sep = ',', header=None, engine='python')
test_file.shape

(297398, 4)

time: 1.81 s


In [27]:
# As for the training data: `movies` maps a column index to the real movie id
# and `movie_indices` maps the real id back to its column index.  Here the
# matching row lookup is built for the users of the test set.
test_users = np.unique(test_file[0])  # sorted, distinct real user ids of the test set

test_number_of_rows = len(test_users)  # one matrix row per test user

# real user id -> row index in the test matrix
test_user_indices = {user_id: row for row, user_id in enumerate(test_users)}
print(len(movie_indices))

3883
time: 16.4 ms


In [28]:
# Build the sparse user x movie matrix of test ratings.  Ratings for a movie
# id that is not in movie_indices cannot be placed and are only reported.
test_V = sp.lil_matrix((test_number_of_rows, number_of_columns))
for line in test_file.values:
    test_u, test_i , test_r , test_time = map(int,line)
    if test_i in movie_indices:
        # look up the row/column index of the real user and movie ids and store the rating there
        test_V[test_user_indices[test_u], movie_indices[test_i]] = test_r
    else:
        # report the skipped rating itself; this used to print `r`, a stale
        # value left over from the training loop
        print("kekt",test_user_indices[test_u],test_i,test_r)

time: 2.67 s


In [29]:
def isLeaf(node):
    """Tell whether the tree walk has to stop at `node`.

    That is the case when its item factors are a flat vector, when it holds
    fewer than 300 items, or when it was never given a split factor.
    """
    shape = node.itemFactors.shape
    return len(shape) < 2 or shape[1] < 300 or node.factor is None
def isEmpty(node):
    """Tell whether `node` still has the item matrix of a freshly created Node.

    The comparison is against the (len(node.factors), 0) matrix a new Node
    starts with.
    """
    blank = Node(len(node.factors))
    return True if np.array_equal(node.itemFactors, blank.itemFactors) else False

time: 9.23 ms


In [30]:
def recommend(index):
    """Rank the movies a user rated in the test set, best prediction first.

    For every tree in `forest` the user is routed from the root by the sign
    of their factor value at each node's split factor, until a leaf or an
    empty child is reached.  The final node's factors and items give a
    predicted score per item; only items the user rated in test_V are kept.
    The candidates of all trees are merged, sorted by score and reduced to
    the first occurrence of every movie.

    index -- row index of the user; it is used for both u and test_V.
             NOTE(review): this assumes the training and test sets enumerate
             users identically — confirm.
    Returns a list of real movie ids.
    """
    userFactors = u[index]
    # movies the user rated in the test set; the same for every tree, so it is
    # computed once and kept as a set for constant-time membership tests
    watched = set(movies[np.nonzero(test_V[index,:])[1]].tolist())
    topList = []
    for tree in forest:
        node = tree
        while not isLeaf(node):
            child = node.left if userFactors[node.factor] >= 0 else node.right
            if isEmpty(child):
                break
            node = child
        singular = np.array([s_diag_matrix[f, f] for f in node.factors])
        relevant = np.dot(np.dot(userFactors[node.factors], np.diag(singular)), node.itemFactors)
        for i in relevant.argsort()[::-1]:
            if node.itemList[i] in watched:
                topList.append((node.itemList[i],relevant[i]))
    # stable sort: equal scores keep the order in which they were collected
    topList.sort(key=lambda x: x[1], reverse=True)
    result = []
    seen = set()
    for movie, _score in topList:
        if movie not in seen:
            seen.add(movie)
            result.append(movie)
    return result

time: 161 ms


In [31]:
def computeUserAccuracy(index):
    """Return a rank-weighted, normalised test rating of a user's top recommendations.

    The first n = min(precisionAt, number of recommendations) movies are
    weighted n, n-1, ..., 1; the weighted sum of their ratings in test_V is
    divided by (sum of the weights * 5).  Returns 0 when there is nothing to
    recommend.

    index -- row index of the user in test_V.
    """
    computedMovies = recommend(index)
    if not computedMovies:
        return 0
    counter = min(precisionAt, len(computedMovies))
    sumWeight = (counter * (counter +1)) /2
    weightedSum = 0
    for weight, recommendation in zip(range(counter, 0, -1), computedMovies):
        weightedSum = weightedSum + test_V[index, movie_indices[recommendation]] * weight
    return float(weightedSum / (sumWeight*5))

time: 58.7 ms


In [38]:
def computeAccuracy():
    """Print every user's accuracy, then the zero count and the mean.

    One line is printed per row of test_V with that user's accuracy.  It is
    followed by the number of users whose accuracy is 0 and by the mean
    accuracy over the remaining users.  When no user has a non-zero accuracy
    the mean is reported as 0.0 instead of dividing by zero.
    """
    empty = 0
    sumUserAccuracy = 0.0
    userCount = test_V.shape[0]
    for user in range(0, userCount):
        userAccuracy = computeUserAccuracy(user)
        if (userAccuracy == 0):
            empty = empty + 1
        sumUserAccuracy = sumUserAccuracy + userAccuracy
        print(userAccuracy)
    print(empty)
    scored = userCount - empty
    if scored:
        print(float(sumUserAccuracy / scored))
    else:
        print(0.0)

time: 3.88 ms


In [39]:
# Evaluate the forest on every user of the test matrix.
computeAccuracy()

0.96
0.88
0.693333333333
0.52
0.706666666667
0.773333333333
0.84
0.96
0.906666666667
0.906666666667
0.92
0.766666666667
0.666666666667
0.72
0.586666666667
0.786666666667
0.906666666667
0.88
0.733333333333
0.84
0.733333333333
0.773333333333
0.866666666667
0.813333333333
0.973333333333
0.64
1.0
0.76
0.773333333333
0.773333333333
0.853333333333
0.72
0.92
1.0
0.786666666667
0.946666666667
0.773333333333
0.813333333333
0.973333333333
0.826666666667
0.906666666667
0.893333333333
0.786666666667
0.906666666667
0.946666666667
1.0
0.68
0.72
0.853333333333
0.68
0.853333333333
0.826666666667
0.906666666667
0.76
0.893333333333
1.0
0.56
0.946666666667
0.813333333333
0.8
0.64
0.746666666667
0.893333333333
0.946666666667
0.986666666667
0.6
0.813333333333
0.76
0.826666666667
0.946666666667
0.78
0.706666666667
0.746666666667
0.933333333333
0.653333333333
0.893333333333
0.653333333333
0.733333333333
0.766666666667
0.773333333333
1.0
0.866666666667
0.813333333333
0.84
0.746666666667
0.633333333333
0.85333

In [34]:
# Spot check: accuracy for the user in row 0.
computeUserAccuracy(0)

0.94

time: 3.08 ms


In [35]:
# Spot check: ranked real movie ids for the user in row 0.
recommend(0)

[1022, 1907, 150, 48]

time: 72.6 ms


In [36]:
# Dimensions of the test rating matrix (users x movies).
print(test_V.shape)

(6040, 3883)
time: 61.4 ms
