# An ensemble method for top-N recommendations from the SVD

## SVD

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle
import copy 
import random

In [None]:
# Load the training ratings (comma-separated, no header row) and the movie
# catalogue ('::'-separated, no header row), printing each table's shape.
data_file = pd.read_table('data/training.csv', sep = ',', header=None, engine='python')
print(data_file.shape)
# The multi-character '::' separator needs the python parsing engine.
movie_file = pd.read_table('ml-1m/movies.dat', sep = '::', header=None, engine='python')
print(movie_file.shape)

In [None]:
# `movies` maps a column index to the real movie id, and `movie_indices` maps
# a real movie id back to its column index (per the original author's note,
# e.g. real id 3952 <-> index 3666). `users` / `user_indices` do the same for
# user ids and matrix rows.
users = np.unique(data_file[0])    # sorted unique user ids in the training set
movies = np.unique(movie_file[0])  # sorted unique movie ids in the catalogue

number_of_rows = len(users)
number_of_columns = len(movies)

# real id -> position in the sorted id array
movie_indices = {movie_id: position for position, movie_id in enumerate(movies)}
user_indices = {user_id: position for position, user_id in enumerate(users)}

In [None]:
# Sparse user x movie rating matrix; unrated cells stay empty (zero).
V = sp.lil_matrix((number_of_rows, number_of_columns))
for line in data_file.values:
    # Each row carries four integer fields; only the first three are used here.
    u, i , r , gona = map(int,line)
    V[user_indices[u], movie_indices[i]] = r # look up the row/column of the real user and movie ids and store the rating there

In [None]:
# Sanity checks on the id arrays and the id -> index lookups.
print(users)
print(user_indices[6000])
print(movies)
print(movie_indices[3952])
#print (V[:3,:])
print(movies.shape)

In [None]:
# Spot-check the index -> real movie id direction of the mapping.
print(movies[253])
print(movies[20])
print(movies[0])
print(movies)

In [None]:
# Truncated SVD of the rating matrix keeping k = 16 singular triplets:
# u holds one row of factors per user, s the singular values, vt one column
# of factors per movie. Note that `u` is rebound here.
u,s, vt = svds(V, k = 16)

In [None]:
# Square matrix carrying the singular values on its diagonal.
s_diag_matrix = np.zeros((s.shape[0], s.shape[0]))
np.fill_diagonal(s_diag_matrix, s)

In [None]:
# Low-rank reconstruction of the rating matrix: (u . diag(s)) . vt
scaled_user_factors = np.dot(u, s_diag_matrix)
X_lr = np.dot(scaled_user_factors, vt)

In [None]:
# Count how many item loadings of the last factor row are strictly positive
# versus non-positive.
last_factor_row = vt[s.size - 1, :]
poscounter = int(np.count_nonzero(last_factor_row > 0))
negcounter = int(last_factor_row.size) - poscounter
print(negcounter,poscounter)

In [None]:
# Display one reconstructed score (row 0, column 2354).
X_lr[0,2354]

In [None]:
#X_lr.tofile(file = 'data/svdresults.csv', sep = "::")

In [None]:
print(u.shape)
# "deneme" (trial): product of the factor matrices without the singular values.
deneme = np.dot(u, vt)


In [None]:
# Compare the shapes of the reconstruction, the rating matrix and the trial product.
print(X_lr.shape, V.shape, deneme.shape)

In [None]:
# Translate a handful of real (user id, movie id) pairs into matrix positions.
print(user_indices[641],movie_indices[1210],user_indices[4388],movie_indices[3270],user_indices[1448],movie_indices[3270])
print(user_indices[641],movie_indices[2424],user_indices[1280],movie_indices[3826],user_indices[4034],movie_indices[3578])
print(user_indices[4779],movie_indices[1237],user_indices[588],movie_indices[1288],user_indices[4578],movie_indices[1120])
print(user_indices[1100],movie_indices[3159],user_indices[4604],movie_indices[2003],user_indices[4578],movie_indices[1120])

In [None]:

# For hand-picked (row, column) cells print the stored rating, the low-rank
# reconstruction and the trial product side by side.
# NOTE(review): V must still be the sparse rating matrix when this cell runs.
print(V[1279, 3546],X_lr[1279, 3546],deneme[1279, 3546])
print(V[1099, 2916],X_lr[1099, 2916],deneme[1099, 2916])

print(V[4603, 1801],X_lr[4603, 1801],deneme[4603, 1801])

print(V[4033, 3305],X_lr[4033, 3305],deneme[4033, 3305])
print(V[640, 1108],X_lr[640, 1108],deneme[640, 1108])
print(V[4387, 3017],X_lr[4387, 3017],deneme[4387, 3017])

print(V[640, 2206],X_lr[640, 2206],deneme[640, 2206])
print(V[587, 1184],X_lr[587, 1184],deneme[587, 1184])
print(V[4577, 1033],X_lr[4577, 1033],deneme[4577, 1033])

print(V[1447, 2760],X_lr[1447, 2760],deneme[1447, 2760])
print(V[4778, 1134],X_lr[4778, 1134],deneme[4778, 1134])


In [None]:
# Smallest reconstructed score over the whole matrix.
print(np.min(X_lr))

## Model for Top-N Recommendations

In [None]:
class Node:
    """A node of the factor-split tree.

    Each node owns a slice of the item and user factor matrices, the ids that
    slice refers to, the factor it was split on and links to its two children.
    """

    def __init__(self, fsize):
        """Create an empty node whose factor matrices have `fsize` rows and no columns."""
        # Tree links; both stay None until the node is split.
        self.left = self.right = None

        # Split bookkeeping: chosen factor, candidate factors, split score.
        self.factor = None
        self.factors = []
        self.score = 0

        # Members of this node, parallel to the factor matrices below.
        self.userList = []
        self.itemList = []

        # Empty (fsize x 0) placeholders, replaced once the node is populated.
        empty_shape = (fsize, 0)
        self.userFactors = np.empty(shape=empty_shape)
        self.itemFactors = np.empty(shape=empty_shape)

In [None]:
def fillLists(V):
    """Append every user row index and every movie column index to node `V`.

    The lists are positional: entry j of `V.itemList` describes column j of
    the item factor matrix, and likewise for users and rows. They therefore
    hold matrix positions (the values of `user_indices` / `movie_indices`),
    not real ids. The previous `id - 1` only matched the position when ids
    were contiguous from 1, which the movie ids are not.

    NOTE(review): code that compares `itemList` entries against real movie
    ids should translate them through `movies[...]` first.
    """
    # sorted() keeps the lists aligned with row/column order whatever the
    # iteration order of the dicts is.
    V.userList.extend(sorted(user_indices.values()))
    V.itemList.extend(sorted(movie_indices.values()))
def restartV():
    """Return a fresh root node spanning all users, all items and every SVD factor."""
    root = Node(s.size)
    root.userFactors = u
    root.itemFactors = vt
    fillLists(root)
    # Candidate split factors: one per singular value.
    root.factors = list(range(s.size))
    return root
# Rebinds the module-level name V: from here on V is the root Node rather
# than the sparse rating matrix built earlier.
V = restartV()
print(len(V.userList))

In [None]:
def printNode(node):
    """Print a node's factor-matrix shapes, split factor, score and children."""
    summary = (
        ("itemFactors :", node.itemFactors.shape),
        ("userFactors :", node.userFactors.shape),
        ("factor :", node.factor),
        ("score :", node.score),
        ("left :", node.left),
        ("right :", node.right),
    )
    # Each field is printed as a label line followed by a value line.
    for label, value in summary:
        print(label)
        print(value)
def printTree(node):
    """Print `node`, then recursively its left and right subtrees (pre-order)."""
    printNode(node)
    for label, child in (("LEFT :", node.left), ("RIGHT :", node.right)):
        if child is not None:
            print(label)
            printTree(child)
def _itemCount(node):
    """Number of items in `node`; a 1-D factor array is a single item."""
    itemFactors = node.itemFactors
    return itemFactors.shape[1] if itemFactors.ndim > 1 else 1


def countLeaves(node, threshold=300):
    """Count the descendants of `node` that hold at most `threshold` items.

    A child at or below the threshold counts as one leaf; a larger child is
    descended into. A missing child is reported by printing the node's item
    count followed by "ERROR" and contributes nothing.

    Args:
        node: subtree root; must expose `left`, `right` and `itemFactors`.
        threshold: largest item count treated as a leaf (default 300).

    Returns:
        The number of leaf children found below `node`.
    """
    count = 0
    for child in (node.left, node.right):
        if child is None:
            print(_itemCount(node))
            print("ERROR")
        elif _itemCount(child) <= threshold:
            count += 1
        else:
            count += countLeaves(child, threshold)
    return count

In [None]:
# Display the shape of the root's item factor matrix.
V.itemFactors.shape

In [None]:
# Display the shape of the root's user factor matrix.
V.userFactors.shape

In [None]:
def splitNode(node, factor):
    """Split `node` into fresh left/right children on the sign of one factor.

    Items and users whose loading on `factor` is >= 0 go to `node.left`, the
    rest to `node.right`. Afterwards `node.score` is set to the precision of
    the split, as returned by `computePrecision`.

    Child matrices are always 2-D: items as (n_factors, n_items) and users as
    (n_users, n_factors), including when a child receives zero or one member.
    Each child gets its own copy of the factor list.

    Args:
        node: node to split; `node.factors` must contain `factor`.
        factor: factor id to split on.

    Raises:
        ValueError: if `factor` is not in `node.factors`.
    """
    factorIndex = node.factors.index(factor)
    fsize = len(node.factors)
    node.left = Node(fsize)
    node.right = Node(fsize)
    # Independent copies, so editing one node's factor list never leaks into
    # its parent or sibling.
    node.left.factors = list(node.factors)
    node.right.factors = list(node.factors)

    # ---- items: one column per item -------------------------------------
    itemFactors = np.asarray(node.itemFactors)
    if itemFactors.ndim == 1:
        # a lone item stored as a vector -> single column
        itemFactors = itemFactors.reshape(-1, 1)
    if itemFactors.size == 0:
        print('itemFactors not available')
        node.left.itemFactors = np.empty((fsize, 0))
        node.right.itemFactors = np.empty((fsize, 0))
    else:
        goesLeft = itemFactors[factorIndex, :] >= 0
        leftCols = np.flatnonzero(goesLeft)
        rightCols = np.flatnonzero(~goesLeft)
        node.left.itemFactors = itemFactors[:, leftCols]
        node.right.itemFactors = itemFactors[:, rightCols]
        node.left.itemList = [node.itemList[i] for i in leftCols]
        node.right.itemList = [node.itemList[i] for i in rightCols]

    # ---- users: one row per user ----------------------------------------
    userFactors = np.atleast_2d(np.asarray(node.userFactors))
    if userFactors.size == 0:
        print('userFactors not available')
        node.left.userFactors = np.empty((0, fsize))
        node.right.userFactors = np.empty((0, fsize))
    else:
        goesLeft = userFactors[:, factorIndex] >= 0
        leftRows = np.flatnonzero(goesLeft)
        rightRows = np.flatnonzero(~goesLeft)
        node.left.userFactors = userFactors[leftRows, :]
        node.right.userFactors = userFactors[rightRows, :]
        node.left.userList = [node.userList[i] for i in leftRows]
        node.right.userList = [node.userList[i] for i in rightRows]

    node.score = computePrecision(node)

In [None]:
def findTopN(matrix, N):
    """Return, for every row of `matrix`, the column indices of its N largest
    entries, ordered from largest to smallest."""
    ascending = np.argsort(matrix, axis=1)
    # Reverse each row to get descending order, then keep the first N columns.
    return ascending[:, ::-1][:, :N]
    

In [None]:
# Reference top-5 column indices per user, taken from the full low-rank
# reconstruction; used as the ground truth when scoring splits.
mx = findTopN(X_lr, 5)
print(mx)
print(mx.shape)
print(mx[6039,:])

In [None]:
precisionAt = 5


def _countTopNHits(child, diag_matrix):
    """Count the child's local top-N picks that are also in `mx`.

    For every user of `child`, score the child's items with the child's
    factors, take the best `precisionAt` of them and count how many appear in
    that user's row of the global reference `mx`.
    """
    if not child.userList or not child.itemList:
        return 0
    userFactors = np.atleast_2d(child.userFactors)   # a lone user may be a vector
    itemFactors = child.itemFactors
    if itemFactors.ndim == 1:                        # a lone item may be a vector
        itemFactors = itemFactors.reshape(-1, 1)
    relevant = np.dot(np.dot(userFactors, diag_matrix), itemFactors)
    top = findTopN(relevant, precisionAt)
    hits = 0
    for row, user in enumerate(child.userList):
        globalTop = mx[user, :]
        # top has fewer than precisionAt columns when the child has few items
        for column in top[row, :]:
            if child.itemList[column] in globalTop:
                hits += 1
    return hits


def computePrecision(node):
    """Score the current split of `node` against the global top-N lists.

    The score is the fraction of (user, rank) slots, over all users of `node`
    and `precisionAt` ranks each, whose item picked inside the user's child
    also belongs to that user's global top-N row in `mx`. The value is
    printed and returned.

    Args:
        node: a node already split into `node.left` and `node.right`.

    Returns:
        The precision as a float in [0, 1]; 0.0 when the node has no users.
    """
    factorIds = np.asarray(node.factors, dtype=int)
    # Singular values of this node's factors, placed on a diagonal.
    diag_matrix = np.diag(s_diag_matrix[factorIds, factorIds])

    inTopN = (_countTopNHits(node.left, diag_matrix)
              + _countTopNHits(node.right, diag_matrix))

    if not node.userList:
        ret = 0.0   # nothing to score; avoids dividing by zero
    else:
        ret = float(inTopN) / (len(node.userList) * precisionAt)
    print(ret)
    return ret
    

In [None]:
# Try a single split of the root on factor 15.
splitNode(V, 15)

In [None]:
def buildTree(node, factors):
    """Recursively grow the split tree below `node`.

    At every node each remaining factor is tried as a split and the one with
    the highest precision is kept (a later factor wins a tie). Recursion
    stops when the node has at most 300 items, a single item, no items, no
    users, or no factors left to try.

    Args:
        node: node to grow; becomes the root of the returned subtree.
        factors: factor ids still available for splitting. This list is not
            modified.

    Returns:
        `node`, with `factor`, `score`, `left` and `right` set when a split
        was made.
    """
    print(factors)
    itemFactors = node.itemFactors
    if len(itemFactors.shape) <= 1:
        print('one item factor')
        return node
    if itemFactors.shape[1] == 0:
        print('no item factor')
        return node
    if itemFactors.shape[1] <= 300:
        print("Threshold value is reached")
        return node
    if not node.userList:
        # A split cannot be scored without users.
        print('no users')
        return node
    if not factors:
        print('factors empty')
        return node

    # splitNode attaches brand-new children on every call, so remembering the
    # best pair by reference is enough; no copy of the node is needed.
    best = None
    for factor in factors:
        node.factor = factor
        splitNode(node, factor)
        if best is None or node.score >= best[0]:
            best = (node.score, factor, node.left, node.right)
    bestScore, bestFactor, bestLeft, bestRight = best

    node.score = bestScore
    node.factor = bestFactor
    # The children keep the complete factor list: their factor matrices still
    # have one row/column per factor. Each gets a private copy.
    bestLeft.factors = list(node.factors)
    bestRight.factors = list(node.factors)

    # Leave the caller's list untouched.
    remaining = list(factors)
    remaining.remove(bestFactor)
    node.left = buildTree(bestLeft, list(remaining))
    node.right = buildTree(bestRight, list(remaining))
    return node
    

In [None]:
# Shapes of the root's item and user factor matrices.
print(V.itemFactors.shape)
print(V.userFactors.shape)

In [None]:
# Split the root on factor 0 and inspect how items and users were divided.
splitNode(V,0)
print(V.right.itemFactors.shape)
print(V.left.itemFactors.shape)
print(V.right.userFactors.shape)
print(V.left.userFactors.shape)
printNode(V)

In [None]:
# Start again from a clean root and grow one tree over every factor.
V = restartV()
factors = list(range(s.size))
V = buildTree(V, factors)

In [None]:
# Dump the whole tree, starting at the root.
print("ROOT :")
printTree(V)

In [None]:
# Number of leaf children in the tree just built.
countLeaves(V)

In [None]:
def getFactorGroups(size, p, a):
    """Draw random groups of factor indices for the forest.

    `a` rounds are run. In each round the indices 0 .. size-1 are dealt
    without replacement into int(1/p) groups, each filled until it holds at
    least size*p indices. When size*p is not a whole number the last group of
    a round may come out shorter, because no index is left to draw.

    Args:
        size: number of factors to draw from.
        p: fraction of the factors placed in each group (0 < p <= 1).
        a: number of rounds.

    Returns:
        A list of a * int(1/p) groups, each a sorted list of ints.
    """
    groups = []
    for _ in range(a):
        # Sized from the `size` argument, not from any module-level list.
        used = [False] * size
        for _ in range(int(1 / p)):
            group = []
            while len(group) < size * p:
                available = [k for k in range(size) if not used[k]]
                if not available:
                    break   # every factor is already taken in this round
                factor = available[random.randint(0, len(available) - 1)]
                used[factor] = True
                group.append(factor)
            group.sort()
            groups.append(group)
    return groups
            

In [None]:
def buildForest(factors, p, a, sl):
    """Build one tree per random factor group and return the list of roots.

    Args:
        factors: the full list of factor ids; only its length is used.
        p: fraction of the factors placed in each group.
        a: number of grouping rounds.
        sl: accepted for interface compatibility but not used here.
            NOTE(review): presumably meant as the leaf-size threshold.

    Returns:
        A list of root nodes, one per factor group.
    """
    forest = []
    for group in getFactorGroups(len(factors), p, a):
        print(group)
        root = Node(len(group))
        # Restrict the SVD factors to the ones in this group.
        root.itemFactors = vt[group, :]
        root.userFactors = u[:, group]
        fillLists(root)
        # The root keeps its own list and tree building works on a separate
        # copy, so trimming the candidate list cannot shorten root.factors.
        root.factors = list(group)
        buildTree(root, list(group))
        forest.append(root)
    return forest

In [None]:
# Build the forest over all 16 factors: half of them per tree, 3 rounds.
factors = list(range(16))
forest = buildForest(factors, 0.5, 3, 300)

In [None]:
# Show the list of root nodes.
print(forest)

In [None]:
# Leaf count of every tree in the forest.
for node in forest:
    print(countLeaves(node))

In [None]:
# Dump the tree at index 3 of the forest.
printTree(forest[3])

## Testing Model

In [None]:
# Load the held-out ratings (comma-separated, no header row) and show the table's shape.
test_file = pd.read_table('data/test.csv', sep = ',', header=None, engine='python')
test_file.shape

In [None]:
# Map every real user id in the test set to its row in the test matrix.
# No test-specific movie mapping is built: the test ratings are placed
# through the training-time `movie_indices`.
test_users = np.unique(test_file[0])  # sorted unique user ids in the test set
test_number_of_rows = len(test_users)

test_user_indices = {user_id: row for row, user_id in enumerate(test_users)}
print(len(movie_indices))

In [None]:
# Sparse test-user x movie matrix of held-out ratings; columns follow the
# training-time movie index.
test_V = sp.lil_matrix((test_number_of_rows, number_of_columns))
for line in test_file.values:
    test_u, test_i, test_r, test_gona = map(int, line)
    if test_i in movie_indices:
        # Report the rating of this row (test_r), not a value left over from
        # an earlier loop.
        print(test_user_indices[test_u], movie_indices[test_i], test_r)
        # look up the row/column of the real user and movie ids and store the rating there
        test_V[test_user_indices[test_u], movie_indices[test_i]] = test_r
    else:
        # movie id missing from the catalogue: report and skip it
        print("kekt", test_user_indices[test_u], test_i, test_r)

In [None]:
def isLeaf(node):
    """Return True when `node` is a leaf.

    A node is a leaf when it holds a single item (1-D factor array), when it
    holds at most 300 items (the same bound tree building stops at), or when
    no split factor was ever chosen for it.
    """
    itemFactors = node.itemFactors
    if itemFactors.ndim < 2:
        # a lone item stored as a vector; there is nothing left to split
        return True
    if itemFactors.shape[1] <= 300:
        return True
    return node.factor is None
def isEmpty(node):
    """Return True when `node` holds no items at all.

    The check looks only at the item factor array, so it does not depend on
    the length of `node.factors` and needs no temporary node.
    """
    return node.itemFactors.size == 0

In [None]:
def recommend(index):
    """Rank the movies user `index` rated in the test set using the forest.

    Every tree is walked from its root, following the sign of the user's
    loading on each node's split factor, until a leaf or an empty child is
    reached. The items of that node are scored with the node's factors, and
    those the user rated in `test_V` are collected. Candidates from all trees
    are merged, sorted by score (highest first) and de-duplicated.

    Args:
        index: row of the user in both `u` and `test_V`.
            NOTE(review): assumes the training and test user orderings
            coincide; confirm.

    Returns:
        A list of `itemList` entries, best first, without repeats.
    """
    userFactors = u[index]
    # Real ids of the movies this user rated in the test set; computed once
    # instead of once per candidate item.
    # NOTE(review): node.itemList entries are matched against these real ids,
    # so itemList must be populated in the same id space; confirm.
    ratedMovies = set(movies[np.nonzero(test_V[index, :])[1]].tolist())

    topList = []
    for tree in forest:
        node = tree
        while not isLeaf(node):
            child = node.left if userFactors[node.factor] >= 0 else node.right
            if isEmpty(child):
                break   # stay on the last non-empty node
            node = child

        itemFactors = node.itemFactors
        if itemFactors.ndim == 1:
            # a lone item stored as a vector -> single column
            itemFactors = itemFactors.reshape(-1, 1)
        factorIds = np.asarray(node.factors, dtype=int)
        test_diag_matrix = np.diag(s_diag_matrix[factorIds, factorIds])
        relevant = np.dot(np.dot(userFactors[factorIds], test_diag_matrix), itemFactors)

        # Walk the node's items from the highest to the lowest score.
        for i in relevant.argsort()[::-1]:
            if node.itemList[i] in ratedMovies:
                topList.append((node.itemList[i], relevant[i]))

    topList.sort(key=lambda x: x[1], reverse=True)
    result = []
    seen = set()
    for item, _ in topList:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

In [None]:
# Column indices of the movies rated by test user row 1.
print(np.nonzero(test_V[1,:])[1])
#print(movies[np.nonzero(test_V[0,:])

In [None]:
# Ranked recommendations for user row 1.
result = recommend(1)
print(result)

In [None]:
# Factor vector of user row 1.
print(u[1])

In [None]:
def computeUserAccuracy(index):
    """Rank-weighted rating score of the top recommendations for one user.

    The first min(precisionAt, number of recommendations) movies are
    weighted depth, depth-1, ..., 1 by rank. The weighted sum of the user's
    test ratings for them is divided by (sum of weights * 5).

    Returns:
        0 when nothing is recommended, otherwise the score as a float.
    """
    computedMovies = recommend(index)
    if not computedMovies:
        return 0

    depth = min(precisionAt, len(computedMovies))
    sumWeight = (depth * (depth + 1)) / 2

    weightedSum = 0
    # Pair the best recommendation with the largest weight, and so on down.
    for weight, recommendation in zip(range(depth, 0, -1), computedMovies):
        rating = test_V[index, movie_indices[recommendation]]
        weightedSum = weightedSum + rating * weight
    return float(weightedSum / (sumWeight * 5))

In [None]:
def computeAccuracy():
    """Print each test user's accuracy, then the number of zero-score users
    and the mean accuracy over the remaining users.

    Users whose accuracy is 0 are counted as "empty" and left out of the
    mean. When every user is empty the mean is reported as 0.0 rather than
    dividing by zero.
    """
    empty = 0
    sumUserAccuracy = 0.0
    userCount = test_V.shape[0]
    for user in range(userCount):
        userAccuracy = computeUserAccuracy(user)
        if userAccuracy == 0:
            empty += 1
        sumUserAccuracy += userAccuracy
        print(userAccuracy)
    print(empty)
    scoredUsers = userCount - empty
    print(float(sumUserAccuracy / scoredUsers) if scoredUsers else 0.0)
        

In [None]:
# Evaluate the forest on every test user.
computeAccuracy()

In [None]:
# Real movie ids rated by test user row 1, followed by the ratings themselves.
print(movies[np.nonzero(test_V[1,:])[1]])
print(test_V[1,np.nonzero(test_V[1,:])[1]])

