# An ensemble method for top-N recommendations from the SVD

## SVD

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle

In [2]:
data_file = pd.read_table('data/training.csv', sep = ',', header=None, engine='python')
data_file.shape

(700146, 4)

In [3]:
users = np.unique(data_file[0])
movies = np.unique(data_file[1])
 
number_of_rows = len(users)
number_of_columns = len(movies)

movie_indices, user_indices = {}, {}
 
for i in range(len(movies)):
    movie_indices[movies[i]] = i
    
for i in range(len(users)):
    user_indices[users[i]] = i

In [4]:
V = sp.lil_matrix((number_of_rows, number_of_columns))
for line in data_file.values:
    u, i , r , gona = map(int,line)
    V[user_indices[u], movie_indices[i]] = r

In [5]:
u,s, vt = svds(V, k = 32)

In [6]:
s_diag_matrix = np.zeros((s.shape[0], s.shape[0]))

for i in range(s.shape[0]):
    s_diag_matrix[i,i] = s[i]

In [7]:
X_lr = np.dot(np.dot(u, s_diag_matrix), vt)

In [8]:
negcounter = 0
poscounter = 0
for i in range(0,32):
    for factor in vt[i,:]:
        if factor > 0:
            poscounter = poscounter + 1
        else:
            negcounter = negcounter + 1
print(negcounter,poscounter)

(60086, 57258)


In [9]:
X_lr[0,2354]

0.0011746514999285325

In [10]:
#X_lr.tofile(file = 'data/svdresults.csv', sep = "::")

In [11]:
X_lr.argmax(axis=1)

array([1826, 2626,  253, ..., 1108,  793, 2626])

In [12]:
mx = X_lr.argsort(axis=1)
mx = np.fliplr(mx)
print mx[:,:5]

[[1826  511  253 1094 2873]
 [2626 2349 1826  106  572]
 [ 253 1095 1094 2626 1108]
 ..., 
 [1108 1047 1161    0 1054]
 [ 793  838 1047 1095 1877]
 [2626  576  590  793 1014]]


In [13]:
X_lr.shape

(6040, 3667)

## Ensemble Method for Top-N Recommendations

In [14]:
class Node:
    def __init__(self):
        self.itemFactors = np.empty(shape = (32,0))
        self.userFactors = np.empty(shape = (32,0))
        self.factor = None
        self.score = 0
        self.left = None
        self.right = None

In [15]:
def printNode(node):
    print("itemFactors :") 
    #print(node.itemFactors)
    print(node.itemFactors.shape)
    print("userFactors :")
    #print(node.userFactors)
    print(node.userFactors.shape)
    print("factor :")
    print(node.factor)
    print("score :")
    print(node.score)
    #print("left :")
    #print(node.left)
    #print("right :")
    #print(node.right)
def printTree(node):
    printNode(node)
    if (node.left != None):
        print("LEFT :")
        printTree(node.left)
    if (node.right != None):
        print("RIGHT :")
        printTree(node.right)
def countLeaves(node):
    count = 0
    if (node.left != None):
        if (node.left.itemFactors.shape[1] <= 300):
            count += 1
        else:
            count += countLeaves(node.left)
    if (node.right != None):
        if (node.right.itemFactors.shape[1] <= 300):
            count += 1
        else:
            count += countLeaves(node.right)
    return count

In [16]:
V = Node()
V.itemFactors = vt
V.userFactors = u

In [17]:
V.itemFactors.shape

(32, 3667)

In [18]:
V.userFactors.shape

(6040, 32)

In [19]:
def splitNode(node, factor):
    node.left = Node()
    node.right = Node()
    left = 0 #flag and counter
    right = 0
    empt = Node()
    if (np.array_equal(node.itemFactors, empt.itemFactors) == False):   #bos item factor girebilir mi cond dene
    #if (len(node.itemFactors.shape) > 1):   #bos item factor girebilir mi cond dene
        #print(node.itemFactors.shape)
        for i in range(node.itemFactors.shape[1]):
            #print(node.itemFactors[:, i])
            if (node.itemFactors[:, i][factor] >= 0 ):
                if (left == 0):
                    node.left.itemFactors = node.itemFactors[:, i]
                else:
                    node.left.itemFactors = np.vstack((node.left.itemFactors, node.itemFactors[:, i]))
                left += 1
            else:
                #continue
                if (right == 0):
                    node.right.itemFactors = node.itemFactors[:, i]
                else:
                    node.right.itemFactors =np.vstack((node.right.itemFactors, node.itemFactors[:, i]))
                right += 1
        node.left.itemFactors = np.transpose(node.left.itemFactors)
        node.right.itemFactors = np.transpose(node.right.itemFactors)
    else:
        print('itemFactors not available')
    left = 0
    right = 0
    if (np.array_equal(node.userFactors, empt.userFactors) == False):   #bos user factor girebilme ihtimali dusun
    #if (len(node.userFactors.shape) > 1):   #bos user factor girebilme ihtimali dusun
        #print(node.userFactors.shape)
        for i in range(node.userFactors.shape[0]):
            if (node.userFactors[i, :][factor] >= 0 ):
                if (left == 0):
                    node.left.userFactors = node.userFactors[i, :]
                else:
                    node.left.userFactors = np.vstack((node.left.userFactors, node.userFactors[i, :]))
                left += 1
            else:
                if (right == 0):
                    node.right.userFactors = node.userFactors[i, :]
                else:
                    node.right.userFactors = np.vstack((node.right.userFactors, node.userFactors[i, :]))
                right += 1
    else:
        print('userFactors not available')
    #node.score =  computePrecision(node)

In [20]:
def computePrecision(node):
    
    

IndentationError: expected an indented block (<ipython-input-20-1e092fe67733>, line 2)

In [None]:
def buildTree(node, factors):
    #printNode(node)
    #print(factors)
    empt = Node()
    if (len(node.itemFactors.shape) > 1 and node.itemFactors.shape[1] <= 300):
        print("Threshold value is reached")
        return node
    elif (len(node.itemFactors.shape) <= 1):
        print('one item factor')
        return node
    elif (node.itemFactors.shape[1] == 0):
        print('no item factor')
        return node
    winner = Node()
    if not factors:
        print('factors empty')
        return node
    for factor in factors:
        node.factor = factor
        splitNode(node,factor)
        if (node.score >= winner.score):
            winner = node
    factors.remove(winner.factor)
    node.factor = winner.factor
    lfactors = list(factors)
    rfactors = list(factors)
    buildTree(winner.left, lfactors)
    buildTree(winner.right, rfactors)
    return node
    

In [None]:
def getFactorGroups(size, p, a):
    print ("hello")

In [None]:
def buildForest(factors, p, a, sl):
    groups = getFactorGroups(len(factors), p, a)
    for g in groups:
        V = Node()
        V.itemFactors = vt
        V.userFactors = u
       # buildTee(V,g)
        forest.append(V)
    return forest

In [None]:
factors = []
factors.extend(range(0,s.size))
getFactorGroups(factors, 0.5, 3)

In [None]:
print(V.itemFactors.shape)
print(V.userFactors.shape)

In [None]:
splitNode(V,0)
print(V.right.itemFactors.shape)
print(V.left.itemFactors.shape)
print(V.right.userFactors.shape)
print(V.left.userFactors.shape)
printNode(V)

In [None]:
factors = []
factors.extend(range(0,s.size))
buildTree(V,factors)

In [None]:
print("ROOT :")
printTree(V)

In [None]:
countLeaves(V)