In [2]:
import numpy as np
from scipy.sparse import *
from sklearn.decomposition import TruncatedSVD

# Side length of the square matrix built from the triplet file.
n = 444075
# Columns 0 and 1 of the file are used as row/column indices, column 2 as the
# stored value.
f = np.loadtxt('txTripletsCounts.txt', dtype=int)
M = coo_matrix((f[:, 2], (f[:, 0], f[:, 1])), shape=(n, n))
print(f.shape)

# TruncatedSVD defaults to a randomized solver; pinning random_state makes the
# embedding and the explained-variance figures reproducible across runs.
svd = TruncatedSVD(n_components=11, random_state=0)
X = svd.fit_transform(M)
print(X.shape)
print(svd.explained_variance_ratio_)

(444075, 11)
[ 0.85133231  0.06815755  0.03904212  0.00786361  0.00317405  0.00317026
  0.00298797  0.00268186  0.00244104  0.00158059  0.00137515]
(3348026, 3)


In [7]:
from sklearn.utils.extmath import randomized_svd
from scipy.sparse import *
import numpy as np

# Side length of the square matrix built from the triplet file.
n = 444075
# Columns 0 and 1 of the file are used as row/column indices, column 2 as the
# stored value.
f = np.loadtxt('txTripletsCounts.txt', dtype=int)
M = coo_matrix((f[:, 2], (f[:, 0], f[:, 1])), shape=(n, n))

# random_state is pinned so the randomized factorization is reproducible.
U, S, Vt = randomized_svd(M, n_components=11, n_iter=5, random_state=0)
print(U.shape)
print(S.shape)
print(Vt.shape)

# np.dot(U, Vt) would materialize a dense n x n array (n*n entries, on the
# order of a terabyte for n = 444075) and would also leave out the singular
# values. Preview only the top-left corner of the rank-11 reconstruction
# U * diag(S) * Vt instead.
preview = 5
print(np.dot(U[:preview] * S, Vt[:, :preview]))

KeyboardInterrupt: 

In [19]:
import numpy as np
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd
import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy import spatial
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD, IncrementalPCA, SparsePCA
import math
from sklearn.preprocessing import StandardScaler, maxabs_scale

def cos_sim(v1, v2):
    """Return the cosine similarity of v1 and v2: (v1 . v2) / (||v1|| * ||v2||).

    Parameters:
        v1, v2: indexable numeric sequences. Only the first len(v1) entries of
            v2 are read, so v2 must be at least as long as v1.

    Returns:
        The cosine of the angle between the two vectors, or 0.0 when either
        vector has zero norm (including empty input), where the similarity is
        undefined and the plain formula would divide by zero.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    denom = math.sqrt(sumxx*sumyy)
    if denom == 0:
        # Zero-norm vector: report "not similar" instead of raising or
        # propagating NaN.
        return 0.0
    return sumxy/denom

def readTest(fname='testTriplets.txt'):
    """Load the test triplets and split them into pairs and labels.

    Parameters:
        fname: path of a whitespace-separated integer file whose first three
            columns are read as (pair[0], pair[1], label). Defaults to
            'testTriplets.txt'; pass e.g. 'smallTest.txt' for a small fixture.

    Returns:
        (testPairs, testLabels): a list of (column 0, column 1) tuples and a
        list with the matching column-2 values, in file order. Both lists are
        empty when the file contains no rows.
    """
    # ndmin=2 keeps a single-row file two-dimensional so the column slices
    # below work for any number of rows.
    ftest = np.loadtxt(fname, dtype=int, ndmin=2)
    if ftest.size == 0:
        return ([], [])
    testPairs = list(zip(ftest[:, 0], ftest[:, 1]))
    testLabels = list(ftest[:, 2])
    return (testPairs, testLabels)

def computeEstimatedRatings(M, U, S, Vt, uTest, n, test):
    """Print the estimated row U[u, :] * (S * Vt) for every index u in uTest.

    Parameters:
        M, test: unused.
        U, S, Vt: factor matrices combined with `*`; this assumes `*` is a
            matrix product for them (e.g. the sparse matrices returned by
            computeSVD).
        uTest: iterable of row indices into U.
        n: side length of the returned array.

    Returns:
        An (n, n) float16 array of zeros.

    NOTE(review): the returned array is never written to. The assignment
    `estimatedRatings[userTest, :] = prod.todense()` was disabled in the
    original, so callers only get the printed rows. The dense allocation takes
    2 * n * n bytes, which is impractical for the full data set (n = 444075).
    Confirm whether the rows should be stored, ideally in a sparse container.
    """
    # Loop-invariant right-hand factor, computed once.
    rightTerm = S*Vt

    estimatedRatings = np.zeros(shape=(n, n), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("Sparse: " + str(prod.shape))
        print(prod)

    return estimatedRatings

def computePredictions(M, U, S, Vt, testPairs, n):
    """Predict 1/0 per test pair from the sign of the factorized estimate.

    For each pair, the estimate is entry (pair[0], pair[1]) of U * S * Vt.
    The prediction is 1 when that entry is strictly positive and 0 otherwise.

    Parameters:
        M, n: unused.
        U, S, Vt: factor matrices combined with `*`; this assumes `*` is a
            matrix product and that 2-D slicing is supported (e.g. the sparse
            matrices returned by computeSVD).
        testPairs: iterable of indexable pairs; element 0 is the row (sender)
            index and element 1 the column (receiver) index.

    Returns:
        A list of 0/1 predictions, one per pair, in input order.
    """
    # Loop-invariant right-hand factor, computed once.
    rightTerm = S*Vt
    testPredictions = []
    for pair in testPairs:
        sender = pair[0]
        receiver = pair[1]
        # Only one entry of the product row is needed, so multiply by the
        # single receiver column instead of building the full 1 x n row.
        value = (U[sender, :]*rightTerm[:, receiver])[0, 0]
        testPredictions.append(1 if value > 0 else 0)
    return testPredictions

def computePredictionsSim(M, X, U, S, Vt, testPairs, n, thrsh=0.5, max_senders=10):
    """Predict 1/0 per test pair from embedding similarity to past senders.

    For each (sender, receiver) pair, the sender's row of X is compared with
    the rows of X for the first `max_senders` row indices that have a stored
    non-zero entry in column `receiver` of M. The prediction is 1 when at
    least one of those cosine similarities exceeds `thrsh`, otherwise 0.

    Parameters:
        M: sparse matrix providing getcol() and nonzero(); rows are treated
            as senders and columns as receivers.
        X: 2-D array indexed as X[row, :], one embedding row per sender.
        U, S, Vt, n: unused; kept so existing call sites keep working.
        testPairs: iterable of indexable pairs (sender index, receiver index).
        thrsh: similarity threshold that must be strictly exceeded.
        max_senders: number of past senders examined per receiver.

    Returns:
        A list of 0/1 predictions, one per pair, in input order.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Computing Predictions")
    testPredictions = []

    for pair in testPairs:
        sender = pair[0]
        receiver = pair[1]
        sender_vec = X[sender, :]

        # Row indices with a non-zero entry in this column are the senders
        # that already appear for this receiver.
        past_senders, _ = M.getcol(receiver).nonzero()

        # Only "at least one match" matters, so stop at the first hit.
        has_similar = any(
            cos_sim(sender_vec, X[s, :]) > thrsh
            for s in past_senders[:max_senders]
        )
        testPredictions.append(1 if has_similar else 0)
    return testPredictions
        
def computeSVD(urm, K):
    """Factorize `urm` with sparsesvd and return the factors as sparse matrices.

    Parameters:
        urm: matrix handed to sparsesvd unchanged.
        K: second argument handed to sparsesvd (the requested number of
            factors).

    Returns:
        (U, S, Vt) as float32 csc_matrix objects. U is the transpose of the
        first sparsesvd result, S is a diagonal matrix built from the second
        result, and Vt is the third result.

    NOTE(review): the diagonal of S holds the square root of each value
    returned by sparsesvd, not the value itself, so U * S * Vt is not the
    plain rank-K reconstruction. Confirm this is intended.
    """
    ut, sigma, vt = sparsesvd(urm, K)

    # Square diagonal matrix with sqrt(sigma[i]) at position (i, i).
    rank = len(sigma)
    root_sigma = np.zeros((rank, rank), dtype=np.float32)
    for idx, value in enumerate(sigma):
        root_sigma[idx, idx] = mt.sqrt(value)

    left = csc_matrix(np.transpose(ut), dtype=np.float32)
    middle = csc_matrix(root_sigma, dtype=np.float32)
    right = csc_matrix(vt, dtype=np.float32)
    return left, middle, right

def computeAccuracy(testLabels, predictions):
    """Return the fraction of predictions that equal their label.

    Parameters:
        testLabels: iterable of expected labels.
        predictions: iterable of predicted labels, compared position by
            position. If the lengths differ, only the overlapping prefix is
            scored.

    Returns:
        Correct count divided by the number of compared pairs, or NaN when
        there is nothing to compare.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Computing Accuracy")
    num_correct = 0
    total = 0
    for expected, predicted in zip(testLabels, predictions):
        total += 1
        if expected == predicted:
            num_correct += 1
    if total == 0:
        # Accuracy is undefined with no samples; skip the 0/0 division.
        return float('nan')
    return np.true_divide(num_correct, total)

def _scoreEmbedding(M, svdInput, U, S, Vt, testPairs, testLabels, n, n_components=5):
    """Embed `svdInput` with TruncatedSVD and return the resulting test accuracy."""
    # random_state is pinned so the randomized solver gives repeatable scores.
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    X = svd.fit_transform(svdInput)
    predictions = computePredictionsSim(M, X, U, S, Vt, testPairs, n)
    return computeAccuracy(testLabels, predictions)


def main():
    """Run the pipeline: load data, factorize, and score two embeddings.

    Reads the training triplets into an n x n sparse matrix, computes a K=1
    factorization with computeSVD, loads the test pairs, and prints the
    accuracy of computePredictionsSim for a TruncatedSVD embedding of the raw
    matrix and then of the maxabs-scaled matrix.
    """
    # For a quick run on the small fixture use n = 11 and 'smallTrain.txt'.
    n = 444075
    train_name = 'txTripletsCounts.txt'

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Read in data")
    f = np.loadtxt(train_name, dtype=int)
    M = csc_matrix((f[:, 2], (f[:, 0], f[:, 1])), shape=(n, n))

    print("Compute SVD")
    K = 1
    U, S, Vt = computeSVD(M, K)

    print("Read in the test pairs")
    testPairs, testLabels = readTest()

    print("Compute X")
    print(_scoreEmbedding(M, M, U, S, Vt, testPairs, testLabels, n))

    print("Compute X_std")
    # Neighbours are still looked up in the unscaled M; only the embedding
    # input is scaled.
    M_std = maxabs_scale(M)
    print(_scoreEmbedding(M, M_std, U, S, Vt, testPairs, testLabels, n))

# Run the whole pipeline when this cell is executed.
main()


Read in data
Compute SVD
Read in the test pairs
Compute X
Computing Predictions




Computing Accuracy
0.4385
Compute X_std
Computing Predictions
Computing Accuracy
0.2324
