# nzhiltsov/Ext-RESCAL

`…efactored`
1 parent f8ac2cf commit e46777963887db680487491f1b0dd88ba6b84fba committed Mar 13, 2013
Showing with 71 additions and 55 deletions.
1. +16 −1 commonFunctions.py
2. +7 −1 commonFunctionsTest.py
3. +16 −12 extrescal.py
4. +0 −17 extrescalFunctions.py
5. +1 −6 extrescalFunctionsTest.py
6. +31 −18 rescal.py
 @@ -1,6 +1,6 @@ from numpy import dot from numpy.random import randint -from itertools import ifilter +from numpy.random import random_integers def squareFrobeniusNormOfSparse(M): """ @@ -26,3 +26,18 @@ def reservoir(it, k): if j < k: ls[j] = x return ls + +def checkingIndices(M, ratio = 1): + """ + Returns the indices for computing fit values + based on non-zero values as well as sample indices + (the sample size is proportional to the given ratio ([0,1]) and number of matrix columns) + """ + rowSize, colSize = M.shape + nonzeroRows, nonzeroCols = M.nonzero() + nonzeroIndices = [(nonzeroRows[i], nonzeroCols[i]) for i in range(len(nonzeroRows))] + sampledRows = random_integers(0, rowSize - 1, round(ratio*colSize)) + sampledCols = random_integers(0, colSize - 1, round(ratio*colSize)) + sampledIndices = zip(sampledRows, sampledCols) + indices = list(set(sampledIndices + nonzeroIndices)) + return indices
 @@ -1,7 +1,7 @@ from numpy import ones, dot import numpy as np from scipy.sparse import coo_matrix -from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, reservoir +from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, reservoir, checkingIndices from numpy.linalg.linalg import norm from nose.tools import assert_almost_equal from itertools import product @@ -42,4 +42,10 @@ def testSampling(): assert checkedElements.count(sampledElements[i]) == 0 checkedElements.append(sampledElements[i]) assert len(checkedElements) == len(sampledElements) + +def testCheckingIndices(): + D = coo_matrix((ones(6),([0, 1, 2, 3, 4, 5], [0, 1, 1, 2, 3, 3])), shape=(6, 4), dtype=np.uint8).tocsr() + indices = checkingIndices(D) + assert len(indices) >= 6 +
 @@ -7,8 +7,8 @@ import numpy as np import os import fnmatch -from commonFunctions import squareFrobeniusNormOfSparse, fitNorm -from extrescalFunctions import updateA, updateV, matrixFitNormElement, checkingIndices +from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, checkingIndices +from extrescalFunctions import updateA, updateV, matrixFitNormElement __DEF_MAXITER = 50 __DEF_PREHEATNUM = 1 @@ -18,7 +18,7 @@ __DEF_LMBDA = 0 __DEF_EXACT_FIT = False __DEF_MATRIX_FIT_SAMPLE_RATIO = 1 -__DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO = 0.1 +__DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO = 1 def rescal(X, D, rank, **kwargs): """ @@ -82,8 +82,8 @@ def rescal(X, D, rank, **kwargs): lmbda = kwargs.pop('lmbda', __DEF_LMBDA) preheatnum = kwargs.pop('preheatnum', __DEF_PREHEATNUM) exactfit = kwargs.pop('exactfit', __DEF_EXACT_FIT) - matrixSampleRatio = kwargs.pop('matrixSampleRation', __DEF_MATRIX_FIT_SAMPLE_RATIO) - tensorSliceSampleRatio = kwargs.pop('tensorSliceSampleRation', __DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO) + matrixSampleRatio = kwargs.pop('matrixSampleRatio', __DEF_MATRIX_FIT_SAMPLE_RATIO) + tensorSliceSampleRatio = kwargs.pop('tensorSliceSampleRatio', __DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO) if not len(kwargs) == 0: raise ValueError( 'Unknown keywords (%s)' % (kwargs.keys()) ) @@ -222,7 +222,6 @@ def __updateR(X, A, lmbda): def __projectSlices(X, Q): - q = Q.shape[1] X2 = [] for i in xrange(len(X)): X2.append( dot(Q.T, X[i].dot(Q)) ) @@ -234,13 +233,15 @@ def __projectSlices(X, Q): parser.add_argument("--input", type=str, help="the directory, where the input data are stored", required=True) parser.add_argument("--outputentities", type=str, help="the file, where the latent embedding for entities will be output", required=True) parser.add_argument("--outputterms", type=str, help="the file, where the latent embedding for terms will be output", required=True) +parser.add_argument("--outputfactors", type=str, help="the file, where the latent factors will be output", required=True) parser.add_argument("--log", type=str, help="log file", required=True) args = parser.parse_args() numLatentComponents = args.latent inputDir = args.input regularizationParam = args.lmbda outputEntities = args.outputentities outputTerms = args.outputterms +outputFactors = args.outputfactors logFile = args.log logging.basicConfig(filename=logFile, filemode='w', level=logging.DEBUG) @@ -250,19 +251,19 @@ def __projectSlices(X, Q): dim = 0 with open('./%s/entity-ids' % inputDir) as entityIds: for line in entityIds: - dim += 1 + dim += 1 print 'The number of entities: %d' % dim numSlices = 0 numNonzeroTensorEntries = 0 X = [] -for file in os.listdir('./%s' % inputDir): - if fnmatch.fnmatch(file, '[0-9]*-rows'): +for inputFile in os.listdir('./%s' % inputDir): + if fnmatch.fnmatch(inputFile, '[0-9]*-rows'): numSlices += 1 - row = loadtxt('./%s/%s' % (inputDir, file), dtype=np.int32) + row = loadtxt('./%s/%s' % (inputDir, inputFile), dtype=np.int32) if row.size == 1: row = np.atleast_1d(row) - col = loadtxt('./%s/%s' % (inputDir, file.replace("rows", "cols")), dtype=np.int32) + col = loadtxt('./%s/%s' % (inputDir, inputFile.replace("rows", "cols")), dtype=np.int32) if col.size == 1: col = np.atleast_1d(col) Xi = coo_matrix((ones(row.size),(row,col)), shape=(dim,dim), dtype=np.uint8).tolil() @@ -296,4 +297,7 @@ def __projectSlices(X, Q): savetxt(outputEntities, A) V = result[5] savetxt(outputTerms, V.T) - +R = result[1] +with file(outputFactors, 'w') as outfile: + for i in xrange(len(R)): + savetxt(outfile, R[i])
 @@ -1,9 +1,6 @@ import numpy as np from numpy import dot, zeros, eye, empty from numpy.linalg import inv -from commonFunctions import reservoir -from itertools import product -from numpy.random import random_integers def updateA(X, A, R, V, D, lmbda): n, rank = A.shape @@ -35,19 +32,5 @@ def matrixFitNormElement(i, j, D, A, V): """ return (D[i,j] - dot(A[i,:], V[:, j]))**2 -def checkingIndices(M, ratio = 1): - """ - Returns the indices for computing fit values - based on non-zero values as well as sample indices - (the sample size is proportional to the given ratio ([0,1]) and number of matrix columns) - """ - rowSize, colSize = M.shape - nonzeroRows, nonzeroCols = M.nonzero() - nonzeroIndices = [(nonzeroRows[i], nonzeroCols[i]) for i in range(len(nonzeroRows))] - sampledRows = random_integers(0, rowSize - 1, round(ratio*colSize)) - sampledCols = random_integers(0, colSize - 1, round(ratio*colSize)) - sampledIndices = zip(sampledRows, sampledCols) - indices = list(set(sampledIndices + nonzeroIndices)) - return indices
 @@ -1,8 +1,7 @@ from scipy.sparse import coo_matrix from numpy import ones, dot, eye import numpy as np -from extrescalFunctions import updateA, updateV, matrixFitNormElement,\ - checkingIndices +from extrescalFunctions import updateA, updateV, matrixFitNormElement from nose.tools import assert_almost_equal from numpy.linalg import inv from numpy.linalg.linalg import norm @@ -81,8 +80,4 @@ def testMatrixFitNorm(): fit += matrixFitNormElement(i, j, D, A, V) assert_almost_equal(fit, expectedNorm) -def testCheckingIndices(): - D = coo_matrix((ones(6),([0, 1, 2, 3, 4, 5], [0, 1, 1, 2, 3, 3])), shape=(6, 4), dtype=np.uint8).tocsr() - indices = checkingIndices(D) - assert len(indices) >= 6