
Merge branch 'release-0.3'

commit 928fe69bbad2874e67a43c550574b3b36b19cfa2 (2 parents: 96079e1 + b51033a)
@nzhiltsov authored
26 README.md
@@ -8,7 +8,7 @@ Ext-RESCAL is a memory efficient implementation of [RESCAL](http://www.cip.ifi.l
Current Version
------------
-[0.2](https://github.com/nzhiltsov/Ext-RESCAL/archive/0.2.zip)
+[0.3](https://github.com/nzhiltsov/Ext-RESCAL/archive/0.3.zip)
Features
------------
@@ -37,15 +37,26 @@ Prerequisites
Usage Examples
----------------------
-1) Run the RESCAL algorithm to decompose a 3-D tensor with 2 latent components and zero regularization on the test data:
+1) Let's imagine we have the following semantic graph:
-<pre>python rescal.py --latent 2 --lmbda 0 --input tiny-example --outputentities entity.embeddings.csv --log rescal.log</pre>
+![semantic-graph](tiny-mixed-example/semantic-graph.png)
+
+Each tensor slice represents an adjacency matrix of the corresponding predicate (member-of, genre, cites). Run the RESCAL algorithm to decompose a 3-D tensor with 2 latent components and zero regularization on the test data:
+
+<pre>python rescal.py --latent 2 --lmbda 0 --input tiny-example --outputentities entity.embeddings.csv --outputfactors latent.factors.csv --log rescal.log</pre>
The test data set represents a tiny entity graph of 3 adjacency matrices (tensor slices) in the row-column representation. See the directory <i>tiny-example</i>. Ext-RESCAL will output the latent factors for the entities into the file <i>entity.embeddings.csv</i>.
-2) Run the extended version of RESCAL algorithm to decompose a 3-D tensor and 2-D matrix with 2 latent components and regularizer equal to 0.001 on the test data (entity graph and entity-term matrix):
+2) Then, we assume that there is an entity-term matrix:
+
+![entity-term-matrix](tiny-mixed-example/entity-term-matrix.png)
+
+Run the extended version of the RESCAL algorithm to decompose a 3-D tensor and a 2-D matrix with 2 latent components and a regularizer equal to 0.001 on the test data (entity graph and entity-term matrix):
-<pre>python extrescal.py --latent 2 --lmbda 0.001 --input tiny-mixed-example --outputentities entity.embeddings.csv --outputterms term.embeddings.csv --log extrescal.log</pre>
+<pre>python extrescal.py --latent 2 --lmbda 0.001 --input tiny-mixed-example --outputentities entity.embeddings.csv --outputterms term.embeddings.csv --outputfactors latent.factors.csv --log extrescal.log</pre>
+
+If we plot the resulting embeddings, we would get the following picture, which reveals the similarity of entities and words in the latent space:
+![latent-space-visualization](tiny-mixed-example/TinyMixedExample.png)
Development and Contribution
----------------------
@@ -55,6 +66,11 @@ This is a fork of the original code base provided by [Maximilian Nickel](http://
Release Notes
------------
+0.3 (March 12, 2013):
+
+* Fix random sampling for the basic task
+* Add output of latent factors
+
0.2 (February 26, 2013):
* Add an opportunity to approximate the objective function via random sampling
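
The README above refers to plotting the resulting embeddings. A minimal sketch of how such a plot could be produced with matplotlib, assuming 2 latent components and the output file names from the commands above (the figure name is hypothetical):

<pre>
import numpy as np
import matplotlib.pyplot as plt

# Each row of the output files is a 2-D latent embedding (rank = 2).
entities = np.loadtxt('entity.embeddings.csv')
terms = np.loadtxt('term.embeddings.csv')

plt.scatter(entities[:, 0], entities[:, 1], c='b', label='entities')
plt.scatter(terms[:, 0], terms[:, 1], c='r', marker='x', label='terms')
plt.legend()
plt.savefig('latent-space.png')
</pre>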
17 commonFunctions.py
@@ -1,6 +1,6 @@
from numpy import dot
from numpy.random import randint
-from itertools import ifilter
+from numpy.random import random_integers
def squareFrobeniusNormOfSparse(M):
"""
@@ -26,3 +26,18 @@ def reservoir(it, k):
if j < k:
ls[j] = x
return ls
+
+def checkingIndices(M, ratio = 1):
+ """
+ Returns the indices for computing fit values
+ based on non-zero values as well as sample indices
+ (the sample size is proportional to the given ratio ([0,1]) and number of matrix columns)
+ """
+ rowSize, colSize = M.shape
+ nonzeroRows, nonzeroCols = M.nonzero()
+ nonzeroIndices = [(nonzeroRows[i], nonzeroCols[i]) for i in range(len(nonzeroRows))]
+ sampledRows = random_integers(0, rowSize - 1, round(ratio*colSize))
+ sampledCols = random_integers(0, colSize - 1, round(ratio*colSize))
+ sampledIndices = zip(sampledRows, sampledCols)
+ indices = list(set(sampledIndices + nonzeroIndices))
+ return indices
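
A small usage sketch of the new checkingIndices helper on a hypothetical sparse matrix (not part of the repository's test suite):

<pre>
import numpy as np
from numpy import ones
from scipy.sparse import coo_matrix
from commonFunctions import checkingIndices

# 3 non-zero entries; with ratio=1, about colSize random (row, col) pairs are
# sampled and merged with the non-zero positions (duplicates removed by set()).
D = coo_matrix((ones(3), ([0, 1, 2], [0, 1, 1])), shape=(3, 3), dtype=np.uint8).tocsr()
indices = checkingIndices(D, ratio=1)
assert len(indices) >= 3  # the non-zero positions are always included
</pre>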
8 commonFunctionsTest.py
@@ -1,7 +1,7 @@
from numpy import ones, dot
import numpy as np
from scipy.sparse import coo_matrix
-from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, reservoir
+from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, reservoir, checkingIndices
from numpy.linalg.linalg import norm
from nose.tools import assert_almost_equal
from itertools import product
@@ -42,4 +42,10 @@ def testSampling():
assert checkedElements.count(sampledElements[i]) == 0
checkedElements.append(sampledElements[i])
assert len(checkedElements) == len(sampledElements)
+
+def testCheckingIndices():
+ D = coo_matrix((ones(6),([0, 1, 2, 3, 4, 5], [0, 1, 1, 2, 3, 3])), shape=(6, 4), dtype=np.uint8).tocsr()
+ indices = checkingIndices(D)
+ assert len(indices) >= 6
+
28 extrescal.py
@@ -7,8 +7,8 @@
import numpy as np
import os
import fnmatch
-from commonFunctions import squareFrobeniusNormOfSparse, fitNorm
-from extrescalFunctions import updateA, updateV, matrixFitNormElement, checkingIndices
+from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, checkingIndices
+from extrescalFunctions import updateA, updateV, matrixFitNormElement
__DEF_MAXITER = 50
__DEF_PREHEATNUM = 1
@@ -18,7 +18,7 @@
__DEF_LMBDA = 0
__DEF_EXACT_FIT = False
__DEF_MATRIX_FIT_SAMPLE_RATIO = 1
-__DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO = 0.1
+__DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO = 1
def rescal(X, D, rank, **kwargs):
"""
@@ -82,8 +82,8 @@ def rescal(X, D, rank, **kwargs):
lmbda = kwargs.pop('lmbda', __DEF_LMBDA)
preheatnum = kwargs.pop('preheatnum', __DEF_PREHEATNUM)
exactfit = kwargs.pop('exactfit', __DEF_EXACT_FIT)
- matrixSampleRatio = kwargs.pop('matrixSampleRation', __DEF_MATRIX_FIT_SAMPLE_RATIO)
- tensorSliceSampleRatio = kwargs.pop('tensorSliceSampleRation', __DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO)
+ matrixSampleRatio = kwargs.pop('matrixSampleRatio', __DEF_MATRIX_FIT_SAMPLE_RATIO)
+ tensorSliceSampleRatio = kwargs.pop('tensorSliceSampleRatio', __DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO)
if not len(kwargs) == 0:
raise ValueError( 'Unknown keywords (%s)' % (kwargs.keys()) )
@@ -222,7 +222,6 @@ def __updateR(X, A, lmbda):
def __projectSlices(X, Q):
- q = Q.shape[1]
X2 = []
for i in xrange(len(X)):
X2.append( dot(Q.T, X[i].dot(Q)) )
@@ -234,6 +233,7 @@ def __projectSlices(X, Q):
parser.add_argument("--input", type=str, help="the directory, where the input data are stored", required=True)
parser.add_argument("--outputentities", type=str, help="the file, where the latent embedding for entities will be output", required=True)
parser.add_argument("--outputterms", type=str, help="the file, where the latent embedding for terms will be output", required=True)
+parser.add_argument("--outputfactors", type=str, help="the file, where the latent factors will be output", required=True)
parser.add_argument("--log", type=str, help="log file", required=True)
args = parser.parse_args()
numLatentComponents = args.latent
@@ -241,6 +241,7 @@ def __projectSlices(X, Q):
regularizationParam = args.lmbda
outputEntities = args.outputentities
outputTerms = args.outputterms
+outputFactors = args.outputfactors
logFile = args.log
logging.basicConfig(filename=logFile, filemode='w', level=logging.DEBUG)
@@ -250,19 +251,19 @@ def __projectSlices(X, Q):
dim = 0
with open('./%s/entity-ids' % inputDir) as entityIds:
for line in entityIds:
- dim += 1
+ dim += 1
print 'The number of entities: %d' % dim
numSlices = 0
numNonzeroTensorEntries = 0
X = []
-for file in os.listdir('./%s' % inputDir):
- if fnmatch.fnmatch(file, '[0-9]*-rows'):
+for inputFile in os.listdir('./%s' % inputDir):
+ if fnmatch.fnmatch(inputFile, '[0-9]*-rows'):
numSlices += 1
- row = loadtxt('./%s/%s' % (inputDir, file), dtype=np.int32)
+ row = loadtxt('./%s/%s' % (inputDir, inputFile), dtype=np.int32)
if row.size == 1:
row = np.atleast_1d(row)
- col = loadtxt('./%s/%s' % (inputDir, file.replace("rows", "cols")), dtype=np.int32)
+ col = loadtxt('./%s/%s' % (inputDir, inputFile.replace("rows", "cols")), dtype=np.int32)
if col.size == 1:
col = np.atleast_1d(col)
Xi = coo_matrix((ones(row.size),(row,col)), shape=(dim,dim), dtype=np.uint8).tolil()
@@ -296,4 +297,7 @@ def __projectSlices(X, Q):
savetxt(outputEntities, A)
V = result[5]
savetxt(outputTerms, V.T)
-
+R = result[1]
+with file(outputFactors, 'w') as outfile:
+ for i in xrange(len(R)):
+ savetxt(outfile, R[i])
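
The factors file written above contains the slices of R stacked vertically by the repeated savetxt calls. A sketch of reading them back, assuming 2 latent components and the file name from the README usage example:

<pre>
import numpy as np

# Each slice of R is rank x rank; with rank = 2, the stacked rows can be
# reshaped back into an array of shape (numSlices, 2, 2).
stacked = np.loadtxt('latent.factors.csv')
R = stacked.reshape(-1, 2, 2)
</pre>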
17 extrescalFunctions.py
@@ -1,9 +1,6 @@
import numpy as np
from numpy import dot, zeros, eye, empty
from numpy.linalg import inv
-from commonFunctions import reservoir
-from itertools import product
-from numpy.random import random_integers
def updateA(X, A, R, V, D, lmbda):
n, rank = A.shape
@@ -35,19 +32,5 @@ def matrixFitNormElement(i, j, D, A, V):
"""
return (D[i,j] - dot(A[i,:], V[:, j]))**2
-def checkingIndices(M, ratio = 1):
- """
- Returns the indices for computing fit values
- based on non-zero values as well as sample indices
- (the sample size is proportional to the given ratio ([0,1]) and number of matrix columns)
- """
- rowSize, colSize = M.shape
- nonzeroRows, nonzeroCols = M.nonzero()
- nonzeroIndices = [(nonzeroRows[i], nonzeroCols[i]) for i in range(len(nonzeroRows))]
- sampledRows = random_integers(0, rowSize - 1, round(ratio*colSize))
- sampledCols = random_integers(0, colSize - 1, round(ratio*colSize))
- sampledIndices = zip(sampledRows, sampledCols)
- indices = list(set(sampledIndices + nonzeroIndices))
- return indices
7 extrescalFunctionsTest.py
@@ -1,8 +1,7 @@
from scipy.sparse import coo_matrix
from numpy import ones, dot, eye
import numpy as np
-from extrescalFunctions import updateA, updateV, matrixFitNormElement,\
- checkingIndices
+from extrescalFunctions import updateA, updateV, matrixFitNormElement
from nose.tools import assert_almost_equal
from numpy.linalg import inv
from numpy.linalg.linalg import norm
@@ -81,8 +80,4 @@ def testMatrixFitNorm():
fit += matrixFitNormElement(i, j, D, A, V)
assert_almost_equal(fit, expectedNorm)
-def testCheckingIndices():
- D = coo_matrix((ones(6),([0, 1, 2, 3, 4, 5], [0, 1, 1, 2, 3, 3])), shape=(6, 4), dtype=np.uint8).tocsr()
- indices = checkingIndices(D)
- assert len(indices) >= 6
49 rescal.py
@@ -1,5 +1,5 @@
import logging, time, argparse
-from numpy import dot, zeros, empty, kron, array, eye, argmin, ones, savetxt, loadtxt
+from numpy import dot, zeros, kron, array, eye, ones, savetxt, loadtxt
from numpy.linalg import qr, pinv, norm, inv
from numpy.random import rand
from scipy import sparse
@@ -7,9 +7,8 @@
import numpy as np
import os
import fnmatch
-from commonFunctions import squareFrobeniusNormOfSparse, fitNorm
+from commonFunctions import squareFrobeniusNormOfSparse, fitNorm, checkingIndices
-__version__ = "0.1"
__DEF_MAXITER = 50
__DEF_PREHEATNUM = 1
@@ -18,6 +17,7 @@
__DEF_CONV = 1e-5
__DEF_LMBDA = 0
__DEF_EXACT_FIT = False
+__DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO = 1
def rescal(X, rank, **kwargs):
"""
@@ -78,6 +78,7 @@ def rescal(X, rank, **kwargs):
lmbda = kwargs.pop('lmbda', __DEF_LMBDA)
preheatnum = kwargs.pop('preheatnum', __DEF_PREHEATNUM)
exactfit = kwargs.pop('exactfit', __DEF_EXACT_FIT)
+ tensorSliceSampleRatio = kwargs.pop('tensorSliceSampleRatio', __DEF_TENSOR_SLICE_FIT_SAMPLE_RATIO)
if not len(kwargs) == 0:
raise ValueError( 'Unknown keywords (%s)' % (kwargs.keys()) )
@@ -119,8 +120,14 @@ def rescal(X, rank, **kwargs):
# compute factorization
fit = fitchange = fitold = 0
exectimes = []
+
+ if exactfit:
+ tensorFitIndices = []
+ else :
+ tensorFitIndices = [checkingIndices(M, ratio = tensorSliceSampleRatio) for M in X]
+ _log.debug('[Algorithm] Finished sampling of indices to compute the fit values.')
- for iter in xrange(maxIter):
+ for iterNum in xrange(maxIter):
tic = time.clock()
A = updateA(X, A, R, lmbda)
@@ -136,7 +143,7 @@ def rescal(X, rank, **kwargs):
fit = 0
regularizedFit = 0
regRFit = 0
- if iter > preheatnum:
+ if iterNum > preheatnum:
if lmbda != 0:
for i in xrange(len(R)):
regRFit += norm(R[i])**2
@@ -148,9 +155,10 @@ def rescal(X, rank, **kwargs):
else :
for i in xrange(len(R)):
ARk = dot(A, R[i])
- Xrow, Xcol = X[i].nonzero()
- for rr in xrange(len(Xrow)):
- fit += fitNorm(Xrow[rr], Xcol[rr], X[i], ARk, A)
+ iTensorFitIndices = tensorFitIndices[i]
+ for rr in xrange(len(iTensorFitIndices)):
+ m, l = iTensorFitIndices[rr]
+ fit += fitNorm(m, l, X[i], ARk, A)
fit *= 0.5
fit += regularizedFit
fit /= sumNormX
@@ -160,13 +168,13 @@ def rescal(X, rank, **kwargs):
toc = time.clock()
exectimes.append( toc - tic )
fitchange = abs(fitold - fit)
- _log.debug('[%3d] total fit: %.10f | delta: %.10f | secs: %.5f' % (iter,
+ _log.debug('[%3d] total fit: %.10f | delta: %.10f | secs: %.5f' % (iterNum,
fit, fitchange, exectimes[-1]))
fitold = fit
- if iter > preheatnum and fitchange < conv:
+ if iterNum > preheatnum and fitchange < conv:
break
- return A, R, fit, iter+1, array(exectimes)
+ return A, R, fit, iterNum+1, array(exectimes)
def updateA(X, A, R, lmbda):
n, rank = A.shape
@@ -199,7 +207,6 @@ def __updateR(X, A, lmbda):
return R
def __projectSlices(X, Q):
- q = Q.shape[1]
X2 = []
for i in xrange(len(X)):
X2.append( dot(Q.T, X[i].dot(Q)) )
@@ -210,12 +217,14 @@ def __projectSlices(X, Q):
parser.add_argument("--lmbda", type=float, help="regularization parameter", required=True)
parser.add_argument("--input", type=str, help="the directory, where the input data are stored", required=True)
parser.add_argument("--outputentities", type=str, help="the file, where the latent embedding for entities will be output", required=True)
+parser.add_argument("--outputfactors", type=str, help="the file, where the latent factors will be output", required=True)
parser.add_argument("--log", type=str, help="log file", required=True)
args = parser.parse_args()
numLatentComponents = args.latent
inputDir = args.input
regularizationParam = args.lmbda
outputEntities = args.outputentities
+outputFactors = args.outputfactors
logFile = args.log
logging.basicConfig(filename=logFile, filemode='w', level=logging.DEBUG)
@@ -224,19 +233,19 @@ def __projectSlices(X, Q):
dim = 0
with open('./%s/entity-ids' % inputDir) as entityIds:
for line in entityIds:
- dim += 1
+ dim += 1
print 'The number of entities: %d' % dim
numSlices = 0
numNonzeroTensorEntries = 0
X = []
-for file in os.listdir('./%s' % inputDir):
- if fnmatch.fnmatch(file, '[0-9]*-rows'):
+for inputFile in os.listdir('./%s' % inputDir):
+ if fnmatch.fnmatch(inputFile, '[0-9]*-rows'):
numSlices += 1
- row = loadtxt('./%s/%s' % (inputDir, file), dtype=np.int32)
+ row = loadtxt('./%s/%s' % (inputDir, inputFile), dtype=np.int32)
if row.size == 1:
row = np.atleast_1d(row)
- col = loadtxt('./%s/%s' % (inputDir, file.replace("rows", "cols")), dtype=np.int32)
+ col = loadtxt('./%s/%s' % (inputDir, inputFile.replace("rows", "cols")), dtype=np.int32)
if col.size == 1:
col = np.atleast_1d(col)
Xi = coo_matrix((ones(row.size),(row,col)), shape=(dim,dim), dtype=np.uint8).tolil()
@@ -249,7 +258,11 @@ def __projectSlices(X, Q):
result = rescal(X, numLatentComponents, init='random', lmbda=regularizationParam)
print 'Objective function value: %.30f' % result[2]
print '# of iterations: %d' % result[3]
-#print the matrix of latent embeddings
+#print the matrix of latent embeddings and matrix of latent factors
A = result[0]
savetxt(outputEntities, A)
+R = result[1]
+with file(outputFactors, 'w') as outfile:
+ for i in xrange(len(R)):
+ savetxt(outfile, R[i])
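
Based on the loading loop above, each tensor slice i is stored as a pair of files i-rows and i-cols listing the row and column indices of its non-zero entries, alongside an entity-ids file with one entity per line. A hypothetical sketch of writing one such slice (file names are illustrative):

<pre>
import numpy as np

# Slice 0 with non-zero entries at (0, 2) and (1, 2); one index per line,
# matching the loadtxt calls in the loading loop above.
rows = np.array([0, 1], dtype=np.int32)
cols = np.array([2, 2], dtype=np.int32)
np.savetxt('tiny-example/0-rows', rows, fmt='%d')
np.savetxt('tiny-example/0-cols', cols, fmt='%d')
</pre>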
BIN  tiny-mixed-example/TinyMixedExample.png
BIN  tiny-mixed-example/entity-term-matrix.png
8 tiny-mixed-example/semantic-graph.dot
@@ -0,0 +1,8 @@
+digraph semantic_graph {
+"dbr:Vibeke" -> "dbr:Tristania" [label="member-of"];
+"dbr:Morten" -> "dbr:Tristania" [label="member-of"];
+"dbr:Tristania" -> "dbr:Metal" [label="genre"];
+"author1" -> "author2" [label="cites"];
+"author1" -> "author1" [label="cites"];
+"author2" -> "author2" [label="cites"];
+}
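
The semantic-graph.png added below was presumably rendered from this DOT source; with Graphviz installed, an equivalent rendering would be:

<pre>dot -Tpng tiny-mixed-example/semantic-graph.dot -o tiny-mixed-example/semantic-graph.png</pre>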
BIN  tiny-mixed-example/semantic-graph.png