/
svd_error.py
executable file
·181 lines (147 loc) · 7.11 KB
/
svd_error.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
"""USAGE: %(program)s MATRIX.mm [CLIP_DOCS] [CLIP_TERMS]
Check truncated SVD error for the algo in gensim, using a given corpus. This script
runs the decomposition with several internal parameters (number of requested factors,
iterative chunk size) and reports error for each parameter combination.
The number of input documents is clipped to the first CLIP_DOCS. Similarly,
only the first CLIP_TERMS are considered (features with id >= CLIP_TERMS are
ignored, effectively restricting the vocabulary size). If you don't specify them,
the entire matrix will be used.
Example: ./svd_error.py ~/gensim/results/wiki_en_v10k.mm.bz2 100000 10000
"""
from __future__ import with_statement
import logging
import os, sys, time
import bz2
import itertools
import numpy
import scipy.linalg
import gensim
try:
from sparsesvd import sparsesvd
except ImportError:
# no SVDLIBC: install with `easy_install sparsesvd` if you want SVDLIBC results as well
sparsesvd = None
sparsesvd = None # don't use SVDLIBC
FACTORS = [300] # which num_topics to try
CHUNKSIZE = [10000, 1000] # which chunksize to try
POWER_ITERS = [0, 1, 2, 4, 6] # extra power iterations for the randomized algo
# when reporting reconstruction error, also report spectral norm error? (very slow)
COMPUTE_NORM2 = False
def norm2(a):
"""Spectral norm ("norm 2") of a symmetric matrix `a`."""
if COMPUTE_NORM2:
logging.info("computing spectral norm of a %s matrix" % str(a.shape))
return scipy.linalg.eigvalsh(a).max() # much faster than numpy.linalg.norm(2)
else:
return numpy.nan
def rmse(diff):
return numpy.sqrt(1.0 * numpy.multiply(diff, diff).sum() / diff.size)
def print_error(name, aat, u, s, ideal_nf, ideal_n2):
err = -numpy.dot(u, numpy.dot(numpy.diag(s), u.T))
err += aat
nf, n2 = numpy.linalg.norm(err), norm2(err)
print ('%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' %
(name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err)))
sys.stdout.flush()
class ClippedCorpus(object):
def __init__(self, corpus, max_docs, max_terms):
self.corpus = corpus
self.max_docs, self.max_terms = max_docs, max_terms
def __iter__(self):
for doc in itertools.islice(self.corpus, self.max_docs):
yield [(f, w) for f, w in doc if f < self.max_terms]
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))
program = os.path.basename(sys.argv[0])
# do we have enough cmd line arguments?
if len(sys.argv) < 2:
print globals()["__doc__"] % locals()
sys.exit(1)
fname = sys.argv[1]
if fname.endswith('bz2'):
mm = gensim.corpora.MmCorpus(bz2.BZ2File(fname))
else:
mm = gensim.corpora.MmCorpus(fname)
# extra cmd parameters = use a subcorpus (fewer docs, smaller vocab)
if len(sys.argv) > 2:
n = int(sys.argv[2])
else:
n = mm.num_docs
if len(sys.argv) > 3:
m = int(sys.argv[3])
else:
m = mm.num_terms
logging.info("using %i documents and %i features" % (n, m))
corpus = ClippedCorpus(mm, n, m)
id2word = gensim.utils.FakeDict(m)
logging.info("computing corpus * corpus^T") # eigenvalues of this matrix are singular values of `corpus`, squared
aat = numpy.zeros((m, m), dtype=numpy.float64)
for chunk in gensim.utils.grouper(corpus, chunksize=5000):
num_nnz = sum(len(doc) for doc in chunk)
chunk = gensim.matutils.corpus2csc(chunk, num_nnz=num_nnz, num_terms=m, num_docs=len(chunk), dtype=numpy.float32)
chunk = chunk * chunk.T
chunk = chunk.toarray()
aat += chunk
del chunk
logging.info("computing full decomposition of corpus * corpus^t")
aat = aat.astype(numpy.float32)
spectrum_s, spectrum_u = scipy.linalg.eigh(aat)
spectrum_s = spectrum_s[::-1] # re-order to descending eigenvalue order
spectrum_u = spectrum_u.T[::-1].T
numpy.save(fname + '.spectrum.npy', spectrum_s)
for factors in FACTORS:
err = -numpy.dot(spectrum_u[:, :factors], numpy.dot(numpy.diag(spectrum_s[:factors]), spectrum_u[:, :factors].T))
err += aat
ideal_fro = numpy.linalg.norm(err)
del err
ideal_n2 = spectrum_s[factors + 1]
print '*' * 40, "%i factors, ideal error norm_frobenius=%f, norm_2=%f" % (factors, ideal_fro, ideal_n2)
print "*" * 30,
print_error("baseline", aat,
numpy.zeros((m, factors)), numpy.zeros((factors)), ideal_fro, ideal_n2)
if sparsesvd:
logging.info("computing SVDLIBC SVD for %i factors" % (factors))
taken = time.time()
corpus_ram = gensim.matutils.corpus2csc(corpus, num_terms=m)
ut, s, vt = sparsesvd(corpus_ram, factors)
taken = time.time() - taken
del corpus_ram
del vt
u, s = ut.T.astype(numpy.float32), s.astype(numpy.float32)**2 # convert singular values to eigenvalues
del ut
print "SVDLIBC SVD for %i factors took %s s (spectrum %f .. %f)" % (factors, taken, s[0], s[-1])
print_error("SVDLIBC", aat, u, s, ideal_fro, ideal_n2)
del u
for power_iters in POWER_ITERS:
for chunksize in CHUNKSIZE:
logging.info("computing incremental SVD for %i factors, %i power iterations, chunksize %i" %
(factors, power_iters, chunksize))
taken = time.time()
gensim.models.lsimodel.P2_EXTRA_ITERS = power_iters
model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors,
chunksize=chunksize, power_iters=power_iters)
taken = time.time() - taken
u, s = model.projection.u.astype(numpy.float32), model.projection.s.astype(numpy.float32)**2
del model
print ("incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. %f)" %
(factors, power_iters, chunksize, taken, s[0], s[-1]))
print_error('incremental SVD', aat, u, s, ideal_fro, ideal_n2)
del u
logging.info("computing multipass SVD for %i factors, %i power iterations" %
(factors, power_iters,))
taken = time.time()
model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors, chunksize=2000,
onepass=False, power_iters=power_iters)
taken = time.time() - taken
u, s = model.projection.u.astype(numpy.float32), model.projection.s.astype(numpy.float32)**2
del model
print ("multipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)" %
(factors, power_iters, taken, s[0], s[-1]))
print_error('multipass SVD', aat, u, s, ideal_fro, ideal_n2)
del u
logging.info("finished running %s" % program)