-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
_matutils.pyx
359 lines (270 loc) · 8.76 KB
/
_matutils.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
#!/usr/bin/env cython
# coding: utf-8
# cython: embedsignature=True
from __future__ import division
cimport cython
import numpy as np
cimport numpy as np
ctypedef cython.floating DTYPE_t
from libc.math cimport log, exp, fabs
from cython.parallel import prange
def mean_absolute_difference(a, b):
    """Mean absolute difference between two arrays, using :func:`~gensim._matutils._mean_absolute_difference`.

    Parameters
    ----------
    a : numpy.ndarray
        Input 1d array, supports float16, float32 and float64.
    b : numpy.ndarray
        Input 1d array with the same shape as `a`. For float64/float32 input it is handed to the
        typed kernel unchanged, so it has to share the dtype of `a`; for float16 input both arrays
        are converted to float32 first.

    Returns
    -------
    float
        mean(abs(a - b)).

    Raises
    ------
    ValueError
        If `a` and `b` have different shapes.
    TypeError
        If the dtype of `a` is not float16, float32 or float64.

    """
    if a.shape != b.shape:
        raise ValueError("a and b must have same shape")

    if a.dtype == np.float64:
        return _mean_absolute_difference[double](a, b)
    elif a.dtype == np.float32:
        return _mean_absolute_difference[float](a, b)
    elif a.dtype == np.float16:
        # There is no half-precision specialization: compute in float32 instead.
        return _mean_absolute_difference[float](a.astype(np.float32), b.astype(np.float32))

    # Without this, any other dtype fell through all branches and silently returned None.
    raise TypeError("unsupported dtype %s, expected float16, float32 or float64" % (a.dtype,))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef DTYPE_t _mean_absolute_difference(DTYPE_t[:] a, DTYPE_t[:] b) nogil:
    """Mean absolute difference between two arrays.

    Parameters
    ----------
    a : numpy.ndarray
        Input 1d array; its length drives the loop.
    b : numpy.ndarray
        Input 1d array, indexed with the same positions as `a`.

    Returns
    -------
    DTYPE_t
        mean(abs(a - b))

    Notes
    -----
    Bounds checking is disabled, so `b` must hold at least as many elements as `a`.
    No zero-length check is made: with ``cdivision`` enabled an empty `a` ends in a plain
    C floating-point division by zero.

    """
    cdef size_t length = a.shape[0]
    cdef size_t idx
    cdef DTYPE_t total = 0.0

    # Accumulate |a[k] - b[k]| over every position.
    for idx in range(length):
        total += fabs(a[idx] - b[idx])

    total /= length
    return total
def logsumexp(x):
    """Log of sum of exponentials, using :func:`~gensim._matutils._logsumexp_2d`.

    Parameters
    ----------
    x : numpy.ndarray
        Input 2d matrix, supports float16, float32 and float64.

    Returns
    -------
    float
        log of sum of exponentials of elements in `x`.

    Raises
    ------
    TypeError
        If the dtype of `x` is not float16, float32 or float64.

    Warnings
    --------
    For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.

    """
    if x.dtype == np.float64:
        return _logsumexp_2d[double](x)
    elif x.dtype == np.float32:
        return _logsumexp_2d[float](x)
    elif x.dtype == np.float16:
        # There is no half-precision specialization: compute in float32 instead.
        return _logsumexp_2d[float](x.astype(np.float32))

    # Without this, any other dtype fell through all branches and silently returned None.
    raise TypeError("unsupported dtype %s, expected float16, float32 or float64" % (x.dtype,))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef DTYPE_t _logsumexp_2d(DTYPE_t[:, :] data) nogil:
    """Log of sum of exponentials.

    Parameters
    ----------
    data : numpy.ndarray
        Input 2d matrix.

    Returns
    -------
    DTYPE_t
        log of sum of exponentials of elements in `data`; for a matrix without any elements
        this is log(0), i.e. negative infinity.

    """
    cdef DTYPE_t max_val
    cdef DTYPE_t result = 0.0
    cdef size_t i
    cdef size_t j
    cdef size_t I = data.shape[0]
    cdef size_t J = data.shape[1]

    # Bounds checking is off, so reading data[0, 0] of an empty matrix would touch memory
    # outside the buffer. The sum over no elements is 0 (the value of `result` here),
    # hence the answer is log(0).
    if I == 0 or J == 0:
        return log(result)

    max_val = data[0, 0]

    # First pass: find the maximum, used below to keep exp() from overflowing.
    for i in range(I):
        for j in range(J):
            if data[i, j] > max_val:
                max_val = data[i, j]

    # Second pass: sum the exponentials of the shifted values.
    for i in range(I):
        for j in range(J):
            result += exp(data[i, j] - max_val)

    # Undo the shift in log space.
    result = log(result) + max_val
    return result
def dirichlet_expectation(alpha):
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

    Dispatches on the number of dimensions to :func:`~gensim._matutils.dirichlet_expectation_2d`
    (exactly two dimensions) or :func:`~gensim._matutils.dirichlet_expectation_1d` (anything else).

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector,
        supports float16, float32 and float64.

    Returns
    -------
    numpy.ndarray
        Log of expected values, dimension same as `alpha.ndim`.

    """
    if alpha.ndim != 2:
        return dirichlet_expectation_1d(alpha)
    return dirichlet_expectation_2d(alpha)
def dirichlet_expectation_2d(alpha):
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
    Using :func:`~gensim._matutils._dirichlet_expectation_2d`.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 2d matrix, each row is treated as a separate parameter vector,
        supports float16, float32 and float64.

    Returns
    -------
    numpy.ndarray
        Log of expected values, 2d matrix with the same shape and dtype as `alpha`.

    Raises
    ------
    TypeError
        If the dtype of `alpha` is not float16, float32 or float64.

    """
    if alpha.dtype == np.float64:
        out = np.zeros(alpha.shape, dtype=alpha.dtype)
        _dirichlet_expectation_2d[double](alpha, out)
    elif alpha.dtype == np.float32:
        out = np.zeros(alpha.shape, dtype=alpha.dtype)
        _dirichlet_expectation_2d[float](alpha, out)
    elif alpha.dtype == np.float16:
        # There is no half-precision specialization: compute in float32, then cast the result back.
        out = np.zeros(alpha.shape, dtype=np.float32)
        _dirichlet_expectation_2d[float](alpha.astype(np.float32), out)
        out = out.astype(np.float16)
    else:
        # Without this branch `out` was never assigned and the return below failed
        # with an UnboundLocalError that did not mention the actual problem.
        raise TypeError("unsupported dtype %s, expected float16, float32 or float64" % (alpha.dtype,))

    return out
def dirichlet_expectation_1d(alpha):
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
    Using :func:`~gensim._matutils._dirichlet_expectation_1d`.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 1d vector, supports float16, float32 and float64.

    Returns
    -------
    numpy.ndarray
        Log of expected values, 1d vector with the same shape and dtype as `alpha`.

    Raises
    ------
    TypeError
        If the dtype of `alpha` is not float16, float32 or float64.

    """
    if alpha.dtype == np.float64:
        out = np.zeros(alpha.shape, dtype=alpha.dtype)
        _dirichlet_expectation_1d[double](alpha, out)
    elif alpha.dtype == np.float32:
        out = np.zeros(alpha.shape, dtype=alpha.dtype)
        _dirichlet_expectation_1d[float](alpha, out)
    elif alpha.dtype == np.float16:
        # There is no half-precision specialization: compute in float32, then cast the result back.
        out = np.zeros(alpha.shape, dtype=np.float32)
        _dirichlet_expectation_1d[float](alpha.astype(np.float32), out)
        out = out.astype(np.float16)
    else:
        # Without this branch `out` was never assigned and the return below failed
        # with an UnboundLocalError that did not mention the actual problem.
        raise TypeError("unsupported dtype %s, expected float16, float32 or float64" % (alpha.dtype,))

    return out
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _dirichlet_expectation_1d(DTYPE_t[:] alpha, DTYPE_t[:] out) nogil:
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

    Writes ``digamma(alpha[k]) - digamma(sum(alpha))`` into ``out[k]`` for every position `k`.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 1d vector.
    out : numpy.ndarray
        Output array, contains log of expected values. Bounds checking is disabled,
        so it must hold at least as many elements as `alpha`.

    """
    cdef size_t length = alpha.shape[0]
    cdef size_t k
    cdef DTYPE_t total = 0.0
    cdef DTYPE_t psi_total = 0.0

    # Sum of all parameters.
    for k in range(length):
        total += alpha[k]

    # digamma of the sum is shared by every output element, so evaluate it once.
    psi_total = _digamma(total)

    for k in range(length):
        out[k] = _digamma(alpha[k]) - psi_total
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _dirichlet_expectation_2d(DTYPE_t[:, :] alpha, DTYPE_t[:, :] out) nogil:
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

    For every row `r`, writes ``digamma(alpha[r, c]) - digamma(sum(alpha[r, :]))`` into ``out[r, c]``.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter matrix, each row is treated as a parameter vector for its own Dirichlet.
    out : numpy.ndarray
        Log of expected values, 2d matrix. Bounds checking is disabled, so it must be
        at least as large as `alpha` along both axes.

    """
    cdef size_t n_rows = alpha.shape[0]
    cdef size_t n_cols = alpha.shape[1]
    cdef size_t row, col
    cdef DTYPE_t row_total = 0.0
    cdef DTYPE_t psi_row_total = 0.0

    for row in range(n_rows):
        # Each row is an independent parameter vector: restart the sum.
        row_total = 0.0
        for col in range(n_cols):
            row_total += alpha[row, col]

        # digamma of the row sum is shared by the whole row, so evaluate it once.
        psi_row_total = _digamma(row_total)

        for col in range(n_cols):
            out[row, col] = _digamma(alpha[row, col]) - psi_row_total
def digamma(DTYPE_t x):
    """Digamma function for positive floats, using :func:`~gensim._matutils._digamma`.

    Python-callable entry point that forwards its argument to the C-level implementation.

    Parameters
    ----------
    x : float
        Positive value.

    Returns
    -------
    float
        Digamma(x).

    """
    cdef DTYPE_t psi = _digamma(x)
    return psi
@cython.cdivision(True)
cdef inline DTYPE_t _digamma(DTYPE_t x) nogil:
    """Digamma function for positive floats.

    Parameters
    ----------
    x : float
        Positive value. The sign is not checked: every ``x <= 0.000001``, including zero and
        negative input, is routed through the small-argument formula.

    Notes
    -----

    Adapted from:

    * Authors:

      * Original FORTRAN77 version by Jose Bernardo.
      * C version by John Burkardt.

    * Reference: Jose Bernardo, Algorithm AS 103: Psi (Digamma) Function,
      Applied Statistics, Volume 25, Number 3, 1976, pages 315-317.
    * Licensing: This code is distributed under the GNU LGPL license.

    Returns
    -------
    float
        Digamma(x).

    """
    cdef DTYPE_t recurrence_limit = 8.5
    cdef DTYPE_t gamma_const = 0.57721566490153286060
    cdef DTYPE_t inv
    cdef DTYPE_t psi
    cdef DTYPE_t shifted

    # Tiny argument: -gamma - 1/x + (pi**2 / 6) * x.
    if x <= 0.000001:
        psi = - gamma_const - 1.0 / x + 1.6449340668482264365 * x
        return psi

    # Reduce to DIGAMA(X + N): step the argument up by one until it reaches the limit,
    # subtracting 1/argument at every step.
    psi = 0.0
    shifted = x
    while shifted < recurrence_limit:
        psi = psi - 1.0 / shifted
        shifted = shifted + 1.0

    # Use Stirling's (actually de Moivre's) expansion on the shifted argument.
    inv = 1.0 / shifted
    psi = psi + log(shifted) - 0.5 * inv

    # Remaining correction terms are a polynomial in 1/shifted**2, evaluated in nested form.
    inv = inv * inv
    psi = (
        psi
        - inv * (1.0 / 12.0
        - inv * (1.0 / 120.0
        - inv * (1.0 / 252.0
        - inv * (1.0 / 240.0
        - inv * (1.0 / 132.0)))))
    )

    return psi