/
utils.py
148 lines (120 loc) · 4.34 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Oliver Borchers <borchers@bwl.uni-mannheim.de>
# Copyright (C) 2019 Oliver Borchers
from sklearn.decomposition import TruncatedSVD
from numpy import ndarray, float32 as REAL, ones, vstack, inf as INF, dtype
from numpy.random import choice
from time import time
import logging
from sys import platform
import ctypes
logger = logging.getLogger(__name__)
def set_madvise_for_mmap(return_madvise: bool = False) -> object:
    """Configure the ctypes signature of libc's ``madvise`` function.

    This addresses the memmap issue raised in
    https://github.com/numpy/numpy/issues/13172. Only the platforms
    "linux", "linux2", "aix" and "darwin" are handled; on any other
    platform (e.g. Windows) the function does nothing.

    Parameters
    ----------
    return_madvise : bool
        If True, return the configured madvise function object
        (used by the unittests, see test_utils.py).

    Returns
    -------
    object
        The madvise function object when ``return_madvise`` is True and the
        platform is handled, otherwise None.

    """
    # Pick the C library name for the current platform.
    if platform == "darwin":
        # macOS ships its C library under a different file name
        libc_name = "libc.dylib"
    elif platform in ("linux", "linux2", "aix"):
        libc_name = "libc.so.6"
    else:
        # Nothing to configure on other platforms.
        return None

    madvise = ctypes.CDLL(libc_name).madvise
    # int madvise(void *addr, size_t length, int advice)
    madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
    madvise.restype = ctypes.c_int

    return madvise if return_madvise else None
def compute_principal_components(
    vectors: ndarray, components: int = 1, cache_size_gb: float = 1.0
) -> [ndarray, ndarray]:
    """Compute the first singular values/vectors of a given (sub)matrix.

    If the matrix (measured in float32 elements) does not fit into
    ``cache_size_gb``, the truncated SVD is fitted on a random subset of
    rows that does fit.

    Parameters
    ----------
    vectors : ndarray
        (Sentence) vectors to compute the truncated SVD on, one vector per row
    components : int, optional
        Number of singular values/vectors to compute
    cache_size_gb : float, optional
        Cache size for computing the principal components in GB.
        Must be positive.

    Returns
    -------
    ndarray, ndarray
        Singular values and singular vectors, both cast to float32

    Raises
    ------
    ValueError
        If ``cache_size_gb`` is not positive.

    """
    if cache_size_gb <= 0:
        # A non-positive budget can never be met; the shrink loop below
        # would otherwise spin forever.
        raise ValueError("cache_size_gb must be positive")

    start = time()

    svd = TruncatedSVD(
        n_components=components, n_iter=7, random_state=42, algorithm="randomized"
    )

    num_vectors = len(vectors)
    gb_per_vector = vectors.shape[1] * dtype(REAL).itemsize / 1024 ** 3

    # Shrink the sample in 0.5% steps until it fits into the cache budget.
    sample_size = num_vectors
    while sample_size * gb_per_vector >= cache_size_gb:
        sample_size *= 0.995
    sample_size = int(sample_size)

    if sample_size < num_vectors:
        # Never sample zero rows, even for a tiny cache budget.
        sample_size = max(sample_size, 1)
        logger.info(f"sampling {sample_size} vectors to compute principal components")
        # Draw exactly sample_size distinct row indices so that the fitted
        # sample matches the log message and the cache budget.
        sample_indices = choice(num_vectors, size=sample_size, replace=False)
        svd.fit(vectors[sample_indices])
    else:
        svd.fit(vectors)

    elapsed = time()
    logger.info(
        f"computing {components} principal components took {int(elapsed-start)}s"
    )
    return svd.singular_values_.astype(REAL), svd.components_.astype(REAL)
def remove_principal_components(
    vectors: ndarray,
    svd_res: [ndarray, ndarray],
    weights: ndarray = None,
    inplace: bool = True,
) -> ndarray:
    """Remove the projection onto the given singular vectors from a matrix.

    Parameters
    ----------
    vectors : ndarray
        (Sentence) vectors to remove components from, one vector per row
    svd_res : (ndarray, ndarray)
        Tuple consisting of the singular values and the components to remove
        from the vectors. Only the components (second entry, one component
        per row) are used.
    weights : ndarray, optional
        One weight per component, used to weigh the components which are
        removed from the vectors
    inplace : bool, optional
        If true, removes the components from the vectors inplace
        (memory efficient) and returns None

    Returns
    -------
    ndarray or None
        A new array holding the vectors with the (weighted) components
        removed if ``inplace`` is False, otherwise None (``vectors`` is
        modified in place).

    """
    components = svd_res[1].astype(REAL)

    start = time()

    if weights is None:
        w_comp = components
    else:
        # Scale every component (row) by its weight.
        w_comp = components * (weights[:, None].astype(REAL))

    # Projection of every vector onto the span of the (weighted) components.
    # For a single component this equals the broadcast product
    # vectors.dot(w_comp.T) * w_comp.
    projection = vectors.dot(w_comp.transpose()).dot(w_comp)

    output = None
    if inplace:
        vectors -= projection
    else:
        # Return the cleaned vectors, not the projection itself, so that
        # both modes remove the components.
        output = vectors - projection

    elapsed = time()
    logger.info(
        f"removing {len(components)} principal components took {int(elapsed-start)}s"
    )
    if not inplace:
        return output