-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
utils.py
193 lines (149 loc) · 5.46 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python
# encoding: utf-8
"""Module contains common utilities used in automated code tests for Gensim modules.
Attributes:
-----------
module_path : str
Full path to this module directory.
common_texts : list of list of str
Toy dataset.
common_dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
Dictionary of toy dataset.
common_corpus : list of list of (int, int)
Corpus of toy dataset.
Examples:
---------
It's easy to keep objects in temporary folder and reuse'em if needed:
>>> from gensim.models import word2vec
>>> from gensim.test.utils import get_tmpfile, common_texts
>>>
>>> model = word2vec.Word2Vec(common_texts, min_count=1)
>>> temp_path = get_tmpfile('toy_w2v')
>>> model.save(temp_path)
>>>
>>> new_model = word2vec.Word2Vec.load(temp_path)
>>> result = new_model.wv.most_similar("human", topn=1)
Let's print first document in toy dataset and then recreate it using its corpus and dictionary.
>>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
>>> print(common_texts[0])
['human', 'interface', 'computer']
>>> assert common_dictionary.doc2bow(common_texts[0]) == common_corpus[0]
We can find our toy set in test data directory.
>>> from gensim.test.utils import datapath
>>>
>>> with open(datapath("testcorpus.txt")) as f:
... texts = [line.strip().split() for line in f]
>>> print(texts[0])
['computer', 'human', 'interface']
If you don't need to keep temporary objects on disk use :func:`~gensim.test.utils.temporary_file`:
>>> from gensim.test.utils import temporary_file, common_corpus, common_dictionary
>>> from gensim.models import LdaModel
>>>
>>> with temporary_file("temp.txt") as tf:
... lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=3)
... lda.save(tf)
"""
import contextlib
import tempfile
import os
import shutil
from gensim.corpora import Dictionary
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
def datapath(fname):
"""Get full path for file `fname` in test data directory placed in this module directory.
Usually used to place corpus to test_data directory.
Parameters
----------
fname : str
Name of file.
Returns
-------
str
Full path to `fname` in test_data folder.
Example
-------
Let's get path of test GloVe data file and check if it exits.
>>> from gensim.corpora import MmCorpus
>>> from gensim.test.utils import datapath
>>>
>>> corpus = MmCorpus(datapath("testcorpus.mm"))
>>> for document in corpus:
... pass
"""
return os.path.join(module_path, 'test_data', fname)
def get_tmpfile(suffix):
"""Get full path to file `suffix` in temporary folder.
This function doesn't creates file (only generate unique name).
Also, it may return different paths in consecutive calling.
Parameters
----------
suffix : str
Suffix of file.
Returns
-------
str
Path to `suffix` file in temporary folder.
Examples
--------
Using this function we may get path to temporary file and use it, for example, to store temporary model.
>>> from gensim.models import LsiModel
>>> from gensim.test.utils import get_tmpfile, common_dictionary, common_corpus
>>>
>>> tmp_f = get_tmpfile("toy_lsi_model")
>>>
>>> model = LsiModel(common_corpus, id2word=common_dictionary)
>>> model.save(tmp_f)
>>>
>>> loaded_model = LsiModel.load(tmp_f)
"""
return os.path.join(tempfile.gettempdir(), suffix)
@contextlib.contextmanager
def temporary_file(name=""):
"""This context manager creates file `name` in temporary directory and returns its full path.
Temporary directory with included files will deleted at the end of context. Note, it won't create file.
Parameters
----------
name : str
Filename.
Yields
------
str
Path to file `name` in temporary directory.
Examples
--------
This example demonstrates that created temporary directory (and included
files) will deleted at the end of context.
>>> import os
>>> from gensim.test.utils import temporary_file
>>> with temporary_file("temp.txt") as tf, open(tf, 'w') as outfile:
... outfile.write("my extremely useful information")
... print("Is this file exists? {}".format(os.path.exists(tf)))
... print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
Is this file exists? True
Is this folder exists? True
>>>
>>> print("Is this file exists? {}".format(os.path.exists(tf)))
Is this file exists? False
>>> print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
Is this folder exists? False
"""
# note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
tmp = tempfile.mkdtemp()
try:
yield os.path.join(tmp, name)
finally:
shutil.rmtree(tmp, ignore_errors=True)
# set up vars used in testing ("Deerwester" from the web tutorial)
common_texts = [
['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']
]
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]