-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
probability_estimation.py
263 lines (227 loc) · 9.65 KB
/
probability_estimation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains functions to perform segmentation on a list of topics."""
import itertools
import logging
from gensim.topic_coherence.text_analysis import (
CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator,
WordVectorsAccumulator,
)
logger = logging.getLogger(__name__)
def p_boolean_document(corpus, segmented_topics):
    """Run the boolean document probability estimation over `corpus`.

    Boolean document estimation takes the probability of a single word to be the number of documents
    in which the word occurs divided by the total number of documents.

    Parameters
    ----------
    corpus : iterable of list of (int, int)
        The corpus of documents.
    segmented_topics : list of (int, int)
        Segmented topics; each element of a (word_id_set1, word_id_set2) tuple is either a single integer
        or a `numpy.ndarray` of integers.

    Returns
    -------
    :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator`
        Word occurrence accumulator instance that can be used to look up token frequencies and
        co-occurrence frequencies.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import probability_estimation
        >>> from gensim.corpora.hashdictionary import HashDictionary
        >>>
        >>> texts = [
        ...     ['human', 'interface', 'computer'],
        ...     ['eps', 'user', 'interface', 'system'],
        ...     ['system', 'human', 'system', 'eps'],
        ...     ['user', 'response', 'time'],
        ...     ['trees'],
        ...     ['graph', 'trees']
        ... ]
        >>> dictionary = HashDictionary(texts)
        >>> w2id = dictionary.token2id
        >>>
        >>> # create segmented_topics
        >>> segmented_topics = [
        ...     [
        ...         (w2id['system'], w2id['graph']),
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['computer'], w2id['system'])
        ...     ],
        ...     [
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['user'], w2id['graph']),
        ...         (w2id['user'], w2id['computer'])
        ...     ]
        ... ]
        >>> # create corpus
        >>> corpus = [dictionary.doc2bow(text) for text in texts]
        >>>
        >>> result = probability_estimation.p_boolean_document(corpus, segmented_topics)
        >>> result.index_to_dict()
        {10608: {0}, 12736: {1, 3}, 18451: {5}, 5798: {1, 2}}

    """
    # Only the ids that actually appear in the segmented topics are tracked.
    relevant_ids = unique_ids_from_segments(segmented_topics)
    document_accumulator = CorpusAccumulator(relevant_ids)
    return document_accumulator.accumulate(corpus)
def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
    """Run the boolean sliding window probability estimation over `texts`.

    Parameters
    ----------
    texts : iterable of iterable of str
        Input text.
    segmented_topics : list of (int, int)
        Segmented topics; each element of a (word_id_set1, word_id_set2) tuple is either a single integer
        or a `numpy.ndarray` of integers.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
        Gensim dictionary mapping of the tokens and ids.
    window_size : int
        Size of the sliding window; 110 was found to be the ideal size for large corpora.
    processes : int, optional
        Number of processes handed to
        :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator`.
        Values of 1 or less select the non-parallel accumulator instead.

    Notes
    -----
    Boolean sliding window determines word counts using a sliding window. The window
    moves over the documents one word token per step. Each step defines a new virtual
    document by copying the window content. Boolean document is applied to these virtual
    documents to compute word probabilities.

    Returns
    -------
    :class:`~gensim.topic_coherence.text_analysis.WordOccurrenceAccumulator`
        if `processes` <= 1 OR
    :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator`
        otherwise. This is a word occurrence accumulator instance that can be used to look up
        token frequencies and co-occurrence frequencies.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import probability_estimation
        >>> from gensim.corpora.hashdictionary import HashDictionary
        >>>
        >>> texts = [
        ...     ['human', 'interface', 'computer'],
        ...     ['eps', 'user', 'interface', 'system'],
        ...     ['system', 'human', 'system', 'eps'],
        ...     ['user', 'response', 'time'],
        ...     ['trees'],
        ...     ['graph', 'trees']
        ... ]
        >>> dictionary = HashDictionary(texts)
        >>> w2id = dictionary.token2id
        >>>
        >>> # create segmented_topics
        >>> segmented_topics = [
        ...     [
        ...         (w2id['system'], w2id['graph']),
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['computer'], w2id['system'])
        ...     ],
        ...     [
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['user'], w2id['graph']),
        ...         (w2id['user'], w2id['computer'])
        ...     ]
        ... ]
        >>> accumulator = probability_estimation.p_boolean_sliding_window(texts, segmented_topics, dictionary, 2)
        >>>
        >>> (accumulator[w2id['computer']], accumulator[w2id['user']], accumulator[w2id['system']])
        (1, 3, 4)

    """
    relevant_ids = unique_ids_from_segments(segmented_topics)

    # A single process (or any value <= 1) does not need the parallel machinery.
    single_process = processes <= 1
    if single_process:
        window_accumulator = WordOccurrenceAccumulator(relevant_ids, dictionary)
    else:
        window_accumulator = ParallelWordOccurrenceAccumulator(processes, relevant_ids, dictionary)

    logger.info("using %s to estimate probabilities from sliding windows", window_accumulator)
    return window_accumulator.accumulate(texts, window_size)
def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes=1, model=None):
    """Build a word-vectors accumulator for `texts`, training word2vec when `model` is None.

    Parameters
    ----------
    texts : iterable of iterable of str
        Input text.
    segmented_topics : iterable of iterable
        Output from the segmentation of topics; its word ids are collected with
        :func:`unique_ids_from_segments`.
    dictionary : :class:`~gensim.corpora.dictionary`
        Gensim dictionary mapping of the tokens and ids.
    window_size : int, optional
        Size of the sliding window. Passed both to the accumulator constructor (as `window`)
        and to its `accumulate` call.
    processes : int, optional
        Number of processes to use (passed to the accumulator as `workers`).
    model : :class:`~gensim.models.word2vec.Word2Vec` or :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
        If None, a new Word2Vec model is trained on the given text corpus. Otherwise,
        it should be pre-trained Word2Vec context vectors.

    Returns
    -------
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`
        Text accumulator with trained context vectors.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import probability_estimation
        >>> from gensim.corpora.hashdictionary import HashDictionary
        >>> from gensim.models import word2vec
        >>>
        >>> texts = [
        ...     ['human', 'interface', 'computer'],
        ...     ['eps', 'user', 'interface', 'system'],
        ...     ['system', 'human', 'system', 'eps'],
        ...     ['user', 'response', 'time'],
        ...     ['trees'],
        ...     ['graph', 'trees']
        ... ]
        >>> dictionary = HashDictionary(texts)
        >>> w2id = dictionary.token2id
        >>>
        >>> # create segmented_topics
        >>> segmented_topics = [
        ...     [
        ...         (w2id['system'], w2id['graph']),
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['computer'], w2id['system'])
        ...     ],
        ...     [
        ...         (w2id['computer'], w2id['graph']),
        ...         (w2id['user'], w2id['graph']),
        ...         (w2id['user'], w2id['computer'])
        ...     ]
        ... ]
        >>> sentences = [
        ...     ['human', 'interface', 'computer'],
        ...     ['survey', 'user', 'computer', 'system', 'response', 'time']
        ... ]
        >>> model = word2vec.Word2Vec(sentences, vector_size=100, min_count=1)
        >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model)

    """
    relevant_ids = unique_ids_from_segments(segmented_topics)

    # Keyword options forwarded to the accumulator alongside the positional arguments.
    accumulator_options = {'window': window_size, 'workers': processes}
    vector_accumulator = WordVectorsAccumulator(relevant_ids, dictionary, model, **accumulator_options)

    return vector_accumulator.accumulate(texts, window_size)
def unique_ids_from_segments(segmented_topics):
    """Collect every distinct id that occurs anywhere in a list of segmented topics.

    Parameters
    ----------
    segmented_topics : list of (int, int)
        Segmented topics; each element of a (word_id_set1, word_id_set2) tuple is either a single integer
        or a `numpy.ndarray` of integers.

    Returns
    -------
    set
        Set of unique ids across all topic segments.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import probability_estimation
        >>>
        >>> segmentation = [[(1, 2)]]
        >>> probability_estimation.unique_ids_from_segments(segmentation)
        {1, 2}

    """
    # Flatten two levels lazily: topics -> segments -> segment members.
    segments = itertools.chain.from_iterable(segmented_topics)
    members = itertools.chain.from_iterable(segments)

    collected_ids = set()
    for member in members:
        if not hasattr(member, '__iter__'):
            # A plain scalar id.
            collected_ids.add(member)
            continue
        # An iterable of ids (e.g. an array): merge all of its elements.
        collected_ids.update(member)
    return collected_ids