-
-
Notifications
You must be signed in to change notification settings - Fork 111
/
embedding_idf.py
124 lines (104 loc) · 3.93 KB
/
embedding_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Copyright 2019 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import log
import numpy as np
import logging
try:
import tensorflow as tf
except ImportError:
raise ImportError("Install tensorflow package (`pip install tensorflow`)"
" to use 'embedding-idf' model.")
try:
tf.logging.set_verbosity(tf.logging.ERROR)
except AttributeError:
logging.getLogger("tensorflow").setLevel(logging.ERROR)
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from asreview.feature_extraction.embedding_lstm import load_embedding
from asreview.feature_extraction.base import BaseFeatureExtraction
from asreview.utils import get_random_state
class EmbeddingIdf(BaseFeatureExtraction):
    """Class for Embedding-Idf model.

    This model combines the word vectors of all the words in a text into a
    single feature vector for that text. Each word vector is weighted by the
    number of times the word occurs in the text and by the inverse document
    frequency of the word; the weighted vectors are summed and the result is
    scaled to unit length.
    """
    name = "embedding-idf"

    def __init__(self, *args, embedding_fp=None, random_state=None, **kwargs):
        """Initialize the Embedding-Idf model.

        Arguments
        ---------
        embedding_fp: str
            Path to embedding. The embedding is only loaded on the first
            call to `transform`.
        random_state:
            Forwarded to `get_random_state`. The resulting random state is
            used to generate a feature vector for texts in which no word
            occurs in the embedding.
        """
        super(EmbeddingIdf, self).__init__(*args, **kwargs)
        self.embedding_fp = embedding_fp
        # Filled lazily by transform(); stays None until then.
        self.embedding = None
        self._random_state = get_random_state(random_state)

    def transform(self, texts):
        """Compute one feature vector per text.

        Arguments
        ---------
        texts: iterable of str
            Texts to convert into feature vectors.

        Returns
        -------
        numpy.ndarray
            Feature matrix with one row per text.

        Raises
        ------
        ValueError
            If no embedding has been loaded yet and `embedding_fp` is None.
        """
        # Load the embedding once and keep it for subsequent calls.
        if self.embedding is None:
            if self.embedding_fp is None:
                raise ValueError(
                    "Error: need embedding to train Embedding-Idf model.")
            self.embedding = load_embedding(self.embedding_fp, n_jobs=-1)

        text_counts = _get_freq_dict(texts)
        idf = _get_idf(text_counts)
        X = _get_X_from_dict(text_counts, idf, self.embedding,
                             self._random_state)
        return X
def _get_freq_dict(all_text):
    """Count how often each word occurs in each text.

    Arguments
    ---------
    all_text: iterable of str
        Texts to tokenize; each one is split into words with
        `text_to_word_sequence` using its default settings.

    Returns
    -------
    list of dict
        One dictionary per text, in input order, mapping each word to the
        number of times it occurs in that text.
    """
    freq_dicts = []
    for document in all_text:
        word_counts = {}
        for token in text_to_word_sequence(document):
            # First sighting starts from 0, later ones increment.
            word_counts[token] = word_counts.get(token, 0) + 1
        freq_dicts.append(word_counts)
    return freq_dicts
def _get_idf(text_dicts):
all_count = {}
for text in text_dicts:
for word in text:
if word in all_count:
all_count[word] += 1
else:
all_count[word] = 1
idf = {}
for word in all_count:
idf[word] = log(len(text_dicts)/all_count[word])
return idf
def _get_X_from_dict(text_dicts, idf, embedding, random_state):
n_vec = len(embedding[list(embedding.keys())[0]])
X = np.zeros((len(text_dicts), n_vec))
for i, text in enumerate(text_dicts):
text_vec = None
for word in text:
cur_count = text[word]
cur_idf = idf[word]
cur_vec = embedding.get(word, None)
if cur_vec is None:
continue
if text_vec is None:
text_vec = cur_vec*cur_idf*cur_count
else:
text_vec += cur_vec*cur_idf*cur_count
if text_vec is None:
text_vec = random_state.random(n_vec)
text_norm = np.linalg.norm(text_vec)
if abs(text_norm) > 1e-7:
text_vec /= np.linalg.norm(text_vec)
X[i] = text_vec
return X