import unittest

import numpy as np

from gensim.models import word2vec

try:
    from sklearn.datasets import fetch_20newsgroups
except ImportError:
    raise unittest.SkipTest("Test requires sklearn to be installed, which is not available")

try:
    import keras
    from keras.engine import Input
    from keras.models import Model
    from keras.layers.merge import dot
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils.np_utils import to_categorical
    from keras.layers import Dense, Flatten
    from keras.layers import Conv1D, MaxPooling1D
except ImportError:
    raise unittest.SkipTest("Test requires Keras to be installed, which is not available")

from gensim.test.utils import common_texts
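
# These tests target the gensim 3.x and Keras 2.x APIs used throughout this file
# (e.g. `Word2Vec(size=...)`, `wv.syn0`, `keras.layers.merge.dot`). They exercise
# the `Embedding` layer produced by `KeyedVectors.get_keras_embedding`, which by
# default (`train_embeddings=False`) returns the word vectors as frozen weights.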


class TestKerasWord2VecWrapper(unittest.TestCase):
    def setUp(self):
        # small model over the toy corpus for the similarity tests; hs=1 so that syn1 exists
        self.model_cos_sim = word2vec.Word2Vec(common_texts, size=100, min_count=1, hs=1)
        # empty model for the 20NewsGroup test; its vocab is built and trained later
        self.model_twenty_ng = word2vec.Word2Vec(min_count=1)

    def testWord2VecTraining(self):
        """
        Test word2vec training.
        """
        model = self.model_cos_sim
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 100))
        self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 100))
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

    def testEmbeddingLayerCosineSim(self):
        """
        Test Keras 'Embedding' layer returned by 'get_embedding_layer' function for a simple word similarity task.
        """
        keras_w2v_model = self.model_cos_sim
        keras_w2v_model_wv = keras_w2v_model.wv

        embedding_layer = keras_w2v_model_wv.get_keras_embedding()

        input_a = Input(shape=(1,), dtype='int32', name='input_a')
        input_b = Input(shape=(1,), dtype='int32', name='input_b')
        embedding_a = embedding_layer(input_a)
        embedding_b = embedding_layer(input_b)
        # with normalize=True, the dot product of the two embeddings is their cosine similarity
        similarity = dot([embedding_a, embedding_b], axes=2, normalize=True)

        model = Model(inputs=[input_a, input_b], outputs=similarity)
        model.compile(optimizer='sgd', loss='mse')

        word_a = 'graph'
        word_b = 'trees'
        output = model.predict([
            np.asarray([keras_w2v_model.wv.vocab[word_a].index]),
            np.asarray([keras_w2v_model.wv.vocab[word_b].index]),
        ])
        # output is the cosine similarity between the two word vectors
        self.assertTrue(type(output[0][0][0]) == np.float32)  # verify that a float is returned

    def testEmbeddingLayer20NewsGroup(self):
        """
        Test Keras 'Embedding' layer returned by 'get_embedding_layer' function
        for a smaller version of the 20NewsGroup classification problem.
        """
        MAX_SEQUENCE_LENGTH = 1000

        # Prepare text samples and their labels
        texts = []  # list of text samples
        texts_w2v = []  # used to train the word embeddings
        labels = []  # list of label ids

        # restrict the dataset to three categories to keep the test small
        data = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'comp.graphics', 'sci.space'])
        for index in range(len(data.data)):
            label_id = data.target[index]
            file_data = data.data[index]
            i = file_data.find('\n\n')  # skip header
            if i > 0:
                file_data = file_data[i:]
            try:
                curr_str = str(file_data)
                sentence_list = curr_str.split('\n')
                for sentence in sentence_list:
                    sentence = sentence.strip().lower()
                    texts.append(sentence)
                    texts_w2v.append(sentence.split(' '))
                    labels.append(label_id)
            except Exception:
                pass

        # Vectorize the text samples into a 2D integer tensor
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        # word_index = tokenizer.word_index

        data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))
        x_train = data
        y_train = labels

        # prepare the embedding layer using the wrapper
        keras_w2v = self.model_twenty_ng
        keras_w2v.build_vocab(texts_w2v)
        # train on the tokenized sentences, i.e. the same corpus the vocabulary was built from
        keras_w2v.train(texts_w2v, total_examples=keras_w2v.corpus_count, epochs=keras_w2v.iter)
        keras_w2v_wv = keras_w2v.wv
        embedding_layer = keras_w2v_wv.get_keras_embedding()

        # create a 1D convnet to solve our classification task
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(y_train.shape[1], activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
        fit_ret_val = model.fit(x_train, y_train, epochs=1)

        # verify the type of the object returned after training:
        # `fit` returns a `History` instance whose `history` attribute
        # contains all information collected during training
        self.assertTrue(type(fit_ret_val) == keras.callbacks.History)


if __name__ == '__main__':
    unittest.main()