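"""Sentence-similarity experiments with averaged word2vec vectors.

Three test modes, selected via --method:
  first  -- train a word2vec model on the text files under --path and compare
            a few hand-written sentences with each other
  google -- load the pretrained Google News model from --google_path and rank
            one million random sentence pairs from the corpus by similarity
  markov -- generate sentences with per-file markov chains and print the most
            similar corpus line under both the Google and the self-trained model

Written against the pre-1.0 gensim API (Word2Vec.load_word2vec_format,
model.vocab).

Example invocations (corpus folder and model filename are placeholders):
  python word2vec.py --method first --path ./corpus
  python word2vec.py --method google --path ./corpus --google_path GoogleNews-vectors-negative300.bin
  python word2vec.py --method markov --path ./corpus --google_path GoogleNews-vectors-negative300.bin
"""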
import argparse
import os
import random
import sys
import time

import gensim
import numpy as np
import scipy.spatial.distance

# markov_python3 lives in a sibling folder, so extend the import path first
sys.path.insert(0, '../markov/')
import markov_python3

class Sentence(object):
    """Iterate over every file in a directory, yielding one lowercased token list per line."""

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as corpus_file:
                for line in corpus_file:
                    yield line.lower().split()

def process_arguments(args):
    parser = argparse.ArgumentParser(description='word2vec sentence-similarity experiments')
    parser.add_argument('--path', action='store', help='path to a folder containing text files')
    parser.add_argument('--google_path', action='store', help='path to the pretrained Google News model')
    parser.add_argument('--method', action='store', help='the test function to use: first, google or markov')
    return vars(parser.parse_args(args))

def avg_feature_vector(words, model, num_features):
    """Average the vectors of all in-vocabulary words: a simple bag-of-vectors sentence embedding."""
    feature_vec = np.zeros((num_features,), dtype='float64')
    nwords = 0
    for word in words:
        if word in model.vocab:
            nwords += 1
            feature_vec = np.add(feature_vec, model[word])
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec
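
# Sketch of the averaging with made-up toy vectors (not from a real model):
# if model['hi'] == [1.0, 0.0] and model['there'] == [0.0, 1.0], then
# avg_feature_vector(['hi', 'there', 'oov'], model, 2) returns [0.5, 0.5],
# because the out-of-vocabulary token 'oov' is skipped entirely.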

def first_testing(model_selftrained, features):
    """Compare a few hand-written sentences with each other under the self-trained model."""
    sentence_a = 'This attribution is putting the other in a condition of authority and assurance'
    sentence_b = 'condition of a superior joke, that by rise up is to say the promotion of new statuses, new powers.'
    sentence_c = 'Feels strange, of course, being perceived as not human.'
    sentence_d = 'of human expression'
    sentence_e = 'This joke is putting other into a weird condition'
    sentences = [sentence_a, sentence_b, sentence_c, sentence_d, sentence_e]
    # strip punctuation and lowercase before tokenizing
    for index, sentence in enumerate(sentences):
        sentences[index] = sentence.replace('.', '').replace(',', '').lower()
    sentence_a_vec = avg_feature_vector(sentences[0].split(), model=model_selftrained, num_features=features)
    sentence_b_vec = avg_feature_vector(sentences[1].split(), model=model_selftrained, num_features=features)
    sentence_c_vec = avg_feature_vector(sentences[2].split(), model=model_selftrained, num_features=features)
    sentence_d_vec = avg_feature_vector(sentences[3].split(), model=model_selftrained, num_features=features)
    sentence_e_vec = avg_feature_vector(sentences[4].split(), model=model_selftrained, num_features=features)
    # cosine similarity = 1 - cosine distance
    sena_senb_similarity = 1 - scipy.spatial.distance.cosine(sentence_a_vec, sentence_b_vec)
    sena_self_similarity = 1 - scipy.spatial.distance.cosine(sentence_a_vec, sentence_a_vec)
    sena_sene_similarity = 1 - scipy.spatial.distance.cosine(sentence_a_vec, sentence_e_vec)
    sena_send_similarity = 1 - scipy.spatial.distance.cosine(sentence_a_vec, sentence_d_vec)
    print(sena_senb_similarity)
    print(sena_self_similarity)
    print(sena_sene_similarity)
    print(sena_send_similarity)
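
# The second printed value (sentence_a compared with itself) should be 1.0 up
# to floating-point error; the other similarities depend on the trained model.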

def second_training(google_model, path, features):
    """Sample random sentence pairs from the corpus and rank them by similarity under the Google model."""
    lines = []
    for fname in os.listdir(path):
        with open(os.path.join(path, fname)) as corpus_file:
            for line in corpus_file:
                lines.append(line.lower())
    log = []
    print('Collected ' + str(len(lines)) + ' lines.')
    t0 = time.time()
    for _ in range(1000000):
        random_a = random.choice(lines)
        random_b = random.choice(lines)
        random_a_vec = avg_feature_vector(random_a.split(), model=google_model, num_features=features)
        random_b_vec = avg_feature_vector(random_b.split(), model=google_model, num_features=features)
        similarity = 1 - scipy.spatial.distance.cosine(random_a_vec, random_b_vec)
        log.append((random_a, random_b, similarity))
    t1 = time.time()
    print('Calculating all vectors took ' + str(t1 - t0) + 's')
    # sort the pairs by similarity, highest first, and show the top 30
    log.sort(key=lambda entry: entry[2], reverse=True)
    print('Best results:')
    for i in range(30):
        print('Index: ' + str(i))
        print(log[i][0])
        print(log[i][1])
        print('with similarity: ' + str(log[i][2]))

def train_markovs(path, max_markov=30):
    """Train one markov chain per corpus file, keeping only chains built from more than 200 lines."""
    markovs = []
    for fname in os.listdir(path):
        if len(markovs) >= max_markov:
            break
        print('Start training markov from ' + fname)
        markov_chain = markov_python3.Markov(prefix=fname)
        line_count = 0
        with open(os.path.join(path, fname)) as corpus_file:
            for line in corpus_file:
                markov_chain.add_line_to_index(line.lower().split())
                line_count += 1
        print('Done training markov from ' + fname)
        # discard chains trained on files that are too short to be useful
        if line_count > 200:
            markovs.append(markov_chain)
    return markovs
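
# Interface of the local markov_python3 module, as inferred from its use here:
#   Markov(prefix=...)          -- a chain labelled with its source filename
#   .add_line_to_index(tokens)  -- feed one tokenized line into the chain
#   .generate(max_words=n)      -- return a generated sentence as a token list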

def third_testing(path, google_path, features):
    """Generate markov sentences and print their nearest corpus line under both models."""
    markovs = train_markovs(path=path, max_markov=120)
    print('Done training markovs')
    model = gensim.models.Word2Vec.load_word2vec_format(google_path, binary=True)
    print('Done loading Google model')
    model_selftrained = gensim.models.Word2Vec(Sentence(path), min_count=5, size=features, workers=8)
    print('Done training own model')
    _t0 = time.time()
    # precompute a vector for every corpus line under both models
    lines_vectors_google = []
    lines_vectors_own = []
    for fname in os.listdir(path):
        with open(os.path.join(path, fname)) as corpus_file:
            for line in corpus_file:
                line_low = line.lower()
                vector_google = avg_feature_vector(line_low.split(), model=model, num_features=features)
                vector_own = avg_feature_vector(line_low.split(), model=model_selftrained, num_features=features)
                lines_vectors_google.append((line_low, vector_google))
                lines_vectors_own.append((line_low, vector_own))
    _t1 = time.time()
    print('Calculating all vectors on Google/own models for sentences from own corpus done')
    print('That took ' + str(int(_t1 - _t0)) + 's. It was done for ' + str(len(lines_vectors_google)) + ' lines')
    markov_dict = {}
    for markov in markovs:
        generated_sentences = []
        print('----------------------------------------')
        print(markov.prefix)
        for i in range(10):
            sentence = markov.generate(max_words=100)
            sentence_vec_google = avg_feature_vector(' '.join(sentence).lower().split(), model=model, num_features=features)
            sentence_vec_own = avg_feature_vector(' '.join(sentence).lower().split(), model=model_selftrained, num_features=features)
            # linear nearest-neighbour search over all precomputed corpus line vectors
            biggest_similarity_google = 0.0
            biggest_similarity_sentence_google = ''
            biggest_similarity_own = 0.0
            biggest_similarity_sentence_own = ''
            for index_corpus_line in range(len(lines_vectors_google)):
                vec_google = lines_vectors_google[index_corpus_line][1]
                vec_own = lines_vectors_own[index_corpus_line][1]
                similarity_google = 1 - scipy.spatial.distance.cosine(vec_google, sentence_vec_google)
                similarity_own = 1 - scipy.spatial.distance.cosine(vec_own, sentence_vec_own)
                if similarity_google > biggest_similarity_google:
                    biggest_similarity_google = similarity_google
                    biggest_similarity_sentence_google = lines_vectors_google[index_corpus_line][0]
                if similarity_own > biggest_similarity_own:
                    biggest_similarity_own = similarity_own
                    biggest_similarity_sentence_own = lines_vectors_own[index_corpus_line][0]
            print('----------------------------------------')
            print('markov: ' + ' '.join(sentence))
            print('closest via Google: ' + biggest_similarity_sentence_google)
            print('closest via own model: ' + biggest_similarity_sentence_own)
            generated_sentences.append(sentence)
        markov_dict[markov] = generated_sentences
    return markov_dict

if __name__ == '__main__':
    params = process_arguments(sys.argv[1:])
    features = 300
    method = params['method']
    path = params['path']
    google_path = params['google_path']
    if method == 'first':
        sentences = Sentence(path)
        model_selftrained = gensim.models.Word2Vec(sentences, min_count=5, size=features, workers=8)
        first_testing(model_selftrained, features=features)
    elif method == 'google':
        t0 = time.time()
        model = gensim.models.Word2Vec.load_word2vec_format(google_path, binary=True)
        t1 = time.time()
        print('Loading the Google model took ' + str(t1 - t0) + 's')
        second_training(google_model=model, path=path, features=features)
    elif method == 'markov':
        # loads the Google model and trains an own model for comparison
        third_testing(path=path, google_path=google_path, features=features)