-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess_text_vectors.py
34 lines (28 loc) · 1.02 KB
/
preprocess_text_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import numpy as np
# Script parameters: where the text vectors are read from and the base name
# of the converted output file.
dest_file = 'fasttext.en'  # output base name; '.npy' is appended on save
ROOT_PATH = '/media/HDD_2TB/DATASETS/'
# Directory that holds the source vectors and also receives the output.
base_path = ''.join((ROOT_PATH, 'cnn_polarity/DATA/fasttext_embeddings/'))
# Text vector file that gets converted.
vectors_path = ''.join((base_path, 'wiki.en.vec'))
def glove2npy(glove_path, base_path_save, dest_file):
    """Convert a text word-vector file into a pickled ``.npy`` dictionary.

    Every non-empty line of ``glove_path`` is read as a token followed by its
    whitespace-separated float components.  The tokens and their vectors are
    collected into a ``dict`` mapping token -> ``float32`` NumPy array, which
    is then written with ``np.save`` to ``<base_path_save>/<dest_file>.npy``.

    The file is streamed line by line, so only the resulting dictionary (not
    the raw text) is held in memory.

    Args:
        glove_path: Path of the text vector file to read.
        base_path_save: Directory in which the output file is created.
        dest_file: Output file name without the ``.npy`` extension.

    Raises:
        IOError / OSError: If ``glove_path`` cannot be read or the output
            cannot be written.
        UnicodeDecodeError: If a token is not valid UTF-8.
        ValueError: If a vector component cannot be parsed as a float.

    Note:
        The saved object is a pickled dict wrapped in a 0-d object array;
        read it back with ``np.load(path, allow_pickle=True).item()``.
    """
    # Local imports: the rest of the file only imports numpy.
    import os
    import sys

    out_path = os.path.join(base_path_save, dest_file + '.npy')
    # Progress is reported by bytes consumed, which avoids a counting pass
    # over (or a full in-memory copy of) a potentially multi-GB file.
    total_bytes = os.path.getsize(glove_path)
    vecs_dict = dict()

    print("Loading vectors from %s" % glove_path)
    bytes_seen = 0
    n_vecs = 0
    # Binary mode: bytes.split() splits on ASCII whitespace only, so tokens
    # containing Unicode spaces (e.g. U+00A0) are not broken apart, and the
    # result does not depend on the platform's default text encoding.
    with open(glove_path, 'rb') as vec_file:
        for line_no, raw_line in enumerate(vec_file):
            bytes_seen += len(raw_line)
            fields = raw_line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if (line_no == 0 and len(fields) == 2
                    and fields[0].isdigit() and fields[1].isdigit()):
                # fastText .vec files start with a "<n_words> <dim>" header;
                # it is metadata, not a word vector.
                continue
            word = fields[0].decode('utf-8')
            vecs_dict[word] = np.array([float(x) for x in fields[1:]],
                                       dtype='float32')
            n_vecs += 1
            if n_vecs % 1000 == 0:
                # '\r' keeps the progress report on a single console line.
                sys.stdout.write("Processed %d vectors (%.1f %%)\r"
                                 % (n_vecs, 100.0 * bytes_seen / total_bytes))
                sys.stdout.flush()
    # Terminate the in-place progress line.
    sys.stdout.write("\n")
    print("Found %d vectors in %s" % (n_vecs, glove_path))

    # Store dict
    print("Saving word vectors in %s" % out_path)
    np.save(out_path, vecs_dict)
    sys.stdout.write("\n")
# Run the conversion with the module-level parameters when executed as a script.
if __name__ == "__main__":
    glove2npy(
        glove_path=vectors_path,
        base_path_save=base_path,
        dest_file=dest_file,
    )