In [21]:
import numpy as np

In [22]:
def get_glove_vocab(filename):
    """Collect the set of tokens found in a GloVe-style text file.

    Every non-blank line is expected to look like ``<token> <v1> <v2> ...``
    with single-space separators; only the first field is read.

    Args:
        filename: path to the glove vectors (xxx.txt)

    Returns:
        vocab_set: set() of strings, one entry per distinct token. Blank
            lines contribute nothing (in particular no empty-string token).
    """
    print("Building vocab...")
    vocab_set = set()
    with open(filename) as f:
        for line in f:
            # Only the token is needed, so split once instead of breaking the
            # whole line (token + every vector component) into fields.
            word = line.strip().split(' ', 1)[0]
            if not word:
                # Blank / whitespace-only line: there is no token on it.
                continue
            vocab_set.add(word)
    print("- done. {} tokens".format(len(vocab_set)))
    return vocab_set

def write_vocab(vocab, filename):
    """Writes a vocab to a file, one word per line.

    Words are separated by a single newline; no newline is written after the
    last word, so the file has exactly as many lines as there are words.

    Args:
        vocab: iterable that yields words. Any iterable works (set, list,
            generator, ...); it is consumed exactly once and never needs to
            support len().
        filename: path to vocab file (created, or overwritten if it exists)

    Returns:
        None. The words are written to `filename`.

    """
    print("Writing vocab...")
    count = 0
    with open(filename, "w") as f:
        # Stream the words and count them on the fly; the separator goes
        # *before* every word but the first, which avoids needing the total
        # length up front to detect the last element.
        for count, word in enumerate(vocab, 1):
            if count > 1:
                f.write("\n")
            f.write(str(word))
    print("- done. {} tokens".format(count))


def load_vocab_and_return_word_to_id_dict(filename):
    """Build a word -> line-index mapping from a vocab file.

    Args:
        filename: (string) path to a text file holding one word per line.
            Surrounding whitespace of each line is stripped.

    Returns:
        dict mapping each word to the 0-based number of the line it sits on.
        If a word occurs on several lines, the last occurrence wins.

    Raises:
        MyIOError: when the file cannot be opened or read.
            NOTE(review): MyIOError is not defined in the visible notebook
            cells - confirm it is imported/defined before this can fire,
            otherwise the error path ends in a NameError.

    """
    try:
        with open(filename) as vocab_file:
            word_to_id = {
                raw_line.strip(): line_no
                for line_no, raw_line in enumerate(vocab_file)
            }
    except IOError:
        raise MyIOError(filename)
    return word_to_id

def export_glove_vectors(vocab, glove_filename, output_filename, dim):
    """Saves the glove vectors of the vocab words in a compressed .npz file

    Args:
        vocab: dictionary vocab[word] = index (row of that word in the matrix)
        glove_filename: a path to a glove file; each line is
            "<word> <v1> ... <v_dim>" separated by single spaces
        output_filename: a path where to store the matrix; it is written with
            np.savez_compressed under the key "embeddings"
        dim: (int) dimension of embeddings

    Note:
        - Unknown words have representations with values of zero
        - Lines whose word is not in vocab are skipped before their numbers
          are parsed, so they cost no float conversion and cannot abort the
          export even if they are malformed (e.g. a token containing spaces)
    """
    # One row per vocab entry; rows never found in the glove file stay zero.
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            fields = line.strip().split(' ')
            word = fields[0]
            if word not in vocab:
                continue
            # Parse the vector only for words that are actually kept.
            vector = [float(x) for x in fields[1:]]
            # asarray avoids an extra copy where possible; the row assignment
            # raises ValueError if the vector length differs from dim.
            embeddings[vocab[word]] = np.asarray(vector)

    # Store the matrix in a single compressed .npz archive.
    np.savez_compressed(output_filename, embeddings=embeddings)

In [5]:
# Collect every distinct token of the pretrained embedding file (the recorded
# run below reports 6629250 of them).
vocab_glove = get_glove_vocab('/data/ID_largewv_300_2.txt')


Building vocab...
- done. 6629250 tokens


In [44]:
# Words of interest; here a single upper-case probe token.
vocab_words = {'A'}

In [45]:
# Keep only the words that also have a pretrained vector. Set intersection is
# an exact (case-sensitive) string match.
vocab = vocab_words & vocab_glove

In [46]:
# Display the intersection; the recorded result is the empty set, i.e. 'A' is
# not a token of the embedding file.
vocab

set()

In [11]:
# Persist the vocabulary, one word per line.
# NOTE(review): the recorded output reports 6629250 tokens, which is the size
# of vocab_glove, whereas the cells above leave `vocab` empty. The execution
# counts are out of order, so confirm which set is meant to be written before
# relying on Restart & Run All.
filename_words_voc = "../models/data/words_vocab.txt"
write_vocab(vocab, filename_words_voc)

Writing vocab...
- done. 6629250 tokens


In [16]:
# Read the vocab file back as {word: line index}; export_glove_vectors uses
# that index as the word's row in the embedding matrix.
word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)

In [28]:
import time

dim_word = 300
# NOTE(review): this rebinds filename_words_voc from the vocab .txt path to
# the embeddings .npz path. Later cells read the .npz through this name, so
# the vocab-loading cell must not be re-run after this one.
filename_words_voc = "../models/data/word2vec.npz"
# perf_counter is a monotonic clock, so the measured duration is not affected
# by system clock adjustments during this long-running export.
s = time.perf_counter()
export_glove_vectors(word_to_id_dict, glove_filename='/data/ID_largewv_300_2.txt',
                     output_filename=filename_words_voc, dim=dim_word)
e = time.perf_counter()
print(e - s)

1401.0644762516022


In [29]:
# Export duration in minutes, derived from the timestamps taken around the
# export call rather than from a number pasted out of its printed output.
(e - s) / 60.0

23.351074604193368

In [31]:
# NOTE(review): this displays the entire token set (6629250 entries according
# to the load cell's output); consider showing only a small sample instead.
vocab_glove

{'flm04419',
 'btvo',
 'f25mo',
 'ikiweecorner',
 'vs20623',
 'strongkangenwaterjogja',
 'beningwarna',
 'taspalembangimport',
 'ss213428',
 'strollerm',
 'au7h',
 'nyalala',
 'afifahmom_shop',
 'neuromarketing',
 'carseatmothercare',
 'glm1124',
 'milebaby',
 'pashmm',
 'xc9399',
 'bt10123',
 'pmxg',
 'rockbors',
 '19363id',
 'bogaerts',
 'custometoll',
 'batldel33',
 'melawaninfeksi',
 'es3270',
 'vbno',
 'midrohdgrey',
 'lc41661',
 'cip830',
 'dd142',
 'vs10173',
 'zzaz',
 '32651bk',
 'pu',
 'chrisye',
 'mhr016uob',
 'metalld46s4',
 'owda',
 'effron',
 'jualclutchcantik',
 'bm10606',
 'ke95498',
 'sunshime',
 'be1836',
 't5babc',
 'taskerenbatam',
 'flx623',
 'chokerkoreafashion',
 '0gga',
 'capasitaspowerbank',
 'sur791',
 '6kyt',
 'solarchargercontroller',
 'segeralah',
 'qvtp',
 'spainia',
 'mobilephonemicrophone',
 '00d00j',
 'cr1025',
 '54866',
 'l3nu',
 'sf65',
 'ranselsekolah',
 'alpuat',
 '305001846',
 '3qt2',
 'cs736',
 '002759',
 'collarharness',
 'sepisifikasi',
 'w3q3',


In [32]:
# Special symbols.
# NOTE(review): UNK and NUM are not used anywhere in the visible cells;
# presumably they are the placeholders for unknown words and for numbers, and
# NONE the "outside"/no-entity tag label - confirm against the consuming code.
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [35]:
# Check whether the "O" symbol is itself a token of the embedding file (the
# recorded result is False).
NONE in vocab_glove

False

In [38]:
def get_glove_vectors(filename):
    """Read the embedding matrix back from an .npz archive.

    Args:
        filename: path to the npz file; the matrix is expected under the
            "embeddings" key

    Returns:
        matrix of embeddings (np array)

    Raises:
        MyIOError: when the archive cannot be opened or read.
            NOTE(review): MyIOError is not defined in the visible notebook
            cells - confirm it is imported/defined before this can fire,
            otherwise the error path ends in a NameError.

    """
    try:
        # The archive is closed on leaving the with-block; the array pulled
        # out of it is already loaded and remains usable afterwards.
        with np.load(filename) as archive:
            matrix = archive["embeddings"]
    except IOError:
        raise MyIOError(filename)
    return matrix


# Toggle: load the exported pretrained matrix, or leave embeddings unset.
use_pretrained = True
if use_pretrained:
    # At this point filename_words_voc holds the .npz path assigned in the
    # export cell, not the vocab .txt path.
    embeddings = get_glove_vectors(filename_words_voc)
else:
    embeddings = None

In [40]:
# Sanity check: the export allocates one row per vocab word and `dim` columns,
# so the shape should be (number of vocab words, dim_word).
embeddings.shape

(6629250, 300)

In [43]:
# List the arrays stored in the archive. The context manager closes the
# underlying file handle, which a bare np.load(...) would leave open.
with np.load(filename_words_voc) as archive:
    archive_keys = archive.files
archive_keys

['embeddings']