In [1]:
import re
import os
import numpy as np

from gensim.models import Word2Vec, KeyedVectors

In [7]:
# Number of records (articles) used to train the embedding
n_train = 500_000

# Input data: one record per line, "text \t label", preceded by a header line
DATA_FILE = os.path.join("data", "notex_all.csv")

# Output artefacts
GENSIM_WORD_VEC_FILE = os.path.join("gensim", f"w2v_{n_train//1000}k_notex_records.kv")
EMBEDD_FILE = os.path.join("gensim", "embedd_weights.npy")
X_FILE = os.path.join("gensim", "gensim_embedd_X.txt")
LABELS_FILE = os.path.join("gensim", "labels.txt")

# A "word" is a word character followed by one or more word characters,
# apostrophes or backticks (so single-character words are not matched).
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
WORD_PATTERN = re.compile(r"\w[\w'`]+")

# Dimensionality of the word vectors
EMBEDD_DIM = 300

In [11]:
# A python iterator for gensim's Word2Vec class
# It spits out elements that are lists of words
# Each list corresponds to one arXiv article
# e.g. next(texts_iter) = ['hello', 'world']

class texts_iter():
    """Restartable iterator over the records of a tab-separated text file.

    Every line of ``filename`` is expected to look like ``text \\t label``.
    Each ``next()`` call reads one line, lower-cases its text field and
    returns the list of words found in it, e.g. ``['hello', 'world']``.

    The files are opened by the constructor and closed automatically when
    the iterator is exhausted (end of file or ``nrows`` records read).
    Calling ``iter()`` on an exhausted instance reopens the files and starts
    again from the first record, so the same object can be traversed several
    times (gensim's Word2Vec makes one pass to build the vocabulary and
    further passes to train).

    Parameters
    ----------
    filename : str
        Path of the input file.
    nrows : int or float, optional
        Maximum number of records returned per pass (default: no limit).
    skiprows : int, optional
        Number of leading lines to skip on every pass (default: 1, a header).
    put_labels_aside : bool, optional
        If true, the label (second field) of every record that is read is
        written to ``labels_file``, one label per line.
    labels_file : str, optional
        Path of the label output file; required when ``put_labels_aside``
        is true. It is opened in write mode, i.e. truncated, on every pass.
    word_pattern : compiled regex, optional
        Pattern whose ``findall`` extracts the words. Defaults to the
        module-level ``WORD_PATTERN``.

    Raises
    ------
    IndexError
        From ``next()`` when ``put_labels_aside`` is true and a line
        contains no tab character.
    """

    def __init__(self, filename, nrows=float('inf'), skiprows=1,
                 put_labels_aside=False, labels_file=None, word_pattern=None):

        self.filename = filename
        self.nrows = nrows
        self.skiprows = skiprows
        self.put_labels_aside = put_labels_aside
        self.labels_path = labels_file
        self.word_pattern = word_pattern

        self.idx = 0
        self.records = None
        self.labels_file = None
        self._closed = True

        self._open()

    def _open(self):
        """(Re)open the files and move past the first ``skiprows`` lines."""
        self.idx = 0
        self.records = open(self.filename, "r")
        self._closed = False
        try:
            if self.put_labels_aside:
                self.labels_file = open(self.labels_path, "w")

            # skip rows, default = 1-line header; a file shorter than
            # `skiprows` simply yields no records
            for _ in range(self.skiprows):
                if next(self.records, None) is None:
                    break
        except BaseException:
            # do not leak the handles opened so far
            self.finish()
            raise

    def finish(self):
        """Close the underlying files; safe to call more than once."""
        if self.records is not None:
            self.records.close()
        if self.labels_file is not None:
            self.labels_file.close()
        self._closed = True

    # the defining method, returns a list
    def __next__(self):

        if self._closed:
            raise StopIteration

        # dont read beyond 'nrows'
        if self.idx >= self.nrows:
            self.finish()
            raise StopIteration

        try:
            line = next(self.records)
        except BaseException:
            # StopIteration at end of file, or a genuine read error:
            # release the files either way and let the exception propagate
            # instead of disguising every failure as "end of data"
            self.finish()
            raise
        self.idx += 1

        # record == text \t label
        record = line.split('\t')

        if self.put_labels_aside:
            # exactly one label per line, whether or not the field
            # carried the line's trailing newline
            self.labels_file.write(record[1].rstrip('\n') + '\n')

        pattern = WORD_PATTERN if self.word_pattern is None else self.word_pattern
        return pattern.findall(record[0].lower())

    def __iter__(self):
        # an exhausted (or explicitly finished) iterator starts over;
        # a fresh or partially consumed one continues where it is
        if self._closed:
            self._open()
        return self

In [4]:
# apply Word2Vec

# Train on the first `n_train` records and keep only the trained
# model's `.wv` attribute
word2vec = Word2Vec(
    texts_iter(DATA_FILE, nrows=n_train),
    size=EMBEDD_DIM,
    min_count=1,
    sorted_vocab=1,
).wv

# word2vec.save(GENSIM_WORD_VEC_FILE)

In [127]:
# # load the gensim's "wv" object from the file
# word2vecKV = KeyedVectors.load(GENSIM_WORD_VEC_FILE, mmap='r')

In [5]:
# The resulting embedding and the mapping from words to indices
# are contained in this word2vec object
# Note that one needs the map along with the embedding to properly
# tokenize the texts

# Take the trained word vectors as a NumPy array and report its shape
embedding = np.asarray(word2vec.vectors)
print(embedding.shape)

def word_to_idx(word):
    """Map a word to its integer token.

    Returns the word's index in the ``word2vec`` vocabulary shifted by one,
    or 0 when the word is not in the vocabulary.
    """
    try:
        return word2vec.vocab[word].index + 1
    except KeyError:
        # Out-of-vocabulary word. Only the lookup miss is handled here:
        # any other failure (e.g. `word2vec` not defined, or lacking a
        # `vocab` attribute) must surface rather than turn every word into 0.
        return 0

(299304, 300)


In [9]:
# Sanity check: a word from the training texts gets a positive token,
# a word that is not in the vocabulary falls back to 0
word_to_idx('the'), word_to_idx('świerszcz')

(1, 0)

In [10]:
# save the embedding weights to file
# NOTE(review): word_to_idx() returns the vocabulary index + 1 and uses 0
# for unknown words, while this matrix is saved as-is (no extra row for
# token 0) -- presumably the consumer prepends one; confirm before using
# the tokens as direct row indices.
np.save(EMBEDD_FILE, embedding)

In [19]:
# create new files:
# one with the texts converted to sequences of integer tokens
# and the second with unprocessed labels,
# both using the texts_iter class

with open(X_FILE, 'w') as x_file:

    # generate word-lists using the same iterator as for
    # creating the embedding; labels are written to LABELS_FILE
    # as a side effect of reading each record
    word_lists = texts_iter(DATA_FILE,
                            put_labels_aside=True, labels_file=LABELS_FILE)

    try:
        for list_ in word_lists:

            # one line per record: space-separated integer tokens
            sequence = [str(word_to_idx(word)) for word in list_]
            x_file.write(" ".join(sequence) + '\n')
    finally:
        # texts_iter only closes its files once it has been exhausted, so
        # release them explicitly in case the loop stops early on an error
        word_lists.finish()