In [1]:
import re
import os
import numpy as np
from random import randint

from gensim.models import Word2Vec

# Summary
### Create word embedding using gensim
* The embedding is trained using the subset of first 500k records (regarded as train-data throughout).
* The weights are saved to a file as a raw matrix
* The whole of data (including the part unseen by embedding) is processed and split into X, Y files
    * X contains what were originally texts (lists of words), now zero-padded sequences of integers that correspond to row indices of the embedding.
    * Y contains a sequence of integer-encoded labels (0 to 5, one per category).
* The zeroth row of the embedding is filled with zeros, and unknown words are encoded with 0.
* The length of the zero-padded sequences is fixed at 5% above the longest text seen in the train data; longer texts are truncated to that length.

In [2]:
# global settings

# number of leading records used to train the embedding
n_train = 500_000
DATA_FILE = os.path.join("data", "notex_all.csv")
# A word is one word-character followed by at least one more word-character,
# apostrophe or backtick, so single-character tokens are never matched.
# Raw string: '\w' in a plain literal is an invalid escape sequence.
WORD_PATTERN = re.compile(r"\w[\w'`]+")
EMBEDD_FILE = os.path.join("gensim", "embedd_weights.npy")
EMBEDD_DIM = 300
X_FILE = os.path.join("gensim", "embedded_X.txt")
X_FILE_BIN = os.path.join("gensim", "embedded_X.npy")
Y_FILE = os.path.join("gensim", "encoded_Y.txt")
Y_FILE_BIN = os.path.join("gensim", "encoded_Y.npy")

In [3]:
# A python iterator for gensim's Word2Vec class
# It spits out elements that are lists of words
# Each list corresponds to one arXiv article
# e.g. next(texts_iter) = ['hello', 'world']

class texts_iter():
    """Restartable iterator over the rows of a tab-separated text file.

    Every data row is expected to hold exactly two tab-separated fields,
    ``text<TAB>label``. Each ``next()`` call lower-cases the text, extracts
    its words with the word pattern and returns them as a list, optionally
    preceded by the stripped label.

    gensim's ``Word2Vec`` walks over its corpus several times (one pass to
    build the vocabulary, then one pass per training epoch), so the corpus
    object has to be re-iterable. Once a pass has finished and the file has
    been closed, calling ``iter()`` on the object (e.g. by starting a new
    ``for`` loop) reopens the file and starts again from the first data row.
    Calling ``iter()`` in the middle of a pass does not rewind.

    Parameters
    ----------
    filename : path of the tab-separated file to read.
    nrows : maximum number of data rows returned per pass (default: no limit).
    skiprows : number of leading lines to skip (default 1, a one-line header).
    prepend_label : if True, the stripped label is the first element of
        every returned list.
    pattern : compiled regular expression used to extract the words;
        defaults to the module-level ``WORD_PATTERN``.

    Attributes
    ----------
    maxlen : largest number of words seen in a single row so far.
    idx : number of rows requested in the current pass.
    """

    def __init__(self, filename, nrows=float('inf'), skiprows=1,
                 prepend_label=True, pattern=None):

        self.filename = filename
        self.nrows = nrows
        self.skiprows = skiprows
        self.prepend_label = prepend_label
        self.pattern = WORD_PATTERN if pattern is None else pattern
        self.maxlen = 0
        self._start_pass()

    def _start_pass(self):
        """Open the file and position it on the first data row."""
        self.idx = 0
        self.records = open(self.filename, "r")

        # skip rows, default = 1-line header (tolerates a too-short file)
        for _ in range(self.skiprows):
            next(self.records, None)

    def finish(self):
        """Close the underlying file; safe to call more than once."""
        self.records.close()

    # the defining method, returns a list
    def __next__(self):
        """Return the word list of the next row.

        Raises StopIteration when the file is exhausted, when ``nrows`` rows
        have been returned, or when a row does not split into exactly two
        tab-separated fields (such a row ends the pass; it is not skipped).
        """
        # don't read beyond 'nrows', and stay exhausted once closed
        if self.records.closed or self.idx >= self.nrows:
            self.finish()
            raise StopIteration()

        self.idx += 1

        try:
            record, label = next(self.records).split('\t')
        except (StopIteration, ValueError):
            # StopIteration: end of file; ValueError: malformed row
            self.finish()
            raise StopIteration()

        words = self.pattern.findall(record.lower())
        self.maxlen = max(len(words), self.maxlen)

        if self.prepend_label:
            return [label.strip()] + words
        return words

    def __iter__(self):
        """Return self, reopening the file first if the last pass is over."""
        if self.records.closed:
            self._start_pass()
        return self

In [4]:
# The main step: fit Word2Vec on the first `n_train` records (labels are
# left out of the word lists) and keep only the word vectors (`.wv`).

texts_train = texts_iter(
    DATA_FILE,
    nrows=n_train,
    prepend_label=False,
)
word2vec = Word2Vec(
    texts_train,
    size=EMBEDD_DIM,
    min_count=1,
    sorted_vocab=1,
).wv

print(f"Highest wordcount in the train data-set: {texts_train.maxlen}.")

Highest wordcount in the train data-set: 647.


In [5]:
# choose the dimension of texts: 5% above the longest training text.
# The length is read from the iterator rather than copied from the printed
# output, so it stays correct if the data or `n_train` change.

PADDED_LEN = int(1.05 * texts_train.maxlen)
PADDED_LEN

679

In [15]:
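# # The gensim KeyedVectors object may be saved to a file and reloaded later.
# # (GENSIM_WORD_VEC_FILE is not defined in the settings cell and
# # KeyedVectors is not imported; add both before uncommenting.)
#
# word2vec.save(GENSIM_WORD_VEC_FILE)
# word2vec = KeyedVectors.load(GENSIM_WORD_VEC_FILE, mmap='r')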

In [33]:
# The resulting embedding and the mapping from words to indices
# are contained in this word2vec object.
# Note that one needs the map along with the embedding to properly
# tokenize the texts. Row 0 of the matrix is kept as zeros; it is the
# row used for unknown words and for padding.

# vocabulary size and embedding dimension of the trained vectors
n, m = word2vec.vectors.shape

# one extra all-zero row in front; the trained vectors fill rows 1..n
embedding = np.zeros((n + 1, m))
embedding[1:, :] = word2vec.vectors
print(embedding.shape)

def word_to_idx(word):
    """Return the embedding-row index of `word`, or 0 if it is unknown.

    Known words map to their gensim vocabulary index shifted by one,
    because row 0 of the embedding matrix is reserved for unknown words.
    Only a missing vocabulary entry is treated as "unknown"; any other
    error (e.g. `word2vec` not being defined) is allowed to surface.
    """
    try:
        return word2vec.vocab[word].index + 1
    except KeyError:
        return 0

(299305, 300)


In [7]:
# sanity check: a very common word and a word that is not in the vocabulary
tuple(map(word_to_idx, ('the', 'świerszcz')))

(1, 0)

In [34]:
# save the embedding weights to file
# (create the output directory first; np.save does not create it)
embedd_dir = os.path.dirname(EMBEDD_FILE)
if embedd_dir:
    os.makedirs(embedd_dir, exist_ok=True)
np.save(EMBEDD_FILE, embedding)

In [9]:
# create new files:
# one with the texts converted to sequences of integer tokens (0-padded)
# and the second with integer-encoded labels,
# both using the texts_iter class
# Note that the tokenization is applied to the whole of data
# including test-data (so don't look inside)

# maps each label string to its integer code, in order of first appearance
label_encoding = {}

# generate word-lists using the same iterator as for
# creating the embedding; every list starts with the label
records = texts_iter(DATA_FILE, prepend_label=True)

with open(X_FILE, 'w') as x_file, open(Y_FILE, 'w') as y_file:

    for label, *text in records:

        # normalise once so that the membership test and the lookup
        # always use the same key
        label = label.strip()
        y = label_encoding.setdefault(label, len(label_encoding))
        y_file.write(str(y) + '\n')

        # truncate to PADDED_LEN words, then add post-padding with zeros
        sequence = [str(word_to_idx(word)) for word in text[:PADDED_LEN]]
        sequence += [str(0)] * (PADDED_LEN - len(sequence))

        x_file.write(" ".join(sequence) + '\n')

label_encoding

{'phys': 0, 'math': 1, 'q-bio': 2, 'stat': 3, 'q-fin': 4, 'cs': 5}

In [10]:
# read the text files written above back in as numpy arrays
X = np.loadtxt(fname=X_FILE)
y = np.loadtxt(fname=Y_FILE)

In [11]:
# store both arrays in numpy's binary .npy format
np.save(file=X_FILE_BIN, arr=X)
np.save(file=Y_FILE_BIN, arr=y)

In [None]:
# one day it may be fun to randomly sprinkle zeros
# (as if missing words) to our train data

# # suboptimal function to randomly insert zeros into a list
# # the number of zeros is proportional to the length of the list

# def sprinkle_zeros(list_, frac):
#     k = int(frac * len(list_))
#     for _ in range(k):
#         list_.insert(randint(0,len(list_)), 0)

# li = list(range(1,20))
# sprinkle_zeros(li, 0.1)
# li