In [1]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Load the corpus whose LaTeX has been masked (see arXiv_shallow.ipynb).
# The file is tab-separated.

file = os.path.join("data", "notex_all.csv")
data = pd.read_csv(file, sep='\t')

In [3]:
# total number of records loaded
n_total = data.shape[0]
n_total

838804

In [4]:
# Sequential train / validation / test partition of the rows, in file order:
# 500k - 200k - remainder (138k)

n_train = 500_000
n_val = 200_000
n_test = n_total - n_train - n_val

# row position where the validation slice stops and the test slice starts
val_end = n_train + n_val

data_train = data.iloc[:n_train]
text_train, label_train = data_train["text"], data_train["label"]

data_val = data.iloc[n_train:val_end]
text_val, label_val = data_val["text"], data_val["label"]

data_test = data.iloc[val_end:]
text_test, label_test = data_test["text"], data_test["label"]

n_test

138804

In [5]:
# Size up the training texts with a bag-of-words count:
#   unique_words   - number of distinct tokens over the whole train set
#   max_word_count - number of tokens in the longest single train text
#   num_words      - unique_words plus one extra label (0) reserved for padding
#   padded_length  - max_word_count with 5% headroom, truncated to an int
# These integers are stored and fed to keras later on.
# NOTE(review): CountVectorizer's tokenization (default token pattern, accent
# stripping) is not the same as keras' Tokenizer, so these counts may differ
# from what the Tokenizer actually produces -- confirm.

count_v = CountVectorizer(strip_accents='unicode', min_df=1)
word_counts_train = count_v.fit_transform(text_train)

# rows are documents, columns are vocabulary entries
unique_words = word_counts_train.shape[1]
num_words = 1 + unique_words

# per-document token totals, then the largest of them
tokens_per_text = word_counts_train.sum(axis=1)
max_word_count = tokens_per_text.max()
padded_length = int(max_word_count * 1.05)

print(f"Counted {unique_words} unique words in the whole train dataset.")
print(f"Take numbers 1 to 'num_words'={num_words} as numerical labels for the unique words, and 0 for 'silence'.")
print(f"A single record has at most {max_word_count} words.")
print(
    "The text will be tokenized into sequences of numerical labels and put into arrays of length "
    f"'padded_length'={padded_length} right-padded with zeros. The extra 5% of length is there in case a longer text "
    "would need to be tokenized and classified later (e.g. from a test dataset)."
)

Counted 277303 unique words in the whole train dataset.
Take numbers 1 to 'num_words'=277304 as numerical labels for the unique words, and 0 for 'silence'.
A single record has at most 647 words.
The text will be tokenized into sequences of numerical labels and put into arrays of length 'padded_length'=679 right-padded with zeros. The extra 5% of length is there in case a longer text would need to be tokenized and classified later (e.g. from a test dataset).


In [6]:
# collect the vocabulary / padding sizes that are persisted further down
global_params = dict(
    unique_words=unique_words,
    num_words=num_words,
    padded_length=padded_length,
)
global_params

{'unique_words': 277303, 'num_words': 277304, 'padded_length': 679}

In [7]:
# Fit the keras tokenizer on the training texts only, then encode every split
# with that same vocabulary.
#
# keras' Tokenizer keeps only word indices strictly below `num_words`, i.e. the
# (num_words - 1) most frequent words. To keep up to `unique_words` words
# (labels 1..unique_words, with 0 left for padding) the limit therefore has to
# be `num_words` (= unique_words + 1), not `unique_words`.
tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(text_train)


def encode_texts(texts):
    """Turn raw texts into integer sequences and a fixed-width padded array.

    Uses the already fitted module-level `tokenizer` and `padded_length`.

    Returns
    -------
    (sequences, padded)
        `sequences` is the list of per-text index lists from the tokenizer;
        `padded` is the 2-D array of width `padded_length` built from it.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    # padding='pre' puts the zeros in front of short sequences;
    # truncating='post' cuts over-long sequences at their end.
    # NOTE(review): the summary printed earlier in this notebook speaks of
    # right-padding, whereas 'pre' pads on the left -- confirm which is intended.
    padded = pad_sequences(sequences, maxlen=padded_length, padding='pre', truncating='post')
    return sequences, padded


sequences_train, X_train = encode_texts(text_train)
sequences_val, X_val = encode_texts(text_val)
sequences_test, X_test = encode_texts(text_test)

In [8]:
# (rows, padded_length) of each encoded split
tuple(split.shape for split in (X_train, X_val, X_test))

((500000, 679), (200000, 679), (138804, 679))

In [9]:
# Map the string labels to integer codes (fitted on the training labels only)
# and one-hot encode them for keras.
label_e = LabelEncoder()
num_label = label_e.fit_transform(label_train)

# Fix the one-hot width to the number of classes seen in training. Without
# `num_classes`, to_categorical infers the width from the largest code present,
# so a split that happens to lack the last class would get a narrower matrix.
n_label_classes = len(label_e.classes_)

y_train = to_categorical(num_label, num_classes=n_label_classes)
# transform() re-uses the training mapping for the other splits
y_val = to_categorical(label_e.transform(label_val), num_classes=n_label_classes)
y_test = to_categorical(label_e.transform(label_test), num_classes=n_label_classes)

In [10]:
# (rows, number of classes) of each one-hot label matrix
tuple(split.shape for split in (y_train, y_val, y_test))

((500000, 6), (200000, 6), (138804, 6))

In [11]:
# Translation table: label string <-> integer code <-> one-hot row

label_codes = np.unique(num_label)

print(label_e.inverse_transform(label_codes))
print(label_codes)
print(to_categorical(label_codes))

['cs' 'math' 'phys' 'q-bio' 'q-fin' 'stat']
[0 1 2 3 4 5]
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [12]:
# compute class weights for keras (obtained from the whole of train sample)

# distinct integer label codes present in the training labels
classes_present = np.unique(num_label)

n_classes = len(classes_present)
global_params['n_classes'] = n_classes

# 'balanced' weights are inversely proportional to the class frequencies,
# ~ 1 / np.unique(label, return_counts=True)[1].
# `classes` and `y` are passed by keyword: recent scikit-learn releases accept
# them as keyword-only arguments, and older releases accept keywords as well.
class_weights = compute_class_weight('balanced', classes=classes_present, y=num_label)
global_params['class_weights'] = class_weights

In [13]:
# Persist the collected parameters. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("global_params.p", "wb") as params_file:
    pickle.dump(global_params, params_file)
global_params

{'unique_words': 277303,
 'num_words': 277304,
 'padded_length': 679,
 'n_classes': 6,
 'class_weights': array([ 1.26825655,  0.72736371,  0.27602776, 13.23801959, 30.29201502,
         9.49559404])}

In [14]:
# dump all X's and y's to uncompressed .npy files under data/Kdata

kdata_dir = os.path.join("data", "Kdata")
# np.save does not create missing directories, so make sure the target exists
os.makedirs(kdata_dir, exist_ok=True)


def save_array(name, array):
    """Write `array` to <kdata_dir>/<name>.npy and return the path written."""
    path = os.path.join(kdata_dir, name + ".npy")
    np.save(path, array)
    return path


f_X_train = save_array("X_train", X_train)
f_y_train = save_array("y_train", y_train)

f_X_val = save_array("X_val", X_val)
f_y_val = save_array("y_val", y_val)

f_X_test = save_array("X_test", X_test)
f_y_test = save_array("y_test", y_test)

In [55]:
# Sanity check: read the saved training matrix back from disk.
# NOTE(review): this cell carries execution count 55 while the cells above are
# numbered 1-14, so its displayed output may come from an earlier run -- re-run
# to confirm it matches the current code.
X_train_path = os.path.join("data", "Kdata", "X_train.npy")
np.load(X_train_path)

array([[   13,     1,  2082, ...,     0,     0,     0],
       [ 6166,  2340,  6114, ...,     0,     0,     0],
       [ 5375,    13,    79, ...,     0,     0,     0],
       ...,
       [    4,    56,  2290, ...,     0,     0,     0],
       [ 2841,     2,     1, ...,     0,     0,     0],
       [12727,     2,   283, ...,     0,     0,     0]])

---

### End