In [96]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from keras import Model
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

In [8]:
# Read the pre-trained GloVe vectors into a {token: float32 vector} lookup.
# Each line of the file is "<token> <v1> <v2> ...", whitespace separated.
glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')
embeddings_index = {}

with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        # First field is the token, the remainder are its vector components.
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [9]:
# Load the corpus in which LaTeX has already been masked
# (the masking itself is done in arXiv_shallow.ipynb). The file is tab-separated.
notex_path = os.path.join("data", "notex_all.csv")

with open(notex_path) as csv_handle:
    data = pd.read_csv(csv_handle, sep='\t')

In [11]:
# Restrict training to the first `n_train` rows and pull out the two columns
# used downstream: the raw text and its class label.
n_train = 500_000
data_train = data[:n_train]
text_train = data_train["text"]
label_train = data_train["label"]

In [44]:
# Build the vocabulary from the training texts only (case-folded).
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(texts=text_train)

In [45]:
# Width of each word vector; has to agree with the GloVe file loaded above (the "100d" variant).
EMBEDDING_DIM: int = 100

In [48]:
# One row per tokenizer index, plus one extra row so that the largest index in
# `word_index` is addressable. Rows for words without a GloVe vector stay zero.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

In [50]:
# Number of rows in the embedding matrix (vocabulary size + 1).
embedding_matrix.shape[0]

293075

In [52]:
# Number of vocabulary rows that received a pre-trained vector.
# A row counts as filled when ANY component is non-zero; testing the row sum
# instead would miscount a vector whose components happen to cancel out.
np.count_nonzero(embedding_matrix.any(axis=1))

89582

In [55]:
# Persist the embedding matrix to disk. The context manager guarantees the
# file handle is flushed and closed as soon as the dump finishes, instead of
# leaving an open handle behind for the garbage collector.
with open("GloVe_my_weights.p", "wb") as weights_file:
    pickle.dump(embedding_matrix, weights_file)

In [56]:
# (rows, columns) of the embedding matrix.
np.shape(embedding_matrix)

(293075, 100)

In [77]:
# Tokenise in this cell so that MAX_SEQUENCE_LENGTH does not depend on a
# variable that is only assigned further down the notebook (which makes a
# fresh "Restart & Run All" fail with NameError).
sequences_train = tokenizer.texts_to_sequences(text_train)

# Longest training sequence plus 5 % headroom. A plain max() over the lengths
# avoids pushing a ragged list of lists through np.vectorize.
MAX_SEQUENCE_LENGTH = int(1.05 * max(len(seq) for seq in sequences_train))

In [98]:
# Turn every training text into a list of word indices, then bring all of them
# to the same length by padding at the end ('post').
sequences_train = tokenizer.texts_to_sequences(text_train)
X_train = pad_sequences(
    sequences_train,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

In [99]:
# Embedding layer initialised from the GloVe-derived `embedding_matrix`.
# (The Keras layer/model imports live in the notebook's import cell at the top.)
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,  # same row count as embedding_matrix
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,  # keep the pre-trained vectors fixed during training
)

In [100]:
# Encode the raw labels as integer ids, then one-hot encode those ids so they
# can be used with a softmax / categorical cross-entropy head.
label_e = LabelEncoder()
label_e.fit(label_train)
num_label = label_e.transform(label_train)
y_train = to_categorical(num_label)

# The one-hot matrix has one column per class.
n_classes = y_train.shape[1]
n_classes

6

In [101]:
# Inspect the first padded training sequence.
X_train[0, :]

array([   13,     1,  2082,    38,   148,   201,     3,   999,     5,
         266,   201,   208,     7,   247,     1,   148,   201,   999,
        1306,     5,   524,   266,   201,   208,  2136,    13,  4052,
           2,  2087,   291,  5057,  3217,   161,     7,    83,    11,
           5,    57,     6,  1814,   578,    81,     2,     1,  5057,
        4052,    45,    17,   779,  3764,   320,   148,   201,  3997,
           3,     1,   665,     2,   712,   618,    27,   968,     2,
         148,   201,     3,   268,   999,  1019,    33,     4,   948,
           2,   127,   812,   356,   999,   265,   502,   640,     3,
         268,  2961,     3,  2496,   409,  1991,   999,     3,    42,
        5312,     2,   999,    11,    29,     5,   284,  5138,     2,
           1,   418,   148,   201,   176,    13, 12584,   602,     7,
          30,    11,    45,   999,   582,   284,  5138,     8,  1017,
        3025,     9,     4,   127,   148,   201,   621,   943,   216,
        3312,    36,

In [102]:
# 1-D convolutional text classifier on top of the frozen embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three identical stages: 128 filters of width 5 followed by max-pooling over 5 steps.
x = embedded_sequences
for stage in range(3):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)

# Dense head: flatten the feature maps, one hidden layer, softmax over the classes.
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(n_classes, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

In [None]:
# Train for two epochs in mini-batches of 128, holding out 10 % of the
# samples for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=2,
    batch_size=128,
)

Train on 450000 samples, validate on 50000 samples
Epoch 1/2
Epoch 2/2