In [8]:
import os
import numpy as np
import pandas as pd
import pickle
import re

from sklearn.preprocessing import LabelEncoder

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Parse the pre-trained GloVe text file into a {token: vector} lookup.
# Every line is split on whitespace: the first field is the token, the
# remaining fields are its coefficients (stored as float32).
glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')
embeddings_index = {}

with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [3]:
# Load the texts with masked LaTeX (see arXiv_shallow.ipynb); the file is
# tab-separated.
notex_path = os.path.join("data", "notex_all.csv")

with open(notex_path) as notex_file:
    data = pd.read_csv(notex_file, sep='\t')

In [11]:
# Keep only the first n_train rows for training and pull out the two
# columns used downstream.
n_train = 500_000
data_train = data.iloc[:n_train]
text_train = data_train["text"]
label_train = data_train["label"]

In [44]:
# Fit a lower-casing tokenizer on the training texts; its word_index is what
# the embedding matrix and the padded sequences are built from.
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(text_train)

In [45]:
EMBEDDING_DIM = 100

In [48]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer
# index is i. Words missing from embeddings_index keep their all-zero row.
vocab_rows = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

In [50]:
len(embedding_matrix)

293075

In [52]:
# Number of rows that received a pre-trained vector. Test the entries
# themselves rather than the row sum, so a row whose non-zero coefficients
# happen to cancel to exactly 0 is still counted.
np.count_nonzero(embedding_matrix.any(axis=1))

89582

In [55]:
# Persist the embedding matrix. The context manager makes sure the file is
# flushed and closed even if pickling raises; a bare open(...) passed to
# pickle.dump leaves the handle open until garbage collection.
with open("GloVe_my_weights.p", "wb") as weights_file:
    pickle.dump(embedding_matrix, weights_file)

In [56]:
embedding_matrix.shape

(293075, 100)

In [77]:
# Tokenize first: the padding length is derived from these sequences, so
# they must exist before MAX_SEQUENCE_LENGTH is computed (the previous cell
# order raised NameError on a fresh kernel).
sequences_train = tokenizer.texts_to_sequences(text_train)

# Pad to 5% beyond the longest training sequence. The builtin max over
# len() works on ragged lists and also when every sequence has the same
# length, where np.vectorize(len) would descend into the integer elements.
MAX_SEQUENCE_LENGTH = int(1.05 * max(map(len, sequences_train)))

X_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [99]:
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dense
from keras import Model

# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [100]:
# Map the labels to integer ids, then one-hot encode them as targets for
# the softmax output layer.
label_e = LabelEncoder()
label_e.fit(label_train)
num_label = label_e.transform(label_train)
y_train = to_categorical(num_label)

# The width of the one-hot matrix is the number of distinct classes.
n_classes = y_train.shape[1]
n_classes

6

In [101]:
X_train[0]

array([   13,     1,  2082,    38,   148,   201,     3,   999,     5,
         266,   201,   208,     7,   247,     1,   148,   201,   999,
        1306,     5,   524,   266,   201,   208,  2136,    13,  4052,
           2,  2087,   291,  5057,  3217,   161,     7,    83,    11,
           5,    57,     6,  1814,   578,    81,     2,     1,  5057,
        4052,    45,    17,   779,  3764,   320,   148,   201,  3997,
           3,     1,   665,     2,   712,   618,    27,   968,     2,
         148,   201,     3,   268,   999,  1019,    33,     4,   948,
           2,   127,   812,   356,   999,   265,   502,   640,     3,
         268,  2961,     3,  2496,   409,  1991,   999,     3,    42,
        5312,     2,   999,    11,    29,     5,   284,  5138,     2,
           1,   418,   148,   201,   176,    13, 12584,   602,     7,
          30,    11,    45,   999,   582,   284,  5138,     8,  1017,
        3025,     9,     4,   127,   148,   201,   621,   943,   216,
        3312,    36,

In [102]:
# 1D-CNN text classifier on top of the frozen embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three identical stages: Conv1D (128 filters, kernel size 5, ReLU)
# followed by max-pooling with pool size 5.
x = embedded_sequences
for _ in range(3):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)

# Dense head: flatten, one hidden layer, softmax over the classes.
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(n_classes, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Train for two epochs; validation_split=0.1 holds out 10% of the training
# data for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    validation_split=0.1,
)

Train on 450000 samples, validate on 50000 samples
Epoch 1/2
Epoch 2/2

In [45]:
# A token is a word character followed by one or more word characters,
# apostrophes or backticks, so single-character tokens are not matched.
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
word_pattern = re.compile(r"\w[\w'`]+")


def texts_gen(filename, encoding=None):
    """Lazily yield the tokenised first field of every data line.

    Opens ``data/<filename>``, skips the header line, and for each remaining
    line lower-cases the first tab-separated field and yields the list of
    tokens that ``word_pattern`` finds in it.

    Args:
        filename: Name of a tab-separated file inside the ``data`` directory.
        encoding: Forwarded to ``open``; ``None`` keeps the platform default.

    Yields:
        list[str]: Tokens of one line's first field (may be empty).
    """
    with open(os.path.join("data", filename), encoding=encoding) as file:
        # Skip the header; the default keeps an empty file from raising.
        next(file, None)
        # Iterate the file directly so the generator finishes cleanly at EOF.
        # Calling next(file) inside `while True` lets StopIteration escape
        # the generator body, which PEP 479 converts into RuntimeError.
        for line in file:
            text = line.split('\t')[0].lower()
            yield word_pattern.findall(text)

my_texts = texts_gen("notex_all.csv")

# Peek at tokens 20..26 of the first ten documents. This advances the
# generator, so later cells continue from the eleventh document.
for doc_idx in range(10):
    tokens = next(my_texts)
    print(tokens[20:27])

['cosmological', 'galaxy', 'formation', 'simulations', 'focusing', 'on', 'progenitors']
['complex', 'projective', 'space', 'where', 'we', 'stratify', 'by']
['spherical', 'fluid', 'shells', 'are', 'discussed', 'the', 'analogy']
['field']
['in', 'vitro', 'model', 'of', 'the', 'pre', 'vascular']
['r_p', 'for', 'the', 'stellar', 'mass', 'of', 'galaxies']
['arise', 'from', 'the', 'action', 'of', 'countable', 'discrete']
['of', 'higher', 'order', 'derivatives', 'on', 'the', 'homogeneous']
['investigated', 'numerically', 'and', 'experimentally', 'to', 'compare', 'our']
['in', 'small', 'dimensions', 'the', 'new', 'results', 'depend']


In [47]:
# Pull the next 100 tokenised documents from the generator into a list.
my_texts_sample = [next(my_texts) for _ in range(100)]

In [48]:
my_texts_sample[:3]

[['observations',
  'of',
  'apparent',
  'superslow',
  'wave',
  'propagation',
  'in',
  'solar',
  'prominences',
  'phase',
  'mixing',
  'of',
  'standing',
  'continuum',
  'alfv',
  'en',
  'waves',
  'and',
  'or',
  'continuum',
  'slow',
  'waves',
  'in',
  'atmospheric',
  'magnetic',
  'structures',
  'such',
  'as',
  'coronal',
  'arcades',
  'can',
  'create',
  'the',
  'apparent',
  'effect',
  'of',
  'wave',
  'propagating',
  'across',
  'the',
  'magnetic',
  'field',
  'we',
  'observe',
  'prominence',
  'with',
  'sdo',
  'aia',
  'on',
  '2015',
  'march',
  '15',
  'and',
  'find',
  'the',
  'presence',
  'of',
  'oscillatory',
  'motion',
  'we',
  'aim',
  'to',
  'demonstrate',
  'that',
  'interpreting',
  'this',
  'motion',
  'as',
  'magneto',
  'hydrodynamic',
  'mhd',
  'wave',
  'is',
  'faulty',
  'we',
  'also',
  'connect',
  'the',
  'decrease',
  'of',
  'the',
  'apparent',
  'velocity',
  'over',
  'time',
  'with',
  'the',
  'phase',
  'm

In [72]:
from gensim.models import Word2Vec, Phrases, KeyedVectors

In [50]:
# Fit a gensim Phrases model on the sampled token lists; it is applied to
# the corpus below via bigram[...] before Word2Vec training.
bigram = Phrases(my_texts_sample)

In [103]:
# Train a toy Word2Vec model on the first 20 sampled documents after they
# pass through the `bigram` phrase model; vectors have 300 dimensions.
# NOTE(review): no seed/worker settings are passed, so vectors presumably
# vary between runs — confirm if reproducibility matters.
w2v_1 = Word2Vec(bigram[my_texts_sample[:20]], size=300, min_count=1)

In [104]:
len(w2v_1.wv['the'])

300

In [105]:
w2v_1.wv['the'][:10]

array([ 0.00065569, -0.00088401, -0.00088979, -0.00033883,  0.002294  ,
        0.00070945,  0.00080252,  0.00109801,  0.00317515, -0.00197349],
      dtype=float32)

In [106]:
# Save only the word vectors (the model's `wv` object), not the Word2Vec
# model itself.
w2v_1.wv.save("toy_gensim_word2vec.kv")

In [125]:
# First ten coefficients of row 100 of the raw embedding matrix. `syn0` is
# a deprecated alias that emits a DeprecationWarning; `vectors` is the
# supported attribute.
w2v_1.wv.vectors[100][:10]

  """Entry point for launching an IPython kernel.


array([-0.0011855 ,  0.00105692, -0.00124981, -0.00029519,  0.00181188,
       -0.00128302, -0.00068208,  0.00106538,  0.00014763, -0.00107676],
      dtype=float32)

Reference — finding the number of words in a gensim Word2Vec vocabulary: https://stackoverflow.com/questions/35596031/gensim-word2vec-find-number-of-words-in-vocabulary

In [124]:
w2v_1.wv.index2word[100]

'approximate'

In [126]:
w2v_1.wv.vocab['approximate']

<gensim.models.keyedvectors.Vocab at 0x1381bb9f9e8>

In [127]:
w2v_1.wv['approximate'][:10]

array([-0.0011855 ,  0.00105692, -0.00124981, -0.00029519,  0.00181188,
       -0.00128302, -0.00068208,  0.00106538,  0.00014763, -0.00107676],
      dtype=float32)

In [82]:
# Reload the saved word vectors from disk, memory-mapped read-only
# (mmap='r'). The contents are whatever the save cell last wrote to this file.
w2v_2 = KeyedVectors.load("toy_gensim_word2vec.kv", mmap='r')

In [84]:
w2v_2['the'][:10]

array([ 9.8480273e-04, -8.0193748e-04, -5.6996860e-04, -1.1877532e-03,
        1.5081668e-03,  3.5077956e-04, -4.1254156e-05,  1.1921820e-03,
        2.1408624e-03, -1.5976258e-03], dtype=float32)

In [96]:
w2v_2.vocab

{'observations': <gensim.models.keyedvectors.Vocab at 0x1381bb55e10>,
 'of': <gensim.models.keyedvectors.Vocab at 0x1381bb55f28>,
 'apparent': <gensim.models.keyedvectors.Vocab at 0x1381bb55f60>,
 'superslow': <gensim.models.keyedvectors.Vocab at 0x1381bb55f98>,
 'wave': <gensim.models.keyedvectors.Vocab at 0x1381bb55d30>,
 'propagation': <gensim.models.keyedvectors.Vocab at 0x1381bc35048>,
 'in': <gensim.models.keyedvectors.Vocab at 0x1381bc350b8>,
 'solar': <gensim.models.keyedvectors.Vocab at 0x1381bc35128>,
 'prominences': <gensim.models.keyedvectors.Vocab at 0x1381bc35160>,
 'phase': <gensim.models.keyedvectors.Vocab at 0x1381bc351d0>,
 'mixing': <gensim.models.keyedvectors.Vocab at 0x1381bc35240>,
 'standing': <gensim.models.keyedvectors.Vocab at 0x1381bc35278>,
 'continuum': <gensim.models.keyedvectors.Vocab at 0x1381bc352b0>,
 'alfv': <gensim.models.keyedvectors.Vocab at 0x1381bc35320>,
 'en': <gensim.models.keyedvectors.Vocab at 0x1381bc35390>,
 'waves': <gensim.models.keyedve

In [86]:
# First ten coefficients of row 0 of the loaded embedding matrix. `syn0` is
# a deprecated alias that emits a DeprecationWarning; `vectors` is the
# supported attribute.
w2v_2.vectors[0][:10]

  """Entry point for launching an IPython kernel.


array([ 9.8480273e-04, -8.0193748e-04, -5.6996860e-04, -1.1877532e-03,
        1.5081668e-03,  3.5077956e-04, -4.1254156e-05,  1.1921820e-03,
        2.1408624e-03, -1.5976258e-03], dtype=float32)