In [1]:
# Show the path of the `python` executable found by the notebook's shell.
!which python

/home/mritter/anaconda3/envs/tf_gpu_test04/bin/python


In [2]:
# List the conda packages whose name matches tensorflow-gpu (with versions).
! conda list tensorflow-gpu 

# packages in environment at /home/mritter/anaconda3/envs/tf_gpu_test04:
#
# Name                    Version                   Build  Channel
tensorflow-gpu            1.5.0                         0  
tensorflow-gpu-base       1.5.0            py36h8a131e3_0  


In [3]:
# GPU sanity checks for TensorFlow, Keras and PyTorch.
# If a check fails, the computer may need to be restarted.

# TensorFlow: the local device listing must mention a GPU.
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
assert 'GPU' in str(local_devices)

# Keras: the TensorFlow backend must report at least one GPU.
from keras import backend
keras_gpus = backend.tensorflow_backend._get_available_gpus()
assert len(keras_gpus) > 0

# PyTorch: CUDA must be available with at least one device; print the name
# of the currently selected one.
from torch import cuda
assert cuda.is_available()
assert cuda.device_count() > 0
current_device = cuda.current_device()
print(cuda.get_device_name(current_device))

InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: out of memory

# Reload data

In [7]:
import h5py
import numpy as np

# Read the full arrays from both HDF5 files into memory; each file keeps its
# array under the 'dataset_1' key.
with h5py.File('data/padded_data.h5', mode='r') as h5f:
    data = h5f['dataset_1'][:]

with h5py.File('data/labels.h5', mode='r') as h5f:
    labels = h5f['dataset_1'][:]

In [6]:
# Display the loaded array (NumPy abbreviates large arrays with "...").
data

array([[  0,   0,   0, ...,   1,   5,  14],
       [  0,   0,   0, ...,  44,   2,  14],
       [  0,   0,   0, ...,   7,  14,  15],
       ...,
       [  0,   0,   0, ...,  20,  14,   1],
       [  0,   0,   0, ..., 284, 186,   8],
       [  0,   0,   0, ...,   3,  76,  64]], dtype=int32)

In [8]:
# Split the data into a training set and a validation set.
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation
SPLIT_SEED = 42         # fixed seed so a fresh Run All reproduces the same split

# Shuffle samples and labels with one shared permutation so they stay aligned.
# A private RandomState makes the split reproducible without reseeding the
# global NumPy generator.
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice from an explicit boundary instead of using negative indices:
# `data[:-0]` is empty and `data[-0:]` is the whole array, so negative slicing
# would swap the two sets whenever the validation count rounds down to 0.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]


In [9]:
%%time
# Build an index mapping each word in the GloVe embeddings file to its
# embedding vector. Parsing the whole file takes only a few seconds.
import os 
BASE_DIR = '/home/mritter/code/twitter_nlp/newsgroups_data/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')

print('Indexing word vectors.')

embeddings_index = {}
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail on or mis-decode non-ASCII tokens.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>, separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.
CPU times: user 8.26 s, sys: 196 ms, total: 8.46 s
Wall time: 8.46 s


In [11]:
%%time 
# Prepare the embedding matrix: row i holds the pre-trained vector of the
# word whose tokenizer index is i.
from keras.preprocessing.text import Tokenizer

# NOTE(review): `tokenizer` is not created anywhere in the visible cells (this
# cell raised NameError when it was run). It presumably has to be the fitted
# Tokenizer from the preprocessing step that produced the padded data --
# confirm and load/recreate it before running this cell.
word_index = tokenizer.word_index  # single source for the word -> index map

# Keras' Tokenizer never assigns index 0 to a word, so the matrix needs one
# row more than there are words.
num_distinct_words = len(word_index) + 1
EMBEDDING_DIM = 100  # Dimensions to represent each token

embedding_matrix = np.zeros((num_distinct_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_distinct_words:
        # Valid rows are 0 .. num_distinct_words - 1; an index equal to
        # num_distinct_words would already be out of bounds.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Using TensorFlow backend.


NameError: name 'tokenizer' is not defined

In [14]:
import numpy as np
import h5py

# Pull the embedding matrix and the train/validation arrays out of a single
# HDF5 file; every dataset is read fully while the file is still open.
with h5py.File('data/whole_data.h5', 'r') as h5f:
    embedding_matrix, xtrain, ytrain, x_val, y_val = [
        h5f[key][:]
        for key in ('embedding_matrix', 'x_train', 'y_train', 'x_val', 'y_val')
    ]


<Closed HDF5 file>

In [16]:
# Check the dimensions of the embedding matrix.
embedding_matrix.shape

(57664, 100)

In [18]:
# Check the dimensions of the training inputs.
xtrain.shape

(47578, 500)

In [36]:
# Wrap the pre-trained word embeddings in an Embedding layer that is frozen
# (trainable=False keeps the vectors fixed), then put an LSTM and a small
# dense softmax classifier on top of it.
from keras.initializers import Constant
from keras.layers import (Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input,
                          LSTM, MaxPooling1D)
from keras.models import Model

num_distinct_words, EMBEDDING_DIM = embedding_matrix.shape
embedding_layer = Embedding(
    num_distinct_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=xtrain.shape[1],
    trainable=False,
)

# The model input is one int32 value per position of an xtrain row.
sequence_input = Input(shape=(xtrain.shape[1],), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Recurrent encoder over the embedded sequence.
x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)

# Disabled alternatives (a sigmoid output and a 1D convnet with global
# max-pooling), kept for reference:
# model.add(Dense(1, activation='sigmoid'))
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = GlobalMaxPooling1D()(x)

# Two ReLU layers (128 then 64 units) followed by a 2-way softmax output.
for hidden_units in (128, 64):
    x = Dense(hidden_units, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)



In [37]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 500, 100)          5766400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 5,908,546
Trainable params: 142,146
Non-trainable params: 5,766,400
____________________________________________________________

In [30]:
# Disabled on purpose: uncommenting the two lines below would import the
# Keras backend as K and call K.clear_session().
# import keras.backend as K
# K.clear_session() 

In [None]:
# Log this run to TensorBoard under a directory named after the current
# date and time (to the minute), then fit the model.
from datetime import datetime
from time import time
from keras.callbacks import TensorBoard as tb

t = datetime.now()
log_dir = 'tensorboard_logs/{:%Y-%m-%d-%H-%M}'.format(t)
tensorboard = tb(log_dir=log_dir)

fit_options = dict(
    batch_size=64,  # 128 was noted as an alternative
    epochs=10,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard],
)
model.fit(xtrain, ytrain, **fit_options)


Train on 47578 samples, validate on 11894 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10