In [1]:
#cell-width control
# Widen the notebook's ".container" element to 80% of the browser window by
# injecting a <style> tag into the page.
# `display` and `HTML` are imported from the public `IPython.display` module;
# `IPython.core.display` is the deprecated location for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Imports

In [2]:
#packages
import numpy
import tensorflow as tf
from tensorflow.core.example import example_pb2

#utils
import os
import random
import pickle
import struct
import time
from generators import *

#keras
import keras
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout, Activation, Concatenate, Dot, Embedding, LSTM, Conv1D, MaxPooling1D, Input, Lambda
    #callbacks
from keras.callbacks import TensorBoard, ModelCheckpoint, Callback


Using TensorFlow backend.


# Seeding

In [3]:
# One seed value is pushed into every random number generator this notebook
# has imported, so a fresh "Restart & Run All" starts from the same state.
sd = 3
from numpy.random import seed
seed(sd)
from tensorflow import set_random_seed
set_random_seed(sd)
# The stdlib generator is imported with the other utilities but was never
# seeded. Anything drawing from `random` (whether the batch generators do is
# not visible from this notebook) would otherwise differ between runs.
random.seed(sd)

# CPU usage

In [4]:
# Enumerate CUDA devices in PCI bus order and expose none of them to this
# process: with an empty visible-device list the GPU is hidden, so the
# notebook runs on the CPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",
})

# Global parameters

In [5]:
# Embedding
# NOTE(review): max_features is not referenced in the visible cells; it matches
# the '400000/' directory the tokenizer is loaded from - presumably the
# vocabulary cap, confirm.
max_features = 400000
# Number of time steps fed to the text input and to the summary input.
maxlen_text, maxlen_summ = 400, 80
# Width of each embedded token vector (the original note mentions 128 as an
# alternative value).
embedding_size = 100

# Convolution (the same settings are used for both input routes)
kernel_size, filters, pool_size = 5, 64, 4

# LSTM
lstm_output_size = 70

# Training
batch_size, epochs = 32, 20

# Load data

In [6]:
# Machine-specific input locations. The trailing slash is kept on purpose:
# data_dir is also handed to the batch generators as-is.
#data_dir = '/mnt/disks/500gb/experimental-data-mini/experimental-data-mini/generator-dist-1to1/1to1/'
data_dir = '/media/oala/4TB/experimental-data/experiment-1_nonconform-models/generator-dist/1to1/'
#processing_dir = '/mnt/disks/500gb/stats-and-meta-data/400000/'
processing_dir = '/media/oala/4TB/experimental-data/stats-and-meta-data/400000/'

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only point these paths at artefacts produced by trusted code.

# Sample-id partition (indexed below by 'train' / 'validation') and the labels.
with open(os.path.join(data_dir, 'partition.pickle'), 'rb') as handle:
    partition = pickle.load(handle)
with open(os.path.join(data_dir, 'labels.pickle'), 'rb') as handle:
    labels = pickle.load(handle)

# Fitted tokenizer and the embedding matrix that goes with it.
with open(os.path.join(processing_dir, 'tokenizer.pickle'), 'rb') as handle:
    tokenizer = pickle.load(handle)
embedding_matrix = numpy.load(os.path.join(processing_dir, 'embedding_matrix.npy'))

#the p_n constant
# Only referenced by a commented-out Lambda layer in the model cell.
c = 80000

# Model

In [7]:
# Two parallel routes - one for the text, one for the summary - each made of
# dropout -> 1D convolution -> max pooling -> LSTM. The routes do not share
# layers. Layers are still created and applied text-first, then summary, at
# every stage, so auto-generated layer names and seeded initialisation are
# unchanged.

# Settings common to both convolution layers.
conv_options = dict(padding='valid', activation='relu', strides=1)

#2way input
# Both inputs are already-embedded sequences: (time steps, embedding_size).
text_input = Input(shape=(maxlen_text, embedding_size), dtype='float32')
summ_input = Input(shape=(maxlen_summ, embedding_size), dtype='float32')

#2way dropout
text_route, summ_route = (
    Dropout(0.25)(text_input),
    Dropout(0.25)(summ_input),
)

#2way conv
text_route, summ_route = (
    Conv1D(filters, kernel_size, **conv_options)(text_route),
    Conv1D(filters, kernel_size, **conv_options)(summ_route),
)

#2way max pool
text_route, summ_route = (
    MaxPooling1D(pool_size=pool_size)(text_route),
    MaxPooling1D(pool_size=pool_size)(summ_route),
)

#2way lstm
# Each LSTM returns only its final state: one lstm_output_size vector per sample.
text_route, summ_route = (
    LSTM(lstm_output_size)(text_route),
    LSTM(lstm_output_size)(summ_route),
)

#get dot of both routes
# normalize=True L2-normalises both vectors first, so this is their cosine
# similarity.
merged = Dot(axes=1, normalize=True)([text_route, summ_route])

#negate results
#merged = Lambda(lambda x: -1*x)(merged)

#add p_n constant
#merged = Lambda(lambda x: x + c)(merged)

#output
# A single sigmoid unit turns the similarity score into a probability.
output = Dense(1, activation='sigmoid')(merged)

#define model
model = Model(inputs=[text_input, summ_input], outputs=[output])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train model

In [8]:
#callbacks
class BatchHistory(keras.callbacks.Callback):
    """Collect the loss and accuracy that Keras reports after every batch.

    Attributes (reset each time training starts):
        losses: list with one 'loss' log value per finished batch.
        accs: list with one accuracy log value per finished batch.
    """

    def on_train_begin(self, logs=None):
        """Start a training run with empty per-batch records."""
        self.losses = []
        self.accs = []

    def on_batch_end(self, batch, logs=None):
        """Append this batch's loss and accuracy to the records.

        Args:
            batch: index of the batch that just finished (unused).
            logs: metric dict supplied by Keras; may be None.
        """
        # None instead of a shared mutable {} default; normalise it here.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        # The key of the accuracy metric depends on the Keras release: 'acc'
        # in older versions, 'accuracy' (the name passed to compile) from
        # Keras 2.3 on. Try both so the record is not silently all None.
        acc = logs.get('acc')
        if acc is None:
            acc = logs.get('accuracy')
        self.accs.append(acc)
        
# Callback instances handed to fit_generator: per-batch metric recorder,
# TensorBoard logging, and a checkpoint that keeps only the best model.
history = BatchHistory()
# NOTE(review): histogram_freq=0 disables histogram computation; write_grads
# presumably has no effect in that case - confirm against the Keras docs.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    batch_size=batch_size,
    write_graph=True,
    write_grads=True,
)
# Overwrites 'best.h5' whenever the validation loss reaches a new minimum.
modelcheckpoint = ModelCheckpoint(
    'best.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    period=1,
    verbose=0,
)

#batch generator parameters
# Keyword arguments shared by the training and validation generators.
params = dict(
    dim=[(maxlen_text, embedding_size), (maxlen_summ, embedding_size)],
    batch_size=batch_size,
    shuffle=True,
    tokenizer=tokenizer,
    embedding_matrix=embedding_matrix,
    maxlen_text=maxlen_text,
    maxlen_summ=maxlen_summ,
    data_dir=data_dir,
    sample_info=None,
)

#generators
# One batch generator per split, built from the same labels and parameters.
training_generator, validation_generator = (
    ContAllGenerator(partition[split], labels, **params)
    for split in ('train', 'validation')
)

# Train model on dataset
# Batches are prepared by 6 worker processes. The call stays the last
# expression of the cell so the returned History object is still displayed.
model.fit_generator(
    generator=training_generator,
    validation_data=validation_generator,
    epochs=epochs,
    callbacks=[tensorboard, modelcheckpoint, history],
    workers=6,
    use_multiprocessing=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe0f99f0a20>

In [9]:
# Persist the per-batch loss and accuracy records collected by the
# BatchHistory callback, one pickle file per series.
for filename, series in (('losses.pickle', history.losses),
                         ('accs.pickle', history.accs)):
    with open(filename, 'wb') as handle:
        pickle.dump(series, handle, protocol=pickle.HIGHEST_PROTOCOL)