In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline
# This block is important if we want the memory to grow on the GPU, and not block-allocate the whole thing
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
from datetime import datetime
import os
from pathlib import Path
from sklearn.utils import class_weight 
# Hyperparam opt
import talos as ta

# Create a session whose GPU options have allow_growth switched on, and hand
# that session to the Keras TensorFlow backend.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

# Set path to find modelling tools for later use
import sys
sys.path.append(os.path.join(os.getcwd(),".."))
# Global params live here
import haberrspd.charCNN.globals

Using TensorFlow backend.


In [None]:
# Load data-loader
# (no trailing comma: an unparenthesised `from ... import a,` is a SyntaxError)
from haberrspd.charCNN.auxiliary_tf import create_training_data
# Load model
from haberrspd.charCNN.models_tf import char_cnn_model, char_cnn_model_talos

## Load training data and validation data as well as auxiliary model parameters

In [None]:
# Folder holding the preprocessed MJFF files (path is relative to this notebook)
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"

# Train/test arrays, their labels, and the sentence length reported by the loader
(X_train, X_test,
 y_train, y_test,
 max_sentence_length) = create_training_data(DATA_ROOT,
                                             "EnglishData-preprocessed.csv",
                                             'sentence')

# 'balanced' weights, one per distinct label observed in the training labels
observed_labels = list(set(y_train))
class_weights = class_weight.compute_class_weight('balanced', observed_labels, y_train)

In [None]:
# Rev-up tensorboard
# (TensorBoard logging is currently disabled; the two lines below are kept for reference.)
# logdir="../logs/char-cnn-keras-" + datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = TensorBoard(log_dir=logdir)
# Set model
# The model is built from the sentence length returned by the data loader,
# then its layer summary is printed.
model = char_cnn_model(max_sentence_length)
model.summary()

## Assign a loss function

In [None]:
loss_func = 'squared_hinge'

if loss_func in ('hinge', 'squared_hinge'):
    # Hinge-type losses: every label equal to 0 becomes -1, all others are kept
    y_train, y_test = (
        [-1 if label == 0 else label for label in split]
        for split in (y_train, y_test)
    )
elif loss_func == 'binary_crossentropy':
    # Cross-entropy: if any -1 label is present, map -1 back to 0
    has_negative_labels = (-1 in y_train) or (-1 in y_test)
    if has_negative_labels:
        y_train, y_test = (
            [0 if label == -1 else label for label in split]
            for split in (y_train, y_test)
        )

## Train model

In [None]:
# Callback classes used in this cell; imported here so the cell does not depend
# on names left over from earlier kernel state.
from keras.callbacks import EarlyStopping, ModelCheckpoint
# NOTE(review): `PlotLossesCallback` (used in the fit call below) is not imported
# anywhere in the visible notebook -- presumably it comes from livelossplot;
# confirm and import it before running this cell on a fresh kernel.

# Compile
model.compile(loss=loss_func,  # TODO: change to cosine loss, cosine_proximity, binary_crossentropy
              optimizer='adam',            # TODO: check which is most appropriate
              metrics=['accuracy'])        # Probs other options here which are more useful

# Make sure the checkpoint directory exists (no error if it is already there)
checkpoint_dir = '../../keras_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Callbacks
file_name = "char-CNN"
# Checkpoints are written INTO the directory created above; before, the
# directory was created but the files were written to the working directory.
check_cb = ModelCheckpoint(os.path.join(checkpoint_dir,
                                        file_name + '.{epoch:02d}-{val_loss:.2f}.hdf5'),
                           monitor='val_loss',
                           verbose=0,
                           save_best_only=True,
                           mode='min')

earlystop_cb = EarlyStopping(monitor='val_loss',
                             patience=7,
                             verbose=0,
                             mode='auto')

# history = LossHistory()

# Class imbalance between patients and controls is handled through `class_weight`.
# Keras documents `class_weight` as a dict {label: weight}, while
# compute_class_weight returns a bare array, so build the dict here.
# Keys are the labels currently in y_train (they may have been remapped to -1/1
# for hinge losses).
# NOTE(review): assumes the weights are ordered by ascending label -- confirm.
sorted_labels = sorted(set(y_train))
if isinstance(class_weights, dict):
    ordered_weights = [class_weights[key] for key in sorted(class_weights)]
else:
    ordered_weights = list(class_weights)
class_weight_map = dict(zip(sorted_labels, ordered_weights))

fit_hist = model.fit(X_train,
                     y_train,
                     validation_data=(X_test, y_test),
                     verbose=0, # Set to zero if using live plotting of losses
                     class_weight=class_weight_map,
                     batch_size=128,
                     epochs=40,
                     #shuffle=True, # Our data is already shuffled during data loading
                     callbacks=[
                                #check_cb,
                                #tensorboard_callback,
                                PlotLossesCallback(),
                                #earlystop_cb
                               ]
                    )

# TALOS: hyperparameter optimisation

In [3]:
from haberrspd.charCNN.models_tf import char_cnn_model_talos
from haberrspd.charCNN.auxiliary_tf import create_training_data
from numpy import vstack, asarray

### Main

In [25]:
# Placeholders: these need to be set to real values before the grid is used.
# NOTE(review): this overwrites any `class_weights` / `max_sentence_length`
# already loaded by an earlier cell.
class_weights,max_sentence_length = None,None
# Set the parameter space.
# Lists are discrete candidate values; a 3-tuple such as 'dropout' is a Talos
# range (min, max, steps).
# NOTE(review): unlike the small `params` dict further down, this grid has no
# 'lr' entry and gives optimizers as strings rather than classes -- confirm
# which form `char_cnn_model_talos` expects.
opt_params ={'conv_output_space' : [8,16,32,64],
             'number_of_large_filters' : [1,2,3,4],
             'number_of_small_filters' : [1,2,3,4],
             # Key names spelled '..._length' to match the `params` dict used for the actual scan
             'large_filter_length' : [30,60,120],
             'small_filter_length' : [5,10,20],
             'pool_length' : [2,4,8,16],
             'dense_units_layer_3' : [32,16,8,4],
             'dense_units_layer_2' : [32,16,8,4],
             'batch_size': [8,16,32,64],
             'epochs': [200], # Use early stopping instead
             'dropout': (0, 0.5, 5),
             'conv_kernel_initializer': ['uniform','normal'],
             'conv_bias_initializer': ['uniform','normal'],
             'dense_kernel_initializer': ['uniform','normal'],
             'dense_bias_initializer': ['uniform','normal'],
             'optimizer': ['adam', 'nadam', 'rmsprop'],
             'loss': ['logcosh', 'binary_crossentropy'],
             'conv_activation':['relu', 'elu'],
             'dense_activation':['relu', 'elu'],
             'last_activation': ['sigmoid'],
             # Stationary parameters, i.e. do not get optimised
             'max_sentence_length':[max_sentence_length]
            }

"""
'sgd': SGD,
'rmsprop': RMSprop,
'adagrad': Adagrad,
'adadelta': Adadelta,
'adam': Adam,
'adamax': Adamax,
'nadam': Nadam
"""

def size_of_optimisation_space(opt_params):
    """
    Count the parameter permutations described by a Talos parameter dictionary.

    Parameters
    ----------
    opt_params : dict
        Maps each hyperparameter name to either a sequence of candidate values
        or a Talos range tuple ``(min, max, steps)``.

    Returns
    -------
    int
        Product of the number of candidate values over all hyperparameters
        (1 for an empty dictionary).
    """
    space = 1
    for candidates in opt_params.values():
        if isinstance(candidates, tuple) and len(candidates) == 3:
            # A 3-tuple is a range (min, max, steps): it contributes `steps`
            # values to the grid, not the 3 that len() would report.
            space *= int(candidates[2])
        else:
            space *= len(candidates)

    return space

# Number of permutations in 1% of the parameter grid (truncated to an int)
int(size_of_optimisation_space(opt_params)*0.01)

382205952

### Small

In [None]:
from keras.optimizers import Adam, Nadam

DATA_ROOT = Path("../data/") / "MJFF" / "preproc" # Note the relative path
# Load training data and auxiliary variables
X_train, X_test, y_train, y_test, max_sentence_length = \
create_training_data(DATA_ROOT,"EnglishData-preprocessed.csv",'sentence')
# Key every weight by the label it was computed for: the same ordered label
# list is passed to compute_class_weight and used for the dict keys, instead of
# pairing the weights with a hard-coded [0, 1].
class_labels = sorted(set(y_train))
class_weights = dict(zip(class_labels,
                         class_weight.compute_class_weight('balanced',
                                                           class_labels,
                                                           y_train)))

# Small search space: every entry has a single candidate except the learning
# rate range and the batch size.
params ={
         # Learning rate
         'lr': (0.1, 10, 2),
         'conv_output_space' : [4],#,8],
         'number_of_large_filters' : [2],
         'number_of_small_filters' : [2],
         'large_filter_length' : [60],
         'small_filter_length' : [5],
         'pool_length' : [2],
         'dense_units_layer_3' : [32],
         'dense_units_layer_2' : [16],
         'batch_size': [4,8],
         'epochs': [100],
         'dropout': [0.05],#,0.1,0.2],
         'conv_kernel_initializer': ['uniform'],
         'conv_bias_initializer': ['uniform'],
         'dense_kernel_initializer': ['uniform'],
         'dense_bias_initializer': ['uniform'],
         'optimizer': [Adam],
         'loss': ['binary_crossentropy'],
         'conv_activation':['relu'],
         'dense_activation':['relu'],
         'last_activation': ['sigmoid'],
         # Stationary parameters, i.e. do not get optimised
         'max_sentence_length':[max_sentence_length]
            }

### Run Talos

In [None]:
# The scan is given the pooled train + test data and holds out 20% itself
# (val_split=0.2).
X_all = vstack([X_train, X_test])
# list(...) forces concatenation of the two label collections: for NumPy
# arrays a bare `+` would add element-wise instead of appending.
y_all = asarray(list(y_train) + list(y_test)).reshape(-1, 1)

t = ta.Scan(X_all,
            y_all,
            model=char_cnn_model_talos,
            disable_progress_bar=False,
            params=params,
            val_split=0.2)
#             grid_downsample=0.01,  # Randomly samples 1% of the grid

In [10]:
# Get weights passed to activation function
from talos import Deploy, Predict
from haberrspd.charCNN.auxiliary_tf import binarize

In [8]:
# Wrap the finished Talos scan `t` in a Predict object
p = Predict(t)

In [None]:
# Predict for the first test sample after passing it through `binarize`
p.predict(binarize(X_test[0]))

In [None]:
# Print one prediction per test sample.
# NOTE(review): samples are passed as-is here, whereas the single-sample cell
# applies `binarize` first -- confirm which input form `predict` expects.
for x in X_test:
    print(p.predict(x))

In [39]:
from tensorflow import one_hot, float32, cast

In [43]:
def binarize_test(x, alphabet_size=49):
    """
    One-hot encode integer character indices as a float32 tensor.

    # XXX: change this to use a smaller dimensional representation of each character, see footnote 3 of paper `Character-aware neural language model`

    # Example from Torch: torch.nn.Embedding(big number, much smaller number)

    Parameters
    ----------
    x : array-like of int
        Character indices to encode (presumably one index per character of a
        sentence -- TODO confirm against the data loader)
    alphabet_size : int, optional
        Length of the one-hot axis. Defaults to 49, the value that used to be
        hard-coded here.

    Returns
    -------
    Tensor
        A float32 tensor whose last axis (length `alphabet_size`) is the
        one-hot encoding of each index
    """
    # TODO: double-check how this encoding actually prints out [ 0 0 0 0 1 ...] etc
    encoded = one_hot(x,
                      alphabet_size,
                      on_value=1,
                      off_value=0,
                      axis=-1)
    return cast(encoded, float32)  # TODO: check precision

In [44]:
# Encode the first test sample; this displays the tensor object, not its values
binarize_test(X_test[0])

<tf.Tensor 'Cast:0' shape=(14000, 49) dtype=float32>

In [30]:
# Last two weight arrays of the last entry in the scan's saved weights
# (presumably the final layer's kernel and bias -- TODO confirm)
t.saved_weights[-1][-2:]

[array([[-0.00666015],
        [-0.0045552 ],
        [ 0.00290709],
        [-0.00390608],
        [-0.02322032],
        [ 0.0098001 ],
        [-0.00305747],
        [ 0.01425729],
        [ 0.06003369],
        [-0.00109358],
        [-0.01998529],
        [-0.02026838],
        [-0.00320968],
        [-0.03552249],
        [-0.03317583],
        [ 0.00533802]], dtype=float32), array([0.03063761], dtype=float32)]

In [None]:
# Package the scan with Talos' Deploy under the name 'test_last_weights',
# using 'val_acc' as the metric
Deploy(t, metric='val_acc', model_name='test_last_weights')

In [None]:
# Scan results sorted by validation accuracy (highest first), showing only
# the accuracy, batch size and dropout columns
t.data.sort_values(by=['val_acc'],ascending=False)[['val_acc', 'batch_size','dropout']]

### Keras one-hot encoding

In [7]:
from haberrspd.charCNN.auxiliary_tf import create_mjff_data_objects
from pandas import read_csv

In [15]:
DATA_ROOT = Path("../data/") / "MJFF" / "preproc" # Note the relative path
df = read_csv(DATA_ROOT / "EnglishData-preprocessed.csv", header=0)  # MJFF data
# TODO: make absolutely sure that subject diagnoses are correctly ordered
# Unpack the per-subject documents, the diagnoses and the character alphabet
subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)
print("Length of alphabet is: {}".format(len(alphabet)))

Length of alphabet is: 49


In [71]:
# Store alphabet size
alphabet_size = len(alphabet)
print('Total number of characters:', alphabet_size)
alphabet_indices = dict((c, i) for i, c in enumerate(alphabet))
# indices_alphabet = dict((i, c) for i, c in enumerate(alphabet))

Total number of characters: 49


In [74]:
# Replace the tokenizer's vocabulary with the alphabet mapping.
# NOTE(review): `tk` is only created in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run.
tk.word_index = alphabet_indices

In [78]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [53]:
# Initialise
tk = Tokenizer(num_words=None, char_level=True)
# Fit to text
test = ['aaaaaaBBBB!"#$%&() ', 'I am a little teapot']
tk.fit_on_texts(test)

In [62]:
# Convert each toy string into its sequence of integer character ids
seqs = tk.texts_to_sequences(test)

In [64]:
# Padding
test_seqs =  pad_sequences(seqs, maxlen=40, padding='post')
test_seqs

array([[ 1,  1,  1,  1,  1,  1,  3,  3,  3,  3,  8,  9, 10, 11, 12, 13,
        14, 15,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  2,  1, 16,  2,  1,  2,  6,  5,  4,  4,  6,  7,  2,  4,  7,
         1, 17, 18,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [82]:
def create_training_data_keras(DATA_ROOT, 
                               data_string,
                               test_size=0.1,
                               random_state=None):
    """
    This function creates one-hot encoded character data for the document (=subject)
    classification model, as well as the sentence classification model. The functionality
    within is keras specific.

    Parameters
    ----------
    DATA_ROOT : pathlib.Path
        Location of the MJFF data folder (joined to `data_string` with `/`)
    data_string : str
        The .csv file that we want to analyse
    test_size : float, optional
        Fraction of the sentences held out as test data (default 0.1)
    random_state : int or None, optional
        Seed forwarded to `train_test_split`. The default (None) keeps the
        unseeded shuffle this function always used.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test, max_sentence_length)
    """
    assert isinstance(data_string, str)

    df = read_csv(DATA_ROOT / data_string, header=0)  # MJFF data
    subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)

    # Store alphabet size
    alphabet_size = len(alphabet)

    print('Total number of characters:', alphabet_size)
    # NOTE(review): the first character gets index 0, which is also the value
    # pad_sequences uses for padding -- confirm this overlap is intended.
    alphabet_indices = {char: position for position, char in enumerate(alphabet)}

    # Round the longest sentence UP to the next multiple of 1000. The previous
    # round(..., -3) rounded to the NEAREST thousand, so e.g. 14400 became 14000
    # and pad_sequences then cut the longest sentences short.
    longest_sentence = int(df.Preprocessed_typed_sentence.apply(len).max())
    max_sentence_length = ((longest_sentence + 999) // 1000) * 1000

    # Make training data array (flatten the per-subject lists of sentences)
    all_sentences = [sentence for document in subject_documents for sentence in document]

    # Initialise tokenizer which maps characters to integers
    # NOTE(review): the Tokenizer lower-cases its input by default, so upper-case
    # entries of `alphabet` are never looked up -- confirm the text is already
    # lower-case or that this is intended.
    tk = Tokenizer(num_words=None, char_level=True)

    # Fit to text: convert all chars to ints
    tk.fit_on_texts(all_sentences)

    # Update alphabet: use our own character -> index mapping instead of the fitted one
    tk.word_index = alphabet_indices

    # Get integer sequences: converts sequences of chars to sequences of ints
    int_sequences = tk.texts_to_sequences(all_sentences)

    # Pad sequences so that they all have the same length and then one-hot encode.
    # num_classes pins the one-hot width to the alphabet size rather than to the
    # largest index that happens to occur in the data.
    padded_sequences = pad_sequences(int_sequences, maxlen=max_sentence_length, padding='post')
    X = to_categorical(padded_sequences, num_classes=alphabet_size)

    # Get labels (diagnoses)
    # NOTE(review): labels are taken in `df` row order while X follows the
    # flattened `subject_documents` order -- confirm the two orders agree.
    y = df.Diagnosis.tolist()

    # Chop up data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        shuffle=True,
                                                        random_state=random_state)
    return X_train, X_test, y_train, y_test, max_sentence_length

In [83]:
# Rebuild the train/test split with the Keras-based loader; this overwrites the
# X_/y_ variables and max_sentence_length created by the earlier loader cells.
X_train, X_test, y_train, y_test, max_sentence_length = create_training_data_keras(DATA_ROOT, "EnglishData-preprocessed.csv")

Total number of characters: 49
