In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))


from haberrspd.preprocess import preprocessMJFF, preprocessMRC
from haberrspd.charCNN.data_utilities import create_training_data_keras, create_data_objects
                         
import pandas as pd
import re
import numpy as np
print(np.__version__)
from collections import Counter, defaultdict
import itertools
from operator import itemgetter
from scipy.stats import (gamma, lognorm, gengamma)

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

Using TensorFlow backend.


1.16.4


In [2]:
df = pd.read_csv(Path("../data") / 'MJFF' / 'preproc' / 'char' / 'SpanishData-preprocessed.csv', header=0)

In [3]:
X = defaultdict(list)
y = defaultdict(list)

In [4]:
for s in df.Patient_ID.unique():
    X[s] = df[(df.Patient_ID == s)]['Preprocessed_typed_sentence'].tolist()
    y[s] = df[(df.Patient_ID == s)]['Diagnosis'].unique().tolist()

In [5]:
subject_documents, subject_locations, subjects_diagnoses, alphabet = create_data_objects(df)

In [6]:
from math import ceil
def roundup(x):
    return int(ceil(x / 100.0)) * 100
all_sentences = [item for sublist in subject_documents for item in sublist]
max_sentence_length = roundup(max([len(s) for s in all_sentences]))
# Store alphabet size
alphabet_size = len(alphabet)
print("Total number of characters used in all typed sentences:", alphabet_size)
alphabet_indices = dict((c, i) for i, c in enumerate(alphabet))

Total number of characters used in all typed sentences: 54


In [7]:
# Initialise tokenizer which maps characters to integers
tk = Tokenizer(num_words=None, char_level=True)
# Fit to text: convert all chars to ints
tk.fit_on_texts(all_sentences)
# Update alphabet
tk.word_index = alphabet_indices

In [8]:
X_int = defaultdict(list)
for s in df.Patient_ID.unique():
    # Get integer sequences: converts sequences of chars to sequences of ints
    X_int[s] = tk.texts_to_sequences(" ".join(X[s]))

In [9]:
XX = defaultdict(list)
yy = defaultdict(list)
for i,s in enumerate(df.Patient_ID.unique()):
    XX[i] = to_categorical(X_int[s])
    yy[i] = y[s][0]

In [10]:
XX[0].shape

(3443, 54)

In [24]:
XX.keys()

dict_keys([100, 110, 121, 130, 132, 134, 142, 147, 148, 153, 154, 157, 159, 164, 165, 166, 170, 171, 172, 173])

In [27]:
y.keys()

dict_keys([100, 110, 121, 130, 132, 134, 142, 147, 148, 153, 154, 157, 159, 164, 165, 166, 170, 171, 172, 173])

In [40]:
data = defaultdict(list)
for s in df.Patient_ID.unique():
    data[s] = (XX[s],y[s][0])

----

### DL

In [11]:
from numpy.random import randint
class DataHandler(object):
    
    def __init__(self, X, y, window_len, n_samp):
        self.X = X
        self.y = y
        self.window_len = window_len
        self.n_samp=n_samp
        
        
    def get_sample(self,idx):
        x_len = self.X[idx].shape[0]
        window_start = randint(0,x_len - self.window_len)
        window_end = window_start + self.window_len
        return self.X[idx][window_start:window_end], self.y[idx]
    
    
    def gen_epoch(self):
        X_epoch = []
        y_epoch = []
        
        for idx in range(len(self.X)):
            
            for s in range(self.n_samp):
                x,y = self.get_sample(idx)
                X_epoch.append(x)
                y_epoch.append(y)
                
                
                
        return np.asarray(X_epoch), np.asarray(y_epoch)
            


In [12]:
XX_list = []
yy_list = []

for key, _ in XX.items():
    XX_list.append(XX[key])
    yy_list.append(yy[key])

In [113]:
train_datahandler = DataHandler(X=XX_list[4:], y=yy_list[4:],window_len = 60,n_samp = 3)
val_datahandler = DataHandler(X=XX_list[:4], y=yy_list[:4],window_len = 60,n_samp = 100)

In [102]:
X, y = train_datahandler.gen_epoch()

In [103]:
X.shape

(6400, 60, 54)

In [114]:
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
model = load_model('EXPERIMENTAL_MODE.h5')

In [115]:
model.input

<tf.Tensor 'input_1_16:0' shape=(?, 60, 54) dtype=float32>

In [118]:
K.set_value(model.optimizer.lr, 1e-3)
K.get_value(model.optimizer.lr)

0.001

In [107]:
model.loss

'binary_crossentropy'

In [60]:
test1 = np.ones_like(X)
test2 = np.zeros_like(X)
X_test = np.concatenate([test1,test2])

y_test = np.concatenate([np.ones_like(y),np.zeros_like(y)])

In [119]:


n_epochs = 100
X_val, y_val = val_datahandler.gen_epoch()
X, y = train_datahandler.gen_epoch() 
for epoch in range(n_epochs):
    
    model.fit(X,y, validation_data=(X_val,y_val),batch_size=32,epochs = 1,shuffle = True )
    


Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 48 samples, validate on 400 samples
Train on 4

In [95]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

# =====================Char CNN=======================
# parameter
input_size = 1014
vocab_size = len(tk.word_index)
embedding_size = 54
conv_layers = [[256, 7, 3],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]
num_of_classes = 4
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'



# Model Construction
# Input
input_shape = (100,54)
x = Input(shape=input_shape, name='input', dtype='float32')  # shape=(?, 1014)
# Embedding

# Conv
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)  # Final shape=(None, 34, 256)
x = Flatten()(x)  # (None, 8704)
# Fully connected layers
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)  # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(num_of_classes, activation='softmax')(x)
# Build model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])  # Adam, categorical_crossentropy
model.summary()

ValueError: Graph disconnected: cannot obtain value for tensor Tensor("input_4:0", shape=(?, 100, 54), dtype=float32) at layer "input". The following previous layers were accessed without issue: []

In [91]:
model.input

<tf.Tensor 'input_1:0' shape=(?, 1014) dtype=int64>