### Basic Imports

In [58]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import time

### Load Data

In [59]:
import pandas as pd

patients = pd.read_csv('internacoes_charlson_zero.csv.gz', compression='gzip', nrows=None)
target = patients['target'].values
patients.shape

(48907, 9)

### Split a Smaller Set

In [60]:
from sklearn.model_selection import StratifiedKFold

split_kfold = StratifiedKFold(n_splits=2, shuffle=True)
for trash, used in split_kfold.split(patients.index.values, target):
    break
    
target_set = np.asarray(patients.iloc[used]['target'].values)
text_set = patients.iloc[used]['text'].values

print('Data Size:', len(used))
print('Mean Tokens:', np.mean(patients.iloc[used]['wc'].values))

Data Size: 24459
Mean Tokens: 6020.543889774725


### Load Word2Vec Model

In [61]:
from gensim.models.word2vec import KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format('health_w2v_unigram_150.bin', binary=True)
len(w2v_model.vocab)

67805

### Tokenize Clinical Notes
Remove accents and stopwords, It take a while...

In [62]:
import unicodedata
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def tokenizer(text):
    return_text = []
    sw_port = stopwords.words("portuguese")
    for sentence in text:
        reg_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        tokens = reg_tokenizer.tokenize(sentence)
        return_text.append(' '.join([remove_accents(w.lower()) for w in tokens if w not in sw_port]))
        
    return return_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/grupopln/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
start = time.time()
tokens_set = tokenizer(text_set)
print('Takes ', round(time.time() - start), ' s for', len(used), ' instances')

Takes  485  s for 24459  instances


### Text Vector Representation

In [64]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

start = time.time()

max_words = len(w2v_model.vocab)
tokenize = Tokenizer(num_words=max_words)
tokenize.fit_on_texts(tokens_set)

max_length = 5000
sequences = tokenize.texts_to_sequences(tokens_set)
data_matrix = pad_sequences(sequences, maxlen=max_length)

print('Takes ', round(time.time() - start), ' s for', len(used), ' instances')

Takes  113  s for 24459  instances


In [65]:
len(data_matrix[0][data_matrix[0]!=0]), len(tokens_set[0])

(393, 2390)

### Setup Words Weights for Embedding Layer

In [66]:
vocab_dim = len(w2v_model.word_vec('0'))
word_index = tokenize.word_index
n_symbols = min(max_words, len(word_index))+1

embedding_weights = np.zeros((n_symbols, vocab_dim))
for word, i in word_index.items():
    if i >= n_symbols: break
    if word in w2v_model.vocab:
        embedding_weights[i] = w2v_model.word_vec(word)

print('Symbols', n_symbols)
print('Weights', embedding_weights.shape)

Symbols 67806
Weights (67806, 150)


### Setup RNN Layers

In [67]:
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Conv1D, Bidirectional, Flatten, MaxPooling1D, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Sequential

conv_0 = Conv1D(50, 3, activation='relu', name='conv0')

lstm_0 = LSTM(units=50, recurrent_activation="hard_sigmoid", activation="relu", name='lstm0')

bi_lstm_0 = Bidirectional(lstm_0, name='bilstm0')

embedding_layer = Embedding(embedding_weights.shape[0],
                            embedding_weights.shape[1],
                            weights=[embedding_weights],
                            input_length=max_length)

### Create Model, Train and Evaluate

In [None]:
print ('Defining a RNN Model...')
   
kfold = StratifiedKFold(n_splits=6)
cvscores = []
times = []
values = []
predictions = []

print('Data Shape', data_matrix.shape)
print('Weights', embedding_weights.shape)

for i, (train, test) in enumerate(kfold.split(data_matrix, target_set)):
    
    if i > 2: break
    
    start = time.time()
    print('Creating model...')
    # create model
    model = Sequential()
    model.add(embedding_layer)
    
    #model.add(Conv1D(128, 5, activation='relu'))
    #model.add(MaxPooling1D(2))
    #model.add(Conv1D(128, 5, activation='relu'))
    #model.add(MaxPooling1D(5))
    
    #model.add(Conv1D(50, 3, activation='relu'))
    #model.add(lstm_0)
    #model.add(bi_lstm_0)
    model.add(Flatten())
    
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.5))    
    model.add(Dense(1))
    model.add(Activation('relu'))
    
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='sgd',metrics=['mae'])
    
    # Fit the model
    model.fit(data_matrix[train], target_set[train], epochs=20, batch_size=100, verbose=1)

    # evaluate the model
    scores = model.evaluate(data_matrix[test], target_set[test], verbose=0)
    target_pred = model.predict(data_matrix[test])
    
    values.extend(target_set[test])
    predictions.extend(target_pred)
    
    print(model.metrics_names[1], scores[1])
    cvscores.append(scores[1])
    times.append(time.time() - start)
    
print('Mean: ', np.mean(cvscores), 'Std: ', np.std(cvscores))
print('Time: ', np.mean(times))

Defining a RNN Model...
Data Shape (24459, 5000)
Weights (67806, 150)
Creating model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

### Regroup Values

In [None]:
import gzip, pickle

values = np.reshape(values,len(values))
predictions = np.reshape(predictions,len(predictions))

tuples = np.stack((values,np.round(predictions,1)), axis=-1) ## ROUND TUPLES

with gzip.open("tuples.pkl.gz", "wb") as wfp:   #Pickling
    pickle.dump(tuples, wfp)
    wfp.close()

In [None]:
with gzip.open("tuples.pkl.gz", "rb") as wfp:   #Pickling
    tuples = pickle.load(wfp)
    wfp.close()
    
frequencies = {}
for x,y in tuples:
    key = (x, y)
    if key in frequencies:
        frequencies[key] += 1
    else:
        frequencies[key] = 1

size = []
x = []
y = []
for key in frequencies.keys():
    size.append(frequencies[key])
    x.append(key[0])
    y.append(key[1])

### Plot Scatter

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

maxX = np.max(x)

plt.figure(figsize=(8, 4), dpi= 300)
plt.xlabel('Charlson Value', fontsize=10)
plt.ylabel('Prediction Value', fontsize=10)
plt.xticks(range(int(maxX)+1))
plt.yticks(range(int(maxX)+1))

plt.scatter(x,y,s=size)
plt.show()