In [1]:
import re
from random import shuffle

import cufflinks
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.utils import shuffle

import classifierutils
import dataread

STOPWORDS = set(stopwords.words('english'))

InteractiveShell.ast_node_interactivity = 'all'

cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [2]:
"""Variables"""

# headers = ['allergies', 'family_history', 'history_illness', 'social_history']
headers = dataread.read_file('top_sectionheaders_50000.txt')
no_sections = 5000

"""Import data"""

# header --> [header, original, tokenized, tokenized_labelled]
header_corpus = {}

for header in headers:
    header_corpus[header] = {}
    header_corpus[header]['label'] = header
    temp = dataread.read_file('section/'+header.replace(' ',
                                                        '_')+str(no_sections)+'.txt')
    new = []
    for sample in temp:
        kek = re.sub("\[.*?\]", "", sample)
        new.append(kek)

    header_corpus[header]['original'] = pd.Series(
        new
    )
    temp = classifierutils.corpus_preprocess(
        header_corpus[header]['original'],
        header
    )
    header_corpus[header]['labelled'] = temp[0]
    header_corpus[header]['labelled_tokenised'] = temp[1]

mixed_labelled = pd.DataFrame()
for value in header_corpus.values():
    mixed_labelled = mixed_labelled.append(value['labelled'])

mixed_labelled_tokenised = pd.DataFrame()
for value in header_corpus.values():
    mixed_labelled_tokenised = mixed_labelled_tokenised.append(
        value['labelled_tokenised'])

mixed_labelled = shuffle(mixed_labelled)

'Variables'

'Import data'

In [3]:
mixed_labelled.HEADER.value_counts()

history of present illness              4707
past medical history                    4674
allergies                               4556
social history                          4302
discharge medications                   4080
medications on admission                3947
chief complaint                         3854
family history                          3853
discharge disposition                   3593
brief hospital course                   3541
major surgical or invasive procedure    3526
Name: HEADER, dtype: int64

In [4]:
mixed_labelled['HEADER'].value_counts().sort_values(ascending=False).iplot(kind='bar', yTitle='Number of Samples', 
                                                                title='Number of samples in each section header')

In [5]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(mixed_labelled['TEXT'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 28887 unique tokens.


In [9]:
from keras.models import load_model

model = load_model("model/LTSM_RNN_2EP_91.h5")

In [7]:
test = ["""No Known Drug Allergies"""]
seq = tokenizer.texts_to_sequences(test)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = sorted(headers)
print(labels)
print(pred, labels[np.argmax(pred)])

['allergies', 'brief hospital course', 'chief complaint', 'discharge disposition', 'discharge medications', 'family history', 'history of present illness', 'major surgical or invasive procedure', 'medications on admission', 'past medical history', 'social history']
[[9.9986386e-01 2.0139732e-06 5.0663428e-05 3.2847498e-07 7.6123565e-06
  7.2473140e-06 2.8661591e-06 3.6428909e-07 2.0528856e-05 3.3623091e-05
  1.0852346e-05]] allergies


In [17]:
import dataclass
import importlib
dataclass = importlib.reload(dataclass)
samples = pd.DataFrame()
for value in header_corpus.values():
    samples = samples.append(value['labelled'])

data = dataread.read_samples(2)
data_class = []
for item in data:
    data_class.append(
        dataclass.Sample(item)
    )

labels = sorted(headers)

for item in data_class:
    item.paragraph_classify(model, tokenizer, labels)

export = []
for item in data_class:
    item.paragraph_classify_indexed()
    print(item.paragraph_classifed_print())
    export.append(item.paragraph_classifed_print())

dataread.save_array(export, 'test2.txt')
    

# labels = ['Credit reporting, credit repair services, or other personal consumer reports', 'Debt collection', 'Mortgage', 'Credit card or prepaid card', 'Student loan', 'Bank account or service', 'Checking or savings account', 'Consumer Loan', 'Payday loan, title loan, or personal loan', 'Vehicle loan or lease', 'Money transfer, virtual currency, or money service', 'Money transfers', 'Prepaid card']
print(pred, labels[np.argmax(pred)])

# corpus = []
# for()

AttributeError: '_UserObject' object has no attribute 'predict'

In [16]:
# Load Model
import os
import tempfile

from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf

model = tf.saved_model.load('model/alpha')

