<a href="https://colab.research.google.com/github/nfrn/Deep-Learning-for-Health-Text-Mining/blob/master/Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clinical Text Classification

In [None]:
! pip install requests
! pip install pandas
! pip install nltk
! pip install matplotlib
! pip install Keras
! pip install tensorflow
! pip install scikit-learn
! pip install keras-tqdm
! pip install sklearn-crfsuite
! pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
import nltk
# word_tokenize (used when cleaning the labels) needs the Punkt tokenizer data.
# Newer NLTK releases look it up as 'punkt_tab', older ones as 'punkt'; asking
# for both keeps the notebook working on either. A resource unknown to the
# installed version is only reported by nltk.download, not raised.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
pip install --upgrade tensorflow-addons

## Table of Contents:
* Description of the Data
* Creation of the Dataset
* Naive Bayes
* Support Vector Machines
* Multi-Layer-Perceptron
* Convolutional Neural Networks
* Recurrent Neural Networks
* Contest. Who can build the best Model?

## Dataset Open-i:
### Description:
* Open-i service of the National Library of Medicine enables search and retrieval of abstracts and images (including charts, graphs, clinical images, etc.) from the open source literature, and biomedical image collections. Searching may be done using text queries as well as query images.
* Open-i provides access to over 3.7 million images from about 1.2 million PubMed Central® articles; 7,470 chest x-rays with 3,955 radiology reports; 67,517 images from NLM History of Medicine collection; and 2,064 orthopedic illustrations.
* Link to the website: https://openi.nlm.nih.gov/

### Data Visualization:
# ![ola](https://github.com/nfrn/Deep-Learning-for-Health-Text-Mining/blob/master/Images/example.png?raw=1)

### Download Reports XML:

In [None]:
import requests
url = "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz"
filename = url.split("/")[-1]
# Stream the archive to disk in chunks instead of holding it all in memory.
# The request is made (and its status checked) before the output file is
# opened, so a failed download does not leave an empty or HTML-filled .tgz.
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)
print("Finished")

#### After the script finishes, a new file `NLMCXR_reports.tgz` should be visible in your working directory

### Extract files:

In [None]:
import tarfile
fname="NLMCXR_reports.tgz"
# The context manager closes the archive even if extraction fails.
with tarfile.open(fname) as tar:
    if hasattr(tarfile, "data_filter"):
        # Downloaded archive: refuse members that would escape the target
        # directory (extraction filters exist on recent Python versions only).
        tar.extractall(filter="data")
    else:
        tar.extractall()
print("Finished")

#### After the script finishes, a new folder is created: "./ecgen-radiology" with all the .xml files

### Visualize XML files:

In [None]:
from pygments import highlight
from pygments.lexers import XmlLexer
from xml.dom import minidom

from pygments.formatters import HtmlFormatter
import IPython
from IPython.core.display import HTML

def display_xml_nice(xml_element):
    """Render an XML DOM node in the notebook as syntax-highlighted HTML.

    xml_element: object exposing ``toprettyxml`` (e.g. a minidom document).
    The Pygments style sheet for the ``.highlight`` class is emitted together
    with the highlighted markup so the output is self-contained.
    """
    html_formatter = HtmlFormatter()
    pretty_xml = xml_element.toprettyxml(indent='  ', newl="")
    css_rules = html_formatter.get_style_defs('.highlight')
    highlighted = highlight(pretty_xml, XmlLexer(), html_formatter)
    page = '<style type="text/css">{}</style>    {}'.format(css_rules, highlighted)
    IPython.display.display(HTML(page))
    
# Parse one of the extracted reports and show it as an example of the XML layout.
domf = minidom.parse('./ecgen-radiology/1.xml')
display_xml_nice(domf)

### Important XML Tags
* AbstractText Label="COMPARISON"
* AbstractText Label="INDICATION"
* AbstractText Label="FINDINGS"
* AbstractText Label="IMPRESSION"
* MeSH: Medical Subject Headings is the NLM controlled vocabulary thesaurus used for indexing articles for PubMed.

### Create Dataset:

In [None]:
import glob
import re
import os.path
import pandas as pd
import numpy as np
import xml.dom.minidom as minidom

def getText(file):
    """Return the clinical report text contained in an XML document string.

    The text between the first ``<Abstract>`` and the first ``</Abstract>`` is
    taken, the AbstractText markup is stripped (each section label stays in
    front of its text) and the result is flattened onto one line.

    file: the XML document serialized as a string.
    Returns the cleaned text, or "NO ABSTRACT" (after printing a notice) when
    either tag is missing.
    """
    start = file.find("<Abstract>")
    end = file.find("</Abstract>")
    if start == -1 or end == -1:
        print("No abstract")
        return "NO ABSTRACT"

    report = file[start:end]
    # Remove the markup around each section, keeping its label and text
    for markup in ("<Abstract>", "<AbstractText Label=", "</AbstractText>"):
        report = report.replace(markup, "")
    # Collapse runs of spaces (done before the remaining punctuation is handled)
    report = re.sub(" +", " ", report)
    # Drop the quotes around labels, turn the tag terminator into a separator
    # and join the lines
    report = report.replace('"', "").replace('>', " ").replace('\n', "")
    return report

def getLabels(doc):
    """Collect the MeSH terms of a parsed XML document.

    doc: DOM document providing ``getElementsByTagName`` (e.g. from minidom).
    Returns a list with the serialized content of every child of every
    ``MeSH`` element, with the ``automatic``/``major`` tags and newlines
    removed. Entries that still contain two consecutive spaces are skipped.
    """
    labels = []
    for mesh_node in doc.getElementsByTagName('MeSH'):
        for child in mesh_node.childNodes:
            term = child.toxml()
            for token in ("<automatic>", "</automatic>", "<major>", "</major>", "\n"):
                term = term.replace(token, "")
            # Indentation-only text nodes are left as runs of spaces: skip them
            if "  " in term:
                continue
            labels.append(term)

    return labels

def xmlToDF():
    """Build dataset.csv from the extracted XML reports.

    Every ``./ecgen-radiology/*.xml`` file contributes one row holding its
    MeSH labels ("Labels", a list) and its report text ("Report"). The rows
    are collected first and the DataFrame is created once, rather than
    growing an empty frame cell by cell with list values.
    """
    rows = []
    # Sorted so the row order (and therefore every later train/test split)
    # does not depend on the order in which the OS lists the directory.
    for path in sorted(glob.glob("./ecgen-radiology/*.xml")):
        doc = minidom.parse(path)
        rows.append({"Labels": getLabels(doc), "Report": getText(doc.toxml())})

    df = pd.DataFrame(rows, columns=["Labels", "Report"])
    df.to_csv("dataset.csv",index=False)
    print("Finished")

xmlToDF()

### Visualize Draft Dataset:

In [None]:
import pandas as pd
from IPython.display import display_html
# Preview the first 100 rows of the dataset as an inline, captioned HTML table.
df = pd.read_csv("dataset.csv", nrows=100)
df2_styler = df.reset_index(drop=True).style
df2_styler = df2_styler.set_table_attributes("style='display:inline'")
df2_styler = df2_styler.set_caption('Dataset Entries')
display_html(df2_styler._repr_html_(), raw=True)

### Polish Draft Dataset:

In [None]:
from IPython.display import display_html
from nltk.tokenize import word_tokenize

def labelsprocessing():
    """Normalize the raw MeSH label strings stored in dataset.csv.

    Each "Labels" value (the text form of a Python list) loses its brackets,
    has slashes, commas and quotes turned into spaces, is tokenized with NLTK,
    lower-cased and re-joined with single spaces. dataset.csv is overwritten
    and the resulting table is displayed.
    """
    df = pd.read_csv("dataset.csv")
    cleaned = []
    # An empty cell is read back as NaN; treat it as "no labels" rather than
    # letting re.sub fail on a float (e.g. when this cell is run a second time).
    for raw in df["Labels"].fillna("").astype(str):
        label = re.sub(r"[\[\]]", "", raw)
        label = re.sub(r"[/,']", " ", label)
        # NOTE(review): pattern kept from the original; it looks like it was
        # meant to collapse spaces, but as written it matches a literal "{ }".
        label = re.sub(r"{ }+ ", " ", label)
        label = re.sub(r" +", " ", label)
        cleaned.append(" ".join(token.lower() for token in word_tokenize(label)))
    df["Labels"] = cleaned

    # Visualize the dataset
    df.to_csv("dataset.csv",index=False)
    df_styler = df.reset_index(drop=True).style.set_table_attributes("style='display:inline'").set_caption('Dataset Entries')
    display_html(df_styler._repr_html_(), raw=True)

labelsprocessing()


In [None]:
from IPython.display import display_html

# Output column -> MeSH terms whose presence in a report's label string sets
# that column to 1. 'normal' is special-cased below (exact match only).
ITEMS = {'No findings':['normal'],
             'Enlarged Cardiomediastinum': ['enlarged mediastinum'],
             'Cardiomegaly': ['cardiomegaly'],
             'Airspace Opacity': ['opacity'],
             'Lung Lesion': ['lung'],
             'Edema': ['edema','edemas'],
             'Consolidation': ['consolidation'],
             'Pneumonia': ['pneumonia'],
             'Atelectasis': ['atelectasis'],
             'Pneumothorax': ['pneumothorax','hydropneumothorax'],
             'Pleural Effusion': ['pleural effusion','pleural effusions'],
             'Pleural Other': ["pleural thickening",'pleural diseases'],
             'Fracture': ['fracture','fractures'],
             'Support Devices': ['medical device']}

def transformToLabels():
    """Add one 0/1 column per ITEMS key to dataset.csv.

    A column is set to 1 when the report's "Labels" string contains one of the
    key's terms; the 'normal' term must equal the whole string. dataset.csv is
    overwritten and the resulting table is displayed.
    """
    df = pd.read_csv("dataset.csv")

    # A report with an empty label string is read back as NaN; without this,
    # str.contains returns NaN for it and the boolean mask below is rejected.
    mesh = df["Labels"].fillna("")

    for label, codes in ITEMS.items():
        df[label] = 0
        for code in codes:
            if code == "normal":
                hit = mesh == "normal"
            else:
                # Plain substring test: the terms are not regular expressions
                hit = mesh.str.contains(code, regex=False)
            df.loc[hit, label] = 1

    # Visualize the dataset
    df.to_csv("dataset.csv",index=False)
    df_styler = df.reset_index(drop=True).style.set_table_attributes("style='display:inline'").set_caption('Dataset Entries')
    display_html(df_styler._repr_html_(), raw=True)


transformToLabels()


In [None]:
# Instances whose pathology labels are all 0 should have the "No findings" label set to 1.
from IPython.display import display_html
import pandas as pd

# The 14 label columns created by the previous step.
LABELS = ['No findings','Enlarged Cardiomediastinum','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']


def noFindings():
    """Mark reports without any positive label as 'No findings'.

    Rows of dataset.csv whose LABELS columns sum to 0 get 'No findings' = 1.
    The free-text "Labels" column is then dropped, dataset.csv is overwritten
    and the resulting table is displayed.
    """
    # One read and one vectorized assignment replace the second read of the
    # same file and the row-by-row loop.
    df2 = pd.read_csv("dataset.csv")
    unlabeled = df2[LABELS].sum(axis=1) == 0
    df2.loc[unlabeled, 'No findings'] = 1

    # Visualize the dataset
    # errors="ignore": "Labels" is already gone when this cell is run again
    df2 = df2.drop(columns=['Labels'], errors="ignore")
    df2.to_csv("dataset.csv",index=False)
    df_styler = df2.reset_index(drop=True).style.set_table_attributes("style='display:inline'").set_caption('Dataset Entries')
    display_html(df_styler._repr_html_(), raw=True)

noFindings()

        

### Visualize Final Dataset:

In [None]:
import pandas as pd
from IPython.display import display_html
# Preview the first 100 rows of the final dataset as an inline, captioned HTML table.
df = pd.read_csv("dataset.csv", nrows=100)
df2_styler = df.reset_index(drop=True).style
df2_styler = df2_styler.set_table_attributes("style='display:inline'")
df2_styler = df2_styler.set_caption('Dataset Entries')
display_html(df2_styler._repr_html_(), raw=True)

### Visualize Dataset Statistics:

#### Visualize the Class Imbalance

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# The 14 label columns of dataset.csv.
LABELS = ['No findings','Enlarged Cardiomediastinum','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

def visualizeClassImbalance():
    """Plot the number of positive instances of each label as a bar chart."""
    df = pd.read_csv("dataset.csv",usecols =LABELS)
    # Summing the columns gives a Series indexed by label name, so the bars are
    # annotated with the labels rather than with row numbers 0..13.
    counts = df[LABELS].sum()

    counts.plot(kind='bar', legend=False, grid=True, figsize=(8, 5))
    plt.title("Number of instances per label")
    plt.ylabel('# of Occurrences', fontsize=12)
    plt.xlabel('Label', fontsize=12)
    plt.show()

visualizeClassImbalance()

#### Visualize Reports Length

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.ticker import FuncFormatter
def visualizeReportsLength():
    """Plot a histogram of the report lengths, measured in whitespace-separated words."""
    df = pd.read_csv("dataset.csv",usecols =['Report'])
    # A missing report counts as length 0 instead of propagating NaN
    tlen = df['Report'].fillna('').str.split().str.len().to_numpy()

    plt.subplots()
    # One unit-wide bin per length. The edges must reach past the maximum:
    # with edges stopping at max-1 the longest reports fall outside the
    # histogram range and are silently left out.
    bins = np.arange(tlen.max() + 2) if tlen.size else 1
    plt.hist(tlen, bins=bins, histtype='barstacked', linewidth=2)
    plt.title("Length of reports")
    plt.ylabel('# of Instances', fontsize=12)
    plt.xlabel('Length of reports', fontsize=12)
    plt.show()

visualizeReportsLength()

### Preprocess Dataset:

#### Extract the clinical text reports

In [None]:
from keras.preprocessing.text import text_to_word_sequence
import pandas as pd

def prepareTextFeatures():
    """Return the reports of dataset.csv as normalized strings.

    Each report is split with Keras' text_to_word_sequence and rebuilt with a
    single space in front of every token. The first processed report is
    printed as a sample.
    """
    reports = pd.read_csv("dataset.csv", usecols=['Report']).values
    processeddocs = [
        ''.join(' ' + token for token in text_to_word_sequence(row[0]))
        for row in reports
    ]

    print(processeddocs[0])
    return processeddocs
corpus = prepareTextFeatures()

### TF-IDF = short for term frequency–inverse document frequency:
* Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length: $TF(t,d) = \frac{Count(t)}{Length(d)}$.
* IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, certain terms, such as "is", "of", and "that", may appear many times but carry little importance. Thus we need to weigh down the frequent terms and scale up the rare ones, by computing the following: 
$IDF(t,d) = \ln{\frac{Count(d)}{Count(d, t \in d)}}$.
* TF-IDF: Combines both approaches, by computing the following: ${TF-IDF}(t,d) = TF(t,d) \cdot IDF(t,d)$



### Auc-ROC Metric
* An important performance measure for classification problems, including multi-class and multi-label ones.
* It tells how well the model is able to distinguish between classes. The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s.
* By analogy, the higher the AUC, the better the model is at distinguishing between patients with the disease and patients without it.
* You can understand more about this metric in the following article: https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

### Attempt 1 Naive Bayes:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import text
import pandas as pd

# The 13 labels evaluated here ('Enlarged Cardiomediastinum', also a column of
# dataset.csv, is not among them).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values

ngram=4
minf=0.05
maxf=0.95
# Passed as a list: newer scikit-learn releases validate stop_words and accept
# only 'english', a list or None (ENGLISH_STOP_WORDS itself is a frozenset).
stopwords = list(text.ENGLISH_STOP_WORDS)

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',stop_words=stopwords,
                                          strip_accents='ascii', lowercase=True, ngram_range=(1, ngram),
                                          min_df=minf, max_df=maxf)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

total=0
for category in LABELS:
    print('... Processing {}'.format(category))
    # One binary model per label, trained on the report text
    NB_pipeline.fit(X_train, train[category])
    # The ROC curve needs a ranking score, not thresholded 0/1 predictions:
    # use the probability of the positive class (second column). This also
    # makes the AUC comparable with the neural models, which are scored on
    # their sigmoid outputs.
    prediction = NB_pipeline.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(test[category], prediction)
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

print("Macro Average AUC:" + str(total/len(LABELS)))

### Attempt 2 Support Vector Machines (Linear Kernel):

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import text
import pandas as pd

# The 13 labels evaluated here ('Enlarged Cardiomediastinum', also a column of
# dataset.csv, is not among them).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values

ngram=4
minf=0.05
maxf=0.95
# Passed as a list: newer scikit-learn releases validate stop_words and accept
# only 'english', a list or None (ENGLISH_STOP_WORDS itself is a frozenset).
stopwords = list(text.ENGLISH_STOP_WORDS)

# The variable keeps the NB_pipeline name of the previous cell; the classifier
# here is a linear-kernel SVM.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',stop_words=stopwords,
                                          strip_accents='ascii', lowercase=True, ngram_range=(1, ngram),
                                          min_df=minf, max_df=maxf)),
                ('clf', OneVsRestClassifier(svm.SVC(kernel='linear',gamma='scale')))])

total=0
for category in LABELS:
    print('... Processing {}'.format(category))
    # One binary model per label, trained on the report text
    NB_pipeline.fit(X_train, train[category])
    # The ROC curve needs a ranking score, not thresholded 0/1 predictions:
    # use the signed distance to the separating hyperplane (SVC offers no
    # probabilities unless probability=True).
    prediction = NB_pipeline.decision_function(X_test)
    fpr, tpr, _ = roc_curve(test[category], prediction)
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

print("Macro Average AUC:" + str(total/len(LABELS)))

### Attempt 3 Support Vector Machines (Radial Kernel):

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import text
import pandas as pd

# The 13 labels evaluated here ('Enlarged Cardiomediastinum', also a column of
# dataset.csv, is not among them).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values

ngram=4
minf=0.05
maxf=0.95
# Passed as a list: newer scikit-learn releases validate stop_words and accept
# only 'english', a list or None (ENGLISH_STOP_WORDS itself is a frozenset).
stopwords = list(text.ENGLISH_STOP_WORDS)

# The variable keeps the NB_pipeline name of the earlier cells; the classifier
# here is an RBF-kernel SVM.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',stop_words=stopwords,
                                          strip_accents='ascii', lowercase=True, ngram_range=(1, ngram),
                                          min_df=minf, max_df=maxf)),
                ('clf', OneVsRestClassifier(svm.SVC(kernel='rbf',gamma='scale')))])

total=0
for category in LABELS:
    print('... Processing {}'.format(category))
    # One binary model per label, trained on the report text
    NB_pipeline.fit(X_train, train[category])
    # The ROC curve needs a ranking score, not thresholded 0/1 predictions:
    # use the SVM decision value (SVC offers no probabilities unless
    # probability=True).
    prediction = NB_pipeline.decision_function(X_test)
    fpr, tpr, _ = roc_curve(test[category], prediction)
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

print("Macro Average AUC:" + str(total/len(LABELS)))

### Attempt 4 Multi Layer Perceptron:

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from keras_tqdm import TQDMNotebookCallback
from sklearn.metrics import roc_curve, auc
# The 13 target columns of dataset.csv predicted by this model, in output order
# (the network's last layer has 13 units).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

def create_model():
    """Build and compile the multi-layer perceptron.

    Input: vectors of length 100. Hidden stack: tanh Dense layers of
    256-512-1024-512-256 units, each followed by 30% dropout. Output: 13
    sigmoid units, trained with binary cross-entropy and Adam. The model
    summary is printed before the model is returned.
    """
    model = Sequential()
    # First hidden layer also declares the input size
    model.add(Dense(256, input_dim=(100), activation='tanh'))
    model.add(Dropout(0.3))
    for units in (512, 1024, 512, 256):
        model.add(Dense(units, activation='tanh'))
        model.add(Dropout(0.3))
    # One independent probability per label
    model.add(Dense(13, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values
Y_train = train[LABELS].values
Y_test = test[LABELS].values


# NOTE(review): the vocabulary is built from all reports, test split included.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Report"].values)

print(X_train[0])
# Reports become sequences of word ids, cut or zero-padded at the end to 100 ids
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=100, padding='post')

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=100, padding='post')

model = create_model()

model.fit(X_train, Y_train,batch_size=128,validation_split=0.2, epochs=10,verbose=1)


# Per-label ROC AUC on the held-out split, from the sigmoid outputs
total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

# Average over however many labels were evaluated, not a hard-coded count
print("Macro Average AUC:" + str(total/len(LABELS)))

### Attempt 5 Convolution Network

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Input, Embedding
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_curve, auc
# The 13 target columns of dataset.csv predicted by this model, in output order
# (the network's last layer has 13 units).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

def create_model(voc):
    """Build and compile the 1-D convolutional text classifier.

    voc: vocabulary size handed to the Embedding layer.
    Sequences of 100 word ids are embedded in 64 dimensions and passed through
    two Conv1D(128, 3) + max-pooling stages and a 128-unit dense layer.
    Output: 13 sigmoid units, trained with binary cross-entropy and Adam.
    The model summary is printed before the model is returned.
    """
    tokens = Input(shape=(100,), dtype='int32')
    x = Embedding(voc, 64, input_length=100)(tokens)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    # Acts as global max pooling: presumably 17 steps remain at this point
    # (100 -> 98 -> 19 -> 17 with Keras' default padding) - TODO confirm
    x = MaxPooling1D(17)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    outputs = Dense(13, activation='sigmoid')(x)

    net = Model(inputs=tokens, outputs=outputs)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    net.summary()
    return net

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values
Y_train = train[LABELS].values
Y_test = test[LABELS].values


# NOTE(review): the vocabulary is built from all reports, test split included.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Report"].values)

# Reports become sequences of word ids, cut or zero-padded at the end to 100 ids
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=100, padding='post')
# +1 because id 0 is the padding value and word ids start at 1
voc_size = len(tokenizer.word_index)+1

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=100, padding='post')

print(voc_size)
model = create_model(voc_size)

model.fit(x=X_train, y=Y_train,batch_size=128,validation_split=0.2, epochs=10,verbose=1)


# Per-label ROC AUC on the held-out split, from the sigmoid outputs
total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

# Average over however many labels were evaluated, not a hard-coded count
print("Macro Average AUC:" + str(total/len(LABELS)))

# Attempt 6 Recurrent Neural Network

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Bidirectional, Flatten, Input, Embedding
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_curve, auc
# The 13 target columns of dataset.csv predicted by this model, in output order
# (the network's last layer has 13 units).
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

def create_model(voc):
    """Build and compile the bidirectional-LSTM text classifier.

    voc: vocabulary size handed to the Embedding layer.
    Sequences of 100 word ids are embedded in 64 dimensions and read by a
    bidirectional LSTM (64 units per direction, outputs concatenated, all
    time steps kept). The flattened sequence feeds a 128-unit dense layer.
    Output: 13 sigmoid units, trained with binary cross-entropy and Adam.
    The model summary is printed before the model is returned.
    """
    tokens = Input(shape=(100,), dtype='int32')
    embedded = Embedding(voc, 64, input_length=100)(tokens)

    recurrent_cell = LSTM(64, return_sequences=True, dropout=0.2,recurrent_dropout=0.2)
    encoded = Bidirectional(recurrent_cell, merge_mode='concat')(embedded)

    hidden = Dense(128, activation='relu')(Flatten()(encoded))
    outputs = Dense(13, activation='sigmoid')(hidden)

    net = Model(inputs=tokens, outputs=outputs)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    net.summary()
    return net

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values
Y_train = train[LABELS].values
Y_test = test[LABELS].values


# NOTE(review): the vocabulary is built from all reports, test split included.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Report"].values)

# Reports become sequences of word ids, cut or zero-padded at the end to 100 ids
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=100, padding='post')
# +1 because id 0 is the padding value and word ids start at 1
voc_size = len(tokenizer.word_index)+1

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=100, padding='post')

model = create_model(voc_size)

model.fit(x=X_train, y=Y_train,batch_size=128,validation_split=0.2, epochs=10,verbose=1)


# Per-label ROC AUC on the held-out split, from the sigmoid outputs
total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

# Average over however many labels were evaluated, not a hard-coded count
print("Macro Average AUC:" + str(total/len(LABELS)))

# Now it's time for you to create your own model
* Try to combine different Layers and different Parameters
* Add other layers
* The one with the best AUC value wins :)

## Your model:

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Bidirectional, Flatten, Input, Embedding,concatenate
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_curve, auc


MAXLEN=100  # Number of words of each report that are used (be careful if the value is too large)
NUMBER_EPOCHS = 10 # Number of training iterations (be careful: if you train too much you will overfit the model)
BATCH_SIZE = 128 # How many training examples are used before the network weights are updated (too many will burn your PC :) )
EMBEDDING_DIMENSION = 64 # The size of the vector that represents each word
LSTM_UNITS=32 # Number of units in each LSTM
OPTIMIZER='adam' # one of 'sgd', 'rmsprop', 'adagrad', 'adadelta', 'adam', 'adamax', 'nadam'
INTERMEDIATE_ACTIVATION='relu' # activation for intermediate layers, e.g. 'tanh' or 'relu'

def create_model(voc):
    """Build and compile the contest model from the hyperparameters above.

    voc: vocabulary size handed to the Embedding layer.
    Sequences of MAXLEN word ids are embedded in EMBEDDING_DIMENSION
    dimensions and read by two parallel bidirectional LSTMs (LSTM_UNITS units
    per direction) whose outputs are concatenated. The flattened result feeds
    a 128-unit dense layer. Output: 13 sigmoid units, trained with binary
    cross-entropy and OPTIMIZER. The model summary is printed before the
    model is returned.
    """
    layer1 = Input(shape=(MAXLEN,), dtype='int32')
    layer2 = Embedding(voc, EMBEDDING_DIMENSION, input_length=MAXLEN)(layer1)

    layer3a = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.2,recurrent_dropout=0.2),
                              merge_mode='concat')(layer2)

    layer3b = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.2,recurrent_dropout=0.2),
                              merge_mode='concat')(layer2)

    layer3 = concatenate([layer3a,layer3b])

    layer4 = Flatten()(layer3)

    # The INTERMEDIATE_ACTIVATION hyperparameter was defined but never used
    # (this layer hard-coded 'relu', which is also the setting's default), so
    # changing it had no effect.
    layer5 = Dense(128, activation=INTERMEDIATE_ACTIVATION)(layer4)
    layer6 = Dense(13, activation='sigmoid')(layer5)

    model = Model(inputs=layer1, outputs=layer6)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
    model.summary()
    return model

# Defined before its first use below, so this cell does not depend on LABELS
# left behind by an earlier cell.
LABELS = ['No findings','Cardiomegaly','Airspace Opacity',
             'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis',
             'Pneumothorax','Pleural Effusion','Pleural Other','Fracture','Support Devices']

df = pd.read_csv("dataset.csv")
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train["Report"].values
X_test = test["Report"].values
Y_train = train[LABELS].values
Y_test = test[LABELS].values


# NOTE(review): the vocabulary is built from all reports, test split included.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Report"].values)

# Reports become sequences of word ids, cut or zero-padded at the end to MAXLEN ids
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAXLEN, padding='post')
# +1 because id 0 is the padding value and word ids start at 1
voc_size = len(tokenizer.word_index)+1

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAXLEN, padding='post')

model = create_model(voc_size)

model.fit(x=X_train, y=Y_train,batch_size=BATCH_SIZE,validation_split=0.2, epochs=NUMBER_EPOCHS,verbose=1)


# Per-label ROC AUC on the held-out split, from the sigmoid outputs
total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('Test auc is {}'.format(roc_auc))

# Average over however many labels were evaluated, not a hard-coded count
print("Macro Average AUC:" + str(total/len(LABELS)))