In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')

In [2]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
def text_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
    
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

def remove_bracket(text):
    return re.sub('\[[^]]*\]', '', text)
    
def remove_special_characters(text):
    text = text.replace('-', ' ')  
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r"\n+", " ", text)
    return text

def remove_placeholders(text):
    return re.sub(r"\[\*\*.*?\*\*\]", "", text) 


def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    word_tokens = [word for word in word_tokens if word not in stop_words]
    text = ' '.join(word_tokens) # rejoin tokens into a single string
    return text

In [4]:
def clean_text(text):
    text = text_lowercase(text)
    text = remove_placeholders(text)
    text = remove_bracket(text)
    text = remove_numbers(text)
    text = remove_special_characters(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text) 
    return text       

In [5]:
data['TEXT'] = data['TEXT'].apply(clean_text)

In [6]:
def multi_label_accuracy(y_true, y_pred):
    """
    Calculate multi-label accuracy.
    
    Args:
        y_true (pd.DataFrame or np.ndarray): True binary labels.
        y_pred (pd.DataFrame or np.ndarray): Predicted binary labels.
        
    Returns:
        float: Multi-label accuracy.
    """
    # Convert DataFrame to NumPy array if needed
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    
    n_samples = y_true.shape[0]
    accuracy_sum = 0.0
    
    for i in range(n_samples):
        true_set = set(np.where(y_true[i])[0])
        pred_set = set(np.where(y_pred[i])[0])
        
        intersection = len(true_set & pred_set)
        union = len(true_set | pred_set)
        
        if union == 0:
            accuracy_sum += 1.0
        else:
            accuracy_sum += intersection / union
    
    return accuracy_sum / n_samples

In [7]:
X = data['TEXT']
y = data.iloc[:, 3:]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print("Number of Observations in the Data set:" , X.values.shape[0]) 
print("Number of Observations in Train set:", X_train.shape[0])
print("Number of Observations in Test set:" , X_test.shape[0])   

Number of Observations in the Data set: 48335
Number of Observations in Train set: 33834
Number of Observations in Test set: 14501


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
X_train_dense = X_train_vectorized.toarray()
X_test_dense = X_test_vectorized.toarray()

## ANN Chain Model

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from time import time

In [13]:
def create_c1node(X_feed,y_now):
    '''
    C1 node Architecture:
    attribute:512:256:1 [saperate for each class]
    loss: Binary crossentropy
    '''
    model = Sequential()
    model.add(Dense(512,activation='relu',input_shape=(X_feed.shape[1],),kernel_initializer='glorot_uniform',name='first'))
    model.add(Dropout(0.4))
    model.add(Dense(256,activation='relu',kernel_initializer='glorot_uniform',name='nretrain1'))
    model.add(Dropout(0.4))
    model.add(Dense(128,activation='relu',kernel_initializer='glorot_uniform',name='nretrain2'))
    model.add(Dropout(0.4))
    model.add(Dense(64,activation='relu',kernel_initializer='glorot_uniform',name='last'))
    model.add(Dropout(0.4))    
    model.add(Dense(y_now.shape[1],activation='sigmoid',kernel_initializer='glorot_uniform',name='output'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [14]:
labels = [18, 2, 10, 15, 8, 5, 4, 19, 14, 7, 16, 11, 13, 9, 17, 6, 1, 12, 0, 3]

In [15]:
# Model chain
chain = []

# Training
X_feed = X_train_dense.copy()
t1 = time()

# Loop through the label indices based on the custom order
for i in labels:
    print("Training chain node ", i)
    y_now = y_train.iloc[:, [i,]].copy()
    print("Shapes:\n X = {} \n Y = {}".format(X_feed.shape, y_now.shape))

    # Create and train the node for each label
    node = create_c1node(X_feed, y_now)
    node.fit(X_feed, y_now, epochs=8, batch_size=64)
    print("Training of node {} complete\n\n".format(i))  
    
    # Add node to chain 
    chain.append(node)
    
    # Now append y_now to X_feed
    X_feed = np.append(X_feed, y_now, axis=1)

t2 = time()
print("Time taken: ", (t2 - t1))

Training chain node  18
Shapes:
 X = (33834, 10000) 
 Y = (33834, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 75ms/step - acc: 0.7766 - loss: 0.4639
Epoch 2/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.8699 - loss: 0.3114
Epoch 3/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.8997 - loss: 0.2513
Epoch 4/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 72ms/step - acc: 0.9287 - loss: 0.1808
Epoch 5/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.9587 - loss: 0.1121
Epoch 6/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.9768 - loss: 0.0667
Epoch 7/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 76ms/step - acc: 0.9816 - loss: 0.0528
Epoch 8/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.9860 - loss: 0.0404
Training of node 18 complete


Training chain node  2
Shapes:
 X = (3383

In [16]:
y_pred = []

X_feed = X_test_dense.copy()

for i, node in enumerate(chain):
    print("Getting output from chain node ", i)
    node_name = "Node" + str(i)
    print("Shapes:\n X = {}".format(X_feed.shape))
    output = node.predict(X_feed)
    output = output.round().astype(int)
    y_pred.append(output)
    X_feed = np.append(X_feed, output, axis=1)
    #i+=1

Getting output from chain node  0
Shapes:
 X = (14501, 10000)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step
Getting output from chain node  1
Shapes:
 X = (14501, 10001)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Getting output from chain node  2
Shapes:
 X = (14501, 10002)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Getting output from chain node  3
Shapes:
 X = (14501, 10003)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Getting output from chain node  4
Shapes:
 X = (14501, 10004)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Getting output from chain node  5
Shapes:
 X = (14501, 10005)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step
Getting output from chain node  6
Shapes:
 X = (14501, 10006)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step
Getting output from chain node  7
Shape

In [17]:
y_test.iloc[:, labels]

Unnamed: 0,ICD9_518,ICD9_E87,ICD9_285,ICD9_V30,ICD9_585,ICD9_584,ICD9_401,ICD9_276,ICD9_038,ICD9_599,ICD9_530,ICD9_V10,ICD9_250,ICD9_V45,ICD9_272,ICD9_414,ICD9_427,ICD9_428,ICD9_V58,ICD9_403
0,1,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0
2,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14496,1,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,0,0,0
14497,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
14498,0,0,1,0,1,1,1,0,0,1,1,0,0,0,0,0,1,0,0,0
14499,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
predictions = np.array([0])
for i in y_pred:
    x = np.array(i)
    if predictions.shape == (1,):
        predictions = x
    else:
        predictions = np.append(predictions, x, axis=1)

In [19]:
y_test_reordered = y_test.iloc[:, labels]

In [21]:
from sklearn.metrics import hamming_loss, log_loss, f1_score, accuracy_score, jaccard_score, classification_report

print("Accuracy = ", accuracy_score(y_test_reordered, predictions))
print("Jaccard Score (micro) = ", jaccard_score(y_test_reordered, predictions, average='micro'))
print("Jaccard Score (macro) = ", jaccard_score(y_test_reordered, predictions, average='macro'))

print("Hamming loss = ", hamming_loss(y_test_reordered, predictions))
#print("Log loss = ", log_loss(y_test, predictions))
print("F1 score (micro) = ", f1_score(y_test_reordered, predictions, average='micro'))
print("F1 score (macro) = ", f1_score(y_test_reordered, predictions, average='macro'))

ann_cc_ml_accuracy = multi_label_accuracy(y_test_reordered, predictions)
print(f'Multi Label Accuracy: {ann_cc_ml_accuracy:.2f}')  

print("\nClassification Report: ", classification_report(y_test_reordered, predictions))

Accuracy =  0.12337080201365423
Jaccard Score (micro) =  0.4990411250799062
Jaccard Score (macro) =  0.49703495786940194
Hamming loss =  0.13780773739742086
F1 score (micro) =  0.6658137882018479
F1 score (macro) =  0.6483944664707639
Multi Label Accuracy: 0.52

Classification Report:                precision    recall  f1-score   support

           0       0.71      0.61      0.66      3856
           1       0.46      0.27      0.34      1921
           2       0.51      0.55      0.53      3766
           3       0.96      1.00      0.98       714
           4       0.71      0.62      0.66      2018
           5       0.79      0.63      0.70      3406
           6       0.71      0.73      0.72      6170
           7       0.57      0.60      0.59      4420
           8       0.70      0.63      0.66      1853
           9       0.60      0.57      0.59      2203
          10       0.68      0.50      0.57      2157
          11       0.60      0.40      0.48      1879
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
