In [44]:

from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

import pandas as pd

import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
from keras.utils import to_categorical
from keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Input
from keras.models import Model

### Necessary Functions

In [46]:
def get_sentences(dataset):
    n_sent = 1
    # print(dataset[0])
    grouped = dataset.groupby("sentence_idx").apply(lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                                                      s["tag"].values.tolist())])
    sentences = [s for s in grouped]

    def get_next():
        nonlocal n_sent
        try:
            s = grouped["Sentence: {}".format(n_sent)]
            n_sent += 1
            return s
        except:
            return None

    return sentences, get_next


def get_max_len(sentences):
    return max([len(s) for s in sentences])


def add_sentence_id_column(dataset):
    sentence_idx = 1
    sentence_indices = []
    for word in dataset['word']:
        sentence_indices.append(sentence_idx)
        if word == '.':
            sentence_idx += 1
    dataset.insert(0, 'sentence_idx', sentence_indices)
    return dataset

In [47]:
tag2idx={'B-STREET': 0,
 'I-STATE': 1,
 'IDNUM': 2,
 'I-PER': 3,
 'B-MEDICALRECORD': 4,
 'I-HOSPITAL': 5,
 'I-PROFESSION': 6,
 'USERNAME': 7,
 'DATE': 8,
 'FAX': 9,
 'I-MEDICALRECORD': 10,
 'CITY': 11,
 'B-PER': 12,
 'O': 13,
 'B-STATE': 14,
 'ZIP': 15,
 'COUNTRY': 16,
 'B-ORGANIZATION': 17,
 'EMAIL': 18,
 'B-HOSPITAL': 19,
 'I-STREET': 20,
 'B-PROFESSION': 21,
 'I-ORGANIZATION': 22,
 'PHONE': 23,
 'I-AGE': 24,
 'B-AGE': 25}

n_tags = len(tag2idx.values())
def encoding(data):
    # dframe = lstmFun.add_sentence_id_column(self.data)
    sentences, get_next = get_sentences(data)
    maxlen = get_max_len(sentences)
    print('Maximum sequence length:', maxlen)

    words = list(set(data["word"].values))
    words = ["ENDPAD"] + words
    n_words = len(words);
    n_tags = len(tag2idx.values())

    word2idx = {w: i for i, w in enumerate(words)}

    x = [[word2idx[w[0]] for w in s] for s in sentences]
    x = pad_sequences(maxlen=maxlen, sequences=x, padding="post", value=n_words - 1)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]
    return x, y, maxlen, n_words

### encoding train and test data

In [48]:


train_data = pd.read_csv("work/data/health/filtered_i2b2_train_data4.csv", encoding = "ISO-8859-1")
test_data = pd.read_csv("work/data/health/filtered_i2b2_test_data4.csv", encoding = "ISO-8859-1")
x_train, y_train, train_maxlen, train_n_words = encoding(train_data)
x_test, y_test, test_maxlen, test_n_words= encoding(test_data)

Maximum sequence length: 99
Maximum sequence length: 99


### Train the model

In [49]:
input_t = Input(shape=(train_maxlen,))
model = Embedding(input_dim=train_n_words, output_dim=100, input_length=train_maxlen)(input_t)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
# model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
# crf = CRF(n_tags)  # CRF layer with n_tags output units
# out = crf(model)  # Apply CRF layer to the output of LSTM
model = Model(input_t, out)

In [50]:
def custom_loss(y_true, y_pred):
    penalty_weight = tf.constant(10.0)

    # Calculate the loss
    loss = - tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
    print("loss", loss.shape)

    mask = tf.equal(tf.equal(y_true[:, :, 0] , 1), tf.less(y_pred[:, :, 0] , 0.7))
    mask = tf.cast(mask, tf.float32)
    print("mask", mask)

    loss = loss * tf.multiply(penalty_weight, mask)

    return loss


cross_entropy_loss = CategoricalCrossentropy()
model = Model(input_t, out)
model.compile(optimizer="adam", loss=cross_entropy_loss, metrics=["categorical_accuracy"])

In [51]:
 # custom_loss(y_test, preds)

In [52]:
history = model.fit(x_train, np.array(y_train), batch_size=32, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [53]:
# Save Model
dir = "/home/jovyan/work/data/output/LSTM_health_5"
model.save(dir)

2023-06-18 21:12:01.076096: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-18 21:12:01.136107: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-18 21:12:01.154126: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and 

INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/LSTM_health_s4/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/LSTM_health_s4/assets


In [54]:
# Evaluation
loss, accuracy = model.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Loss: 6.3344
Accuracy: 2.36%


### Evaluation per tag

In [55]:
# Read Model
# lstm_1_epochs = 'work/data/output/LSTM_1batch'
# model = tf.keras.models.load_model(lstm_1_epochs)

In [64]:
# Get the predicted labels
y_pred = model.predict(x_test)
y_pred_indices = np.argmax(y_pred, axis=-1)

y_pred_flattened = y_pred_indices.flatten()
# Y_train_flatten = y_train_indices.flatten()



In [65]:
unique_values, counts = np.unique(y_pred_flattened, return_counts=True)

for value, count in zip(unique_values, counts):
    print("Value:", value, "Count:", count)

Value: 0 Count: 33601
Value: 1 Count: 249361
Value: 2 Count: 69
Value: 5 Count: 17480
Value: 7 Count: 51
Value: 8 Count: 47413
Value: 9 Count: 1
Value: 10 Count: 1101
Value: 12 Count: 1121
Value: 13 Count: 6233
Value: 14 Count: 987
Value: 15 Count: 11539
Value: 16 Count: 857
Value: 17 Count: 275
Value: 18 Count: 58322
Value: 19 Count: 20683
Value: 21 Count: 1013
Value: 22 Count: 3934
Value: 23 Count: 264
Value: 24 Count: 699


In [66]:
y_true_indices = np.argmax(y_test, axis=-1)
y_true_flattened = y_true_indices.flatten()

In [67]:
print(y_true_flattened)
unique_values, counts = np.unique(y_true_flattened, return_counts=True)

for value, count in zip(unique_values, counts):
    print("Value:", value, "Count:", count)

[15 15 15 ... 15 15 15]
Value: 0 Count: 324
Value: 1 Count: 159
Value: 2 Count: 2314
Value: 3 Count: 532
Value: 4 Count: 19
Value: 5 Count: 83
Value: 6 Count: 386
Value: 7 Count: 288
Value: 8 Count: 2
Value: 9 Count: 6
Value: 10 Count: 74
Value: 11 Count: 192
Value: 12 Count: 47
Value: 13 Count: 727
Value: 14 Count: 19
Value: 15 Count: 436465
Value: 16 Count: 133
Value: 17 Count: 137
Value: 18 Count: 625
Value: 19 Count: 747
Value: 20 Count: 340
Value: 21 Count: 78
Value: 22 Count: 7600
Value: 23 Count: 2690
Value: 24 Count: 866
Value: 25 Count: 151


In [68]:
precision_per_tag = {}
recall_per_tag = {}
f1_per_tag = {}
for tag, id in tag2idx.items():
    true_positives = np.sum((y_pred_flattened == id) & (y_true_flattened == id))
    false_positives = np.sum((y_pred_flattened == id) & (y_true_flattened != id))
    false_negatives = np.sum((y_pred_flattened != id) & (y_true_flattened == id))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    precision_per_tag[tag] = round(precision, 2) * 100
    recall_per_tag[tag] = round(recall, 2) * 100
    f1_per_tag[tag] = round(f1_score, 2) * 100

for tag, id in tag2idx.items():
    print("Tag:", tag)
    print("Precision: ", precision_per_tag[tag])
    print("F1 Score:", f1_per_tag[tag])
    print("recall:", recall_per_tag[tag])
    print()



Tag: IDNUM
Precision:  0.0
F1 Score: 1.0
recall: 31.0

Tag: ZIP
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: I-PER
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: PHONE
Precision:  0
F1 Score: 0
recall: 0.0

Tag: FAX
Precision:  0
F1 Score: 0
recall: 0.0

Tag: USERNAME
Precision:  0.0
F1 Score: 0.0
recall: 5.0

Tag: B-MEDICALRECORD
Precision:  0
F1 Score: 0
recall: 0.0

Tag: I-STREET
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: EMAIL
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: I-STATE
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: I-ORGANIZATION
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: B-STATE
Precision:  0
F1 Score: 0
recall: 0.0

Tag: COUNTRY
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: B-AGE
Precision:  0.0
F1 Score: 1.0
recall: 4.0

Tag: I-AGE
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: O
Precision:  91.0
F1 Score: 5.0
recall: 2.0

Tag: I-PROFESSION
Precision:  0.0
F1 Score: 0.0
recall: 2.0

Tag: B-PROFESSION
Precision:  0.0
F1 Score: 0
recall: 0.0

Tag: I-MEDICALRECORD
Pr

In [41]:
# Calculate overall precision, recall, and F1 score
true_positives = np.sum((y_pred_flattened == y_true_flattened) & (y_true_flattened != 0))
false_positives = np.sum((y_pred_flattened != y_true_flattened) & (y_true_flattened != 0))
false_negatives = np.sum((y_pred_flattened != y_true_flattened) & (y_true_flattened == 0))

precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1_score))

Precision: 0.01
Recall: 0.98
F1 Score: 0.03


#### Read models and  Compair two models with different epoch

In [42]:
import tensorflow as tf
lstm_1_epochs = 'work/data/output/LSTM_1batch'
model1e = tf.keras.models.load_model(lstm_1_epochs)


OSError: No file or directory found at work/data/output/LSTM_1batch

In [20]:
loss, accuracy = model1e.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Loss: 13.4553
Accuracy: 0.38%


In [29]:
lstm_40_epochs = 'work/data/output/LSTM_'
model40e = tf.keras.models.load_model(lstm_40_epochs)
loss, accuracy = model1e.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))


Loss: 0.0254
Accuracy: 99.27%
