In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import tensorflow_hub as hub

In [None]:
# Load the cleaned base corpus from CSV and display it.
dataset = pd.read_csv("cleaned_dataset.csv")
dataset

In [None]:
# Read the augmented corpus and discard its stray 'Unnamed: 0.1' column
# in a single chained expression, then display the result.
augmented_dataset = (
    pd.read_csv("cleaned_augmented_dataset.csv")
    .drop(columns=['Unnamed: 0.1'])
)
augmented_dataset

In [None]:
# Stack the original and augmented corpora into one frame.
# Both inputs come straight from read_csv and therefore each carry their own
# 0..n-1 index; ignore_index=True renumbers the combined rows so that index
# labels are unique instead of repeating across the two sources.
frames = [dataset, augmented_dataset]
dataset = pd.concat(frames, ignore_index=True)
dataset

In [None]:
# Find the text with the most whitespace-separated tokens and show that count.
# The ranking is done on the token lists themselves: the text with the most
# characters is not necessarily the text with the most words, so picking the
# longest string first and splitting it afterwards can under-report the maximum.
# NOTE: `max_length` holds a list of tokens here; the tokenizer cell further
# down rebinds the same name to the integer padding length.
text_tokens = dataset['text'].astype(str).str.split()
max_length = max(text_tokens, key=len)
len(max_length)

In [None]:
# Everything except the CSV index, id and text columns is kept as a label column.
non_label_columns = ["Unnamed: 0", "id", "text"]
dataset_label = dataset.drop(columns=non_label_columns)
dataset_label

In [None]:
# Build, for every row, the list of label column names whose value equals 1
# (names are kept in column order).
# DataFrame.iteritems() was removed in pandas 2.0, and the nested Python loop
# visited every cell individually; a single vectorised comparison produces one
# boolean mask per row, which then selects the matching column names.
label_columns = dataset_label.columns
is_active = dataset_label.eq(1).to_numpy()
label_list = [list(label_columns[row_mask]) for row_mask in is_active]

In [None]:
# Preview only the first few entries; echoing the whole list would print one
# line per row of the dataset.
label_list[:5]

In [None]:
# Drop the listed per-label columns (plus the leftover CSV index column) and
# attach a single 'Label' column holding each row's list of label names.
columns_to_drop = [
    "Pasien", "Usia pasien", "Penyakit", "Gejala", "Tindakan", "Outcome",
    "Pertanyaan", "Pembuka", "Penyebab", "Prakondisi", "Objek", "Penutup",
    "Waktu", "Unnamed: 0",
]
dataset = dataset.drop(columns=columns_to_drop).assign(Label=label_list)
dataset

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    dataset['text'],
    dataset['Label'],
    test_size=0.2,
    random_state=42,
)
print("Number of text for training: ", len(X_train))
print("Number of text for validation: ", len(X_val))

In [None]:
# Inspect the training texts produced by the split.
X_train

In [None]:
# Inspect the training label lists produced by the split.
y_train

In [None]:
# Materialise both label collections as plain Python lists, then peek at the
# first three training entries.
y_train, y_val = list(y_train), list(y_val)
y_train[:3]

In [None]:
# Learn the label vocabulary from the training targets only
print("Labels:")
mlb = MultiLabelBinarizer().fit(y_train)

# Record how many distinct labels were found and list each with its position
N_LABELS = len(mlb.classes_)
for position, class_name in enumerate(mlb.classes_):
    print("{}. {}".format(position, class_name))

In [None]:
# Show the number of distinct labels found by the binarizer.
N_LABELS

In [None]:
# transform the targets of the training and test sets
y_train_bin = mlb.transform(y_train)
y_val_bin = mlb.transform(y_val)

In [None]:
# Inspect the validation texts produced by the split.
X_val

In [None]:
# Parameters
vocab_size = 10000      # size of the vocabulary kept by the tokenizer
max_length = 150        # every sequence is padded / truncated to this many tokens
trunc_type = 'post'     # cut over-long sequences at the end
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary from the training sentences only
tokenizer.fit_on_texts(X_train.astype('str'))
word_index = tokenizer.word_index

# Generate and pad the training sequences
sequences = tokenizer.texts_to_sequences(X_train.astype('str'))
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Generate and pad the validation sequences with the SAME truncation rule as
# the training sequences. Without `truncating=trunc_type` pad_sequences falls
# back to its default ('pre'), so over-long validation texts would lose their
# beginning while over-long training texts lose their end.
testing_sequences = tokenizer.texts_to_sequences(X_val.astype('str'))
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [None]:
# Layer sizes
embedding_dim = 64
lstm1_dim = 64
lstm2_dim = 32
dense_dim = 64

# Assemble the network one layer at a time: token embedding, two stacked
# bidirectional LSTMs, then a dense head ending in one sigmoid unit per label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm1_dim, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm2_dim)))
model.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(N_LABELS, activation='sigmoid'))

# Show the layer-by-layer summary
model.summary()

In [None]:
# Set the training parameters.
# The output layer has one independent sigmoid per label (multi-label task),
# so each output is scored as its own binary decision with binary
# cross-entropy. Categorical cross-entropy assumes the outputs form a single
# probability distribution over mutually exclusive classes, which does not
# hold here.
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy'])

In [None]:
NUM_EPOCHS = 10
BATCH_SIZE = 32

# Fit on the padded training sequences and evaluate on the padded validation
# sequences after every epoch.
history_lstm = model.fit(
    x=padded,
    y=y_train_bin,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(testing_padded, y_val_bin),
)

In [None]:
@tf.function
def macro_soft_f1(y, y_hat):
    """Differentiable macro F1 cost: mean over labels of (1 - soft F1).

    Probabilities are used directly in place of thresholded predictions.

    Args:
        y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    targets = tf.cast(y, tf.float32)
    probs = tf.cast(y_hat, tf.float32)
    # Soft confusion counts per label, accumulated over the batch axis.
    soft_tp = tf.reduce_sum(probs * targets, axis=0)
    soft_fp = tf.reduce_sum(probs * (1 - targets), axis=0)
    soft_fn = tf.reduce_sum((1 - probs) * targets, axis=0)
    # The small constant keeps the division defined when a label has no mass at all.
    per_label_f1 = 2*soft_tp / (2*soft_tp + soft_fn + soft_fp + 1e-16)
    # Minimising (1 - soft F1) maximises soft F1; average over all labels.
    return tf.reduce_mean(1 - per_label_f1)

In [None]:
@tf.function
def macro_f1(y, y_hat, thresh=0.5):
    """Compute the macro F1-score on a batch of observations (average F1 across labels)

    Args:
        y (int32 Tensor): labels array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)
        thresh: probability value above which we predict positive

    Returns:
        macro_f1 (scalar Tensor): value of macro F1 for the batch
    """
    # Cast both inputs to float32, exactly as macro_soft_f1 does. Without the
    # cast, integer targets (the documented input type) cannot be multiplied
    # with the float32 predictions below.
    y = tf.cast(y, tf.float32)
    y_hat = tf.cast(y_hat, tf.float32)
    # Hard 0/1 predictions: positive where the probability exceeds the threshold.
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)
    # Per-label confusion counts over the batch axis.
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)
    # The small constant keeps the division defined for labels with no counts.
    f1 = 2*tp / (2*tp + fn + fp + 1e-16)
    # Average over labels (the result is no longer stored under the function's own name).
    return tf.reduce_mean(f1)

In [None]:
LR = 1e-4
EPOCHS = 10

# Re-compile the same `model` object so that training now minimises the macro
# soft-F1 cost and reports thresholded macro F1 alongside accuracy.
model.compile(
    loss=macro_soft_f1,
    metrics=[macro_f1, 'accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
)

In [None]:
# Train with the current compile settings and keep the per-epoch history
# for plotting.
history = model.fit(
    x=padded,
    y=y_train_bin,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(testing_padded, y_val_bin),
)

In [None]:
# Plot Utility
def plot_graphs(graph, string):
    """Plot a training metric and, when present, its validation counterpart.

    Args:
        graph: object exposing a `history` dict that maps metric names to
            per-epoch values (e.g. the value returned by `model.fit`).
        string: name of the metric to plot, such as 'accuracy' or 'loss'.
    """
    legend_entries = [string]
    plt.plot(graph.history[string])
    # The 'val_' series is only looked up if it exists, so a history recorded
    # without validation data is plotted instead of raising KeyError.
    val_key = 'val_' + string
    if val_key in graph.history:
        plt.plot(graph.history[val_key])
        legend_entries.append(val_key)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    # Legend lists exactly the curves that were drawn.
    plt.legend(legend_entries)
    plt.show()

# Draw one figure per tracked quantity: accuracy first, then loss
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)

# Transfer Learning

In [None]:
BATCH_SIZE = 128 # Big enough to measure an F1-score
AUTOTUNE = tf.data.experimental.AUTOTUNE # Adapt preprocessing and prefetching dynamically
SHUFFLE_BUFFER_SIZE = 512 # Shuffle the training data using a buffer of 512 observations

def create_dataset(filenames, labels, is_training=True):
    """Build a batched, prefetched tf.data pipeline of (input, label) pairs.

    Args:
        filenames: sequence of inputs; every element is converted to a string
            (the calls in this notebook pass the raw text column)
        labels: array sliced along its first axis in step with `filenames`
        is_training: when True, cache the examples and shuffle them with a
            buffer of SHUFFLE_BUFFER_SIZE before batching

    Returns:
        tf.data.Dataset yielding batches of up to BATCH_SIZE examples
    """
    texts = np.asarray(filenames).astype('str')

    # One dataset element per (text, label-row) pair
    pipeline = tf.data.Dataset.from_tensor_slices((texts, labels))

    if is_training:
        # Keep the examples in memory after the first pass, then shuffle them
        pipeline = pipeline.cache().shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)

    # Group into batches and prepare upcoming batches in the background
    return pipeline.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Training pipeline: cached and shuffled (is_training defaults to True).
train_ds = create_dataset(X_train, y_train_bin)
# Validation pipeline: built with is_training=False so the held-out examples
# are not shuffled and are batched in the same order on every evaluation.
val_ds = create_dataset(X_val, y_val_bin, is_training=False)

In [None]:
# TF Hub handle of the text-embedding module that is wrapped in a KerasLayer below.
feature_extractor_url = "https://tfhub.dev/google/nnlm-id-dim128/2"

In [None]:
# Wrap the hub module as a Keras layer that takes a batch of scalar strings
# (input_shape=[], dtype=tf.string).
# The module handle is the 128-dimensional NNLM ("dim128"), so the declared
# output shape must be [128]; the previous value of 64 contradicted the
# module's embedding size.
feature_extractor_layer = hub.KerasLayer(feature_extractor_url,
                                         input_shape=[],
                                         dtype=tf.string, output_shape=[128])

In [None]:
# Freeze the hub layer so its weights are not updated during training.
feature_extractor_layer.trainable = False

In [None]:
# Layer sizes
# lstm1_dim / lstm2_dim are not used by the layers below (an earlier
# Reshape + bidirectional-LSTM variant of this model was left disabled here).
lstm1_dim = 64
lstm2_dim = 32
# NOTE(review): 5112 is an unusual width for a dense layer - possibly a typo for 512; confirm.
dense_dim = 5112

# Frozen sentence embedding followed by a dense head with one sigmoid unit per label.
model_trf = tf.keras.Sequential()
model_trf.add(feature_extractor_layer)
model_trf.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model_trf.add(tf.keras.layers.Dropout(0.2))
model_trf.add(tf.keras.layers.Dense(N_LABELS, activation='sigmoid'))

model_trf.summary()

In [None]:
LR = 1e-5  # Keep it small when transfer learning
EPOCHS = 200

# Configure training: Adam at the learning rate above, macro soft-F1 as the
# cost, and thresholded macro F1 plus accuracy as reported metrics.
model_trf.compile(
    loss=macro_soft_f1,
    metrics=[macro_f1, 'accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
)

In [None]:
# Train the transfer-learning model on the training pipeline.
# The validation pipeline is built with is_training=False so the held-out
# data is not shuffled and is batched identically for every epoch's evaluation.
transfer_learning_lstm = model_trf.fit(train_ds,
                                   epochs=EPOCHS,
                                   validation_data=create_dataset(X_val, y_val_bin, is_training=False))

In [None]:
# Plot the accuracy and loss history of the transfer-learning run.
# plot_graphs is already defined in the earlier "Plot Utility" cell, so it is
# reused here instead of being defined a second time with an identical body.
plot_graphs(transfer_learning_lstm, 'accuracy')
plot_graphs(transfer_learning_lstm, 'loss')