<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/NER-Medical-Texts/ner_mediacal_functional_pipline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install -q datasets==3.6.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m32.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset

# Loading Dataset

In [3]:
# Load the "tner/bc5cdr" dataset via `datasets.load_dataset`. The result is
# kept in the module-level name `dataset`, which main() reads as a global
# when it calls split_data(dataset, ...).
dataset = load_dataset("tner/bc5cdr")

README.md: 0.00B [00:00, ?B/s]

bc5cdr.py: 0.00B [00:00, ?B/s]

0000.parquet:   0%|          | 0.00/367k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/364k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/386k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5228 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5330 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5865 [00:00<?, ? examples/s]

# Pipeline

In [13]:
# Split Dataset
def split_data(dataset, split):
    """Return the token sequences and tag sequences of one dataset split.

    Args:
        dataset (Dataset): Mapping-like dataset indexed by split name.
        split (str): Split to read, e.g. 'train', 'test' or 'validation'.

    Returns:
        tuple: ``(sentences, tags)`` read from the split's 'tokens' and
            'tags' columns, in that order.
    """
    subset = dataset[split]
    return subset['tokens'], subset['tags']


# Create Vocab
def create_vocab(train_sentences):
    """Collect the distinct words of the training sentences, sorted.

    Args:
        train_sentences (list): Tokenized sentences (each a list of words).

    Returns:
        list: Every unique word exactly once, in ascending sort order.
    """
    unique_words = set()
    for sentence in train_sentences:
        # Duplicates across and within sentences collapse in the set.
        unique_words.update(sentence)
    return sorted(unique_words)


# Create the word-to-index mapping (word2idx)
def create_word_idx(vocab):
    """Build the word -> integer id lookup used to encode sentences.

    Vocabulary words are numbered from 2 upwards in the order given; ids 0
    and 1 are reserved for the special tokens.

    Args:
        vocab (list): Unique words making up the vocabulary.

    Returns:
        dict: Word -> id mapping, including "[PAD]" (0) and "[UNK]" (1).
    """
    word2idx = {word: position for position, word in enumerate(vocab, start=2)}
    # Special tokens are written last so they always hold the reserved ids.
    word2idx.update({"[UNK]": 1, "[PAD]": 0})
    return word2idx


def create_tag_idx(tags):
    """Creates mappings from tags to integer indices and vice-versa.

    The pipeline feeds the dataset's integer tag ids straight into the model
    (they are never re-encoded through ``tag2idx``), so the position of each
    name in the list below must equal the label id used by the dataset.
    The tner/bc5cdr dataset card lists its label ids as:
    O=0, B-Chemical=1, B-Disease=2, I-Disease=3, I-Chemical=4.

    Args:
        tags (list): A list of tag sequences (not used; kept so the signature
            stays consistent with the other ``create_*`` helpers).

    Returns:
        tuple: A tuple containing tag2idx (dict: name -> id) and
            idx2tag (dict: id -> name).
    """
    # Order follows the dataset's label ids; the previous order
    # (O, B-Chm, I-Chm, B-Dis, I-Dis) mislabelled ids 2, 3 and 4 on display.
    tag_list = ["O", "B-Chm", "B-Dis", "I-Dis", "I-Chm"]
    tag2idx = {tag: i for i, tag in enumerate(tag_list)}
    idx2tag = dict(enumerate(tag_list))
    return tag2idx, idx2tag


# Preprocessing & Padding
def preprocess_data(sentences, tags, word2idx, tag2idx, max_len):
    """Encode sentences as word ids and bring inputs and tags to a fixed length.

    Args:
        sentences (list): Tokenized sentences.
        tags (list): Tag sequences, one per sentence.
        word2idx (dict): Word -> id mapping; must contain "[UNK]".
        tag2idx (dict): Tag -> id mapping; must contain "O".
        max_len (int): Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(X_padded, y_pad)`` - the padded word-id sequences and the
            padded tag sequences.
    """
    # Words missing from the vocabulary fall back to the "[UNK]" id.
    encoded = []
    for sentence in sentences:
        encoded.append(
            [word2idx.get(token, word2idx["[UNK]"]) for token in sentence]
        )

    # Both arrays are padded and truncated at the end of the sequence.
    shared_options = {
        "maxlen": max_len,
        "padding": 'post',
        "truncating": 'post',
    }
    X_padded = pad_sequences(encoded, **shared_options)
    # Tag padding uses the id of the 'O' tag as fill value.
    y_pad = pad_sequences(tags, value=tag2idx["O"], **shared_options)
    return X_padded, y_pad


# Define model
def build_model(max_len, vocab_size, embedding_dim, units, n_output):
    """Assemble and compile the BiLSTM sequence tagger.

    Args:
        max_len (int): Length of the input sequences.
        vocab_size (int): Number of rows of the embedding table.
        embedding_dim (int): Size of each word embedding vector.
        units (int): Number of units of the LSTM wrapped by the
            bidirectional layer.
        n_output (int): Number of tag classes predicted per timestep.

    Returns:
        tf.keras.Model: The compiled model (its summary is printed first).
    """
    # Layers are created in the same order in which they are stacked.
    token_input = layers.Input(shape=(max_len,))
    # mask_zero=True: index 0 is treated as padding by the embedding.
    embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=True,
    )
    # return_sequences=True keeps one output vector per timestep.
    encoder = layers.Bidirectional(
        layers.LSTM(units=units, return_sequences=True)
    )
    # Softmax over the tag classes, applied at every timestep.
    classifier = layers.TimeDistributed(
        layers.Dense(units=n_output, activation="softmax")
    )
    model = models.Sequential([token_input, embedding, encoder, classifier])

    accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=[accuracy_metric],
    )

    # Show the architecture before handing the model back.
    model.summary()
    return model


def train(model, x, y, val, epochs, batch_size, callbacks):
    """Fit ``model`` on the training data and return what ``fit`` returns.

    Args:
        model (tf.keras.Model): Model to fit.
        x (np.array): Training inputs.
        y (np.array): Training targets.
        val (tuple): ``(validation_inputs, validation_targets)``.
        epochs (int): Number of epochs to run.
        batch_size (int): Samples per batch.
        callbacks (list): Keras callbacks passed through to ``fit``.

    Returns:
        tf.keras.callbacks.History: The object returned by ``model.fit``.
    """
    fit_options = {
        "validation_data": val,
        "epochs": epochs,
        "batch_size": batch_size,
        "callbacks": callbacks,
    }
    return model.fit(x, y, **fit_options)


def predict_medical_ner(sentence, max_len=50):
    """Predicts medical named entities for a given sentence and prints them.

    Relies on the module-level globals ``word2idx``, ``idx2tag`` and
    ``model`` that ``main()`` publishes after training.

    Args:
        sentence (str): The input sentence; it is tokenized on whitespace.
        max_len (int): Sequence length fed to the model. Defaults to 50, the
            length main() builds the model with.
            NOTE(review): presumably must equal that length, since the
            model's Input layer is built with a fixed shape - confirm
            before passing another value.

    Returns:
        None. One "token | tag" line is printed per token that fits into
        ``max_len``.
    """
    tokens = sentence.split()
    # Convert tokens to ids; out-of-vocabulary words map to "[UNK]".
    seq = [word2idx.get(w, word2idx["[UNK]"]) for w in tokens]
    # truncating="post" keeps the FIRST max_len tokens, matching both the
    # training preprocessing and the token/prediction pairing below. The
    # pad_sequences default ('pre') would drop the leading tokens and shift
    # every prediction relative to the printed token.
    padded = pad_sequences(
        [seq], maxlen=max_len, padding="post", truncating="post")

    # Prediction
    preds = model.predict(padded, verbose=0)
    # Index of the most probable tag at each position of the single sample.
    pred_ids = np.argmax(preds, axis=-1)[0]

    print(f"\n🔍 Result for: {sentence}")
    print(f"{'Token':<15} | {'Predicted Tag'}")
    print("-" * 30)
    # zip stops at the shorter sequence, so padded positions are never printed
    # and tokens beyond max_len are skipped.
    for token, tag_id in zip(tokens[:max_len], pred_ids):
        print(f"{token:<15} | {idx2tag[int(tag_id)]}")


In [14]:
def main():
    """Run the whole pipeline: data prep, training, evaluation, demo predictions.

    Reads the module-level ``dataset`` and publishes ``word2idx``,
    ``idx2tag`` and ``model`` as globals because ``predict_medical_ner``
    looks them up at module level.
    """
    # Declare global variables to make them accessible from predict_medical_ner
    global word2idx, idx2tag, model

    # Single source of truth for values that were repeated as literals.
    # max_len stays 50 because predict_medical_ner() defaults to that length.
    max_len = 50
    checkpoint_path = "model.weights.h5"

    #--------------------------#
    # Get data
    train_sent, train_tags = split_data(dataset, "train")
    test_sent, test_tags = split_data(dataset, "test")
    val_sent, val_tags = split_data(dataset, "validation")
    print(f"Train Sentences: {len(train_sent)}\t Tags: {len(train_tags)}")
    print(f"Test Sentences: {len(test_sent)}\t Tags: {len(test_tags)}")
    print(f"Val Sentences: {len(val_sent)}\t Tags: {len(val_tags)}")

    #--------------------------#
    # Create Vocab (from the training split only)
    vocab = create_vocab(train_sent)
    print(f"Len Vocab: {len(vocab)}")

    #--------------------------#
    # Create Word & Tag Mapping
    tag2idx, idx2tag = create_tag_idx(train_tags)
    word2idx = create_word_idx(vocab)

    # Preprocess Train/Test/Val
    X_train, y_train = preprocess_data(
        train_sent, train_tags, word2idx, tag2idx, max_len)
    X_test, y_test = preprocess_data(
        test_sent, test_tags, word2idx, tag2idx, max_len)
    X_val, y_val = preprocess_data(
        val_sent, val_tags, word2idx, tag2idx, max_len)

    #--------------------------#
    # Create Model
    model = build_model(max_len, len(word2idx), 100, 512, len(tag2idx))

    #--------------------------#
    # Training
    # The list gets its own name so it no longer shadows the Keras
    # `callbacks` module imported at the top of the file (the shadowing is
    # what previously forced a function-local re-import).
    training_callbacks = [
        callbacks.EarlyStopping(monitor="val_loss", patience=5),
        callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            save_best_only=True,
            save_weights_only=True,
        ),
        callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=0),
    ]
    # Fitting
    train(model, X_train, y_train, (X_val, y_val), 10, 32, training_callbacks)

    # The checkpoint holds the weights of the best validation epoch, whereas
    # the in-memory model holds the last epoch's weights. Restore the best
    # ones so evaluation and the demo predictions do not use a later,
    # worse epoch.
    model.load_weights(checkpoint_path)

    #--------------------------#
    # Evaluation on the held-out test split (previously prepared but unused).
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {test_loss:.4f}\t Test Accuracy: {test_accuracy:.4f}")

    #--------------------------#
    # Prediction
    predict_medical_ner("Aspirin can cause stomach bleeding in some patients")
    predict_medical_ner("The patient developed a severe headache after taking ibuprofen.")
    predict_medical_ner("Chemotherapy can cause nausea and hair loss.")
    predict_medical_ner("Diabetes is a chronic condition characterized by high blood sugar.")

# Execution: run the whole pipeline defined in main() (data preparation,
# model training and the sample predictions).
main()

Train Sentences: 5228	 Tags: 5228
Test Sentences: 5865	 Tags: 5865
Val Sentences: 5330	 Tags: 5330
Len Vocab: 9926


Epoch 1/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.9457 - loss: 0.6064 - val_accuracy: 0.9675 - val_loss: 0.2465 - learning_rate: 0.0010
Epoch 2/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9802 - loss: 0.1443 - val_accuracy: 0.9724 - val_loss: 0.2169 - learning_rate: 0.0010
Epoch 3/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 44ms/step - accuracy: 0.9902 - loss: 0.0763 - val_accuracy: 0.9791 - val_loss: 0.1960 - learning_rate: 0.0010
Epoch 4/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - accuracy: 0.9937 - loss: 0.0470 - val_accuracy: 0.9776 - val_loss: 0.2312 - learning_rate: 0.0010
Epoch 5/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.9965 - loss: 0.0275 - val_accuracy: 0.9799 - val_loss: 0.2242 - learning_rate: 1.0000e-04
Epoch 6/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3