In [None]:
!pip install matplotlib
!pip install imbalanced-learn
!pip install --upgrade ipywidgets
!pip install optuna
!pip install transformers --upgrade
!pip install accelerate --upgrade
!pip install gensim
!pip install nltk
!pip install datasets

Installing the required libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from tqdm import tqdm
import transformers
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve, auc
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import nltk
nltk.download('punkt')




Implementation of LSTM model

In [79]:
# Stack the 'text' column of the train, validation and test frames into one array
text_columns = [frame['text'] for frame in (df_train, df_validation, df_test)]
all_text = np.concatenate(text_columns)

In [80]:
# Fit a Keras Tokenizer on the combined corpus, then encode each split as integer sequences.
# NOTE(review): all_text includes the validation and test text, so the vocabulary
# is not derived from the training split alone.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

X_train_seq, X_valid_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (df_train, df_validation, df_test)
)


In [81]:
# Post-pad every split to the length of the longest sequence found in any split
split_sequences = (X_train_seq, X_valid_seq, X_test_seq)
max_sequence_length = max(len(seq) for split in split_sequences for seq in split)
X_train_padded, X_valid_padded, X_test_padded = (
    pad_sequences(split, maxlen=max_sequence_length, padding='post')
    for split in split_sequences
)


In [82]:

# Layer sizes (both deliberately reduced)
embedding_dim = 50  # output dimension of the Embedding layer
lstm_units = 64     # units in the single LSTM layer

# Embedding input size: number of entries in the tokenizer's word index, plus one
vocabulary_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM (no return_sequences) -> Dropout(0.3) -> one sigmoid unit
# for binary classification
lstm_model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(lstm_units),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is the reported metric
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [83]:
# Fit the LSTM on the padded training sequences, passing the validation split
# as validation_data
lstm_model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_valid_padded, y_valid),
    epochs=4,
    batch_size=8,
)


Epoch 1/4
Epoch 2/4
Epoch 3/4

Training with 4 epochs and a batch size of 8 crashed in the middle of the 3rd epoch, so we instead ran 5 epochs with a batch size of 16 on Google Colab using a GPU.

In [None]:
# Evaluate the LSTM model on the validation split.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single
# sigmoid output the documented replacement is to threshold the predicted
# probabilities at 0.5.
y_valid_prob_lstm = lstm_model.predict(X_valid_padded)
# ravel() flattens the one-column prediction array so it lines up with y_valid
y_valid_pred_lstm = (y_valid_prob_lstm > 0.5).astype("int32").ravel()
valid_accuracy = accuracy_score(y_valid, y_valid_pred_lstm)
# Labelled "Validation" because these metrics are computed on y_valid / X_valid_padded
print("Validation Accuracy:", valid_accuracy)
print("Validation Classification Report:\n", classification_report(y_valid, y_valid_pred_lstm))

In [None]:
# Evaluate the LSTM model on the held-out test split.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single
# sigmoid output the documented replacement is to threshold the predicted
# probabilities at 0.5.
y_test_prob_lstm = lstm_model.predict(X_test_padded)
# ravel() flattens the one-column prediction array so it lines up with y_test
y_test_pred_lstm = (y_test_prob_lstm > 0.5).astype("int32").ravel()
test_accuracy = accuracy_score(y_test, y_test_pred_lstm)
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", classification_report(y_test, y_test_pred_lstm))

Implementation of Clinical BERT model

In [None]:

from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

# Checkpoint id shared by the model weights and the tokenizer
clinical_bert_checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'

# Load the tokenizer and the sequence-classification model from that checkpoint.
# Note: this rebinds `tokenizer`, which previously held the Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained(clinical_bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(clinical_bert_checkpoint)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame)
    for frame in (df_train_bert, df_test_bert, df_validation_bert)
)


In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' entry of a batch with the module-level ``tokenizer``.

    Entries of ``examples['text']`` that are not ``str`` are printed before
    tokenization is attempted; they are neither removed nor converted.

    Args:
        examples: mapping whose 'text' entry holds the texts to tokenize.

    Returns:
        The result of calling ``tokenizer`` on the texts with
        ``padding='max_length'``, ``truncation=True`` and ``max_length=512``.
    """
    texts = examples['text']
    non_strings = [entry for entry in texts if not isinstance(entry, str)]
    if non_strings:
        print(f"Non-string items: {non_strings}")
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)


In [None]:
# Apply tokenize_function to every split in batched mode
train_dataset, test_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)


In [None]:

# Have every split return the model inputs and the label as torch tensors
for _split in (train_dataset, test_dataset, val_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`,
# and this notebook installs transformers with --upgrade. Use whichever name the
# installed TrainingArguments dataclass declares so the cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    per_device_train_batch_size=8,      # Reduce batch size for memory efficiency
    per_device_eval_batch_size=8,       # Keep it similar to per_device_train_batch_size
    num_train_epochs=4,                 # Train for 4 epochs
    save_steps=1000,                    # Save checkpoints less frequently
    eval_steps=1500,                    # Evaluate less frequently
    logging_steps=500,                  # Log less frequently
    learning_rate=3e-5,                 # Slightly higher learning rate for faster convergence
    warmup_steps=300,                   # Gradually warm up the learning rate
    weight_decay=0.01,                  # Apply L2 regularization
    output_dir='./results',
    logging_dir='./logs',
    logging_first_step=False,           # No need to log the very first step
    gradient_accumulation_steps=4,      # Further reduce memory usage with gradient accumulation
    **{_eval_strategy_arg: "steps"},    # Evaluate every `eval_steps` steps
)


In [None]:
# Compute metrics function for evaluation

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from a ``(logits, labels)`` pair.

    Args:
        eval_pred: two-item iterable ``(logits, labels)``. The predicted class
            of each row is the argmax of ``logits`` along the last axis.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'. The last
        three come from ``precision_recall_fscore_support`` with
        ``average='binary'``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        references, predicted, average='binary'
    )
    metrics = {'accuracy': accuracy_score(references, predicted)}
    metrics.update(f1=f_score, precision=prec, recall=rec)
    return metrics

In [None]:
# Bundle the model, training arguments, train/validation datasets and the
# metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Fine-tune the model with the Trainer configured above. As the cell's last
# expression, the value returned by trainer.train() is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the validation set (val_dataset) and print
# the returned metrics.
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation Results:", eval_results)


In [None]:
# Evaluate the fine-tuned model on the test set (test_dataset) and print the
# returned metrics.
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

In [None]:
# Pull the accuracy out of the test metrics and print it to two decimals.
# NOTE(review): 'eval_accuracy' is presumably the 'accuracy' entry returned by
# compute_metrics with the Trainer's "eval_" prefix — confirm against the printed
# test_results keys.
accuracy = test_results['eval_accuracy']
print(f"Accuracy on test dataset: {accuracy:.2f}")


In [None]:
import itertools

# Candidate values for each hyperparameter in the search
learning_rates = [1e-5, 3e-5, 1e-4]
batch_sizes = [4, 8, 16]
num_epochs = [2, 4, 6]
weight_decays = [0.0, 0.1, 0.2]

# Full Cartesian grid of (learning_rate, batch_size, num_epochs, weight_decay)
# tuples: 3 * 3 * 3 * 3 = 81 combinations
hyperparameter_combinations = [
    combo
    for combo in itertools.product(learning_rates, batch_sizes, num_epochs, weight_decays)
]


In [None]:
# Define evaluation metric
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from a ``(logits, labels)`` pair.

    This re-defines ``compute_metrics`` with the same behavior as the definition
    earlier in the notebook.

    Args:
        eval_pred: two-item iterable ``(logits, labels)``. The predicted class
            of each row is the argmax of ``logits`` along the last axis.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'. The last
        three come from ``precision_recall_fscore_support`` with
        ``average='binary'``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        references, predicted, average='binary'
    )
    metrics = {'accuracy': accuracy_score(references, predicted)}
    metrics.update(f1=f_score, precision=prec, recall=rec)
    return metrics

In [None]:
# Grid search: for every hyperparameter combination, fine-tune a freshly loaded
# model, then report validation and test metrics.
#
# The whole loop body (TrainingArguments -> Trainer -> train -> evaluate) must
# live in ONE cell. When it was split across cells, only the TrainingArguments
# construction ran inside the `for`, and the indented evaluate/print lines were
# an IndentationError on their own.

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever name the installed TrainingArguments dataclass declares.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# One record per combination, so the runs can be compared after the loop
search_results = []

for learning_rate, batch_size, num_train_epochs, weight_decay in hyperparameter_combinations:
    training_args = TrainingArguments(
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=1000,
        eval_steps=1500,
        logging_steps=500,
        learning_rate=learning_rate,
        warmup_steps=300,
        weight_decay=weight_decay,
        output_dir='./results',
        logging_dir='./logs',
        logging_first_step=False,
        gradient_accumulation_steps=4,
        **{_eval_strategy_arg: "steps"},
    )

    # Start every trial from the pretrained checkpoint. Reusing the already
    # fine-tuned `model` would carry weights from earlier trials into later
    # ones and make the comparison meaningless.
    trial_model = BertForSequenceClassification.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

    # Create Trainer
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on validation set
    eval_results = trainer.evaluate(eval_dataset=val_dataset)
    print("Hyperparameters:", learning_rate, batch_size, num_train_epochs, weight_decay)
    print("Validation Results:", eval_results)

    # Evaluate the model on test dataset
    test_results = trainer.evaluate(test_dataset)
    print("Test Results:", test_results)

    search_results.append({
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'num_train_epochs': num_train_epochs,
        'weight_decay': weight_decay,
        'validation': eval_results,
        'test': test_results,
    })