In [None]:

!pip install tensorflow scikit-learn pandas matplotlib -q

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

# Load dataset: read the clinical-notes CSV from the mounted drive into a DataFrame.
NOTES_CSV_PATH = '/content/gdrive/MyDrive/datamining/clinical_notes.csv'
data = pd.read_csv(NOTES_CSV_PATH)



# Data Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization settings
MAX_WORDS = 5000       # passed to Tokenizer(num_words=...) and used as the Embedding input size
MAX_SEQ_LENGTH = 500   # every note is padded/truncated to this many token ids

# Split data based on 'use' column.
train_indices = data['use'] == 'training'
validation_indices = data['use'] == 'validation'
test_indices = data['use'] == 'test'

# Plain NumPy boolean masks: the feature/label arrays below are NumPy arrays,
# so they are indexed positionally rather than through the DataFrame index.
train_mask = train_indices.to_numpy()
validation_mask = validation_indices.to_numpy()
test_mask = test_indices.to_numpy()

# Fail early with a clear message instead of training/evaluating on nothing.
for split_name, split_mask in (('training', train_mask),
                               ('validation', validation_mask),
                               ('test', test_mask)):
    if not split_mask.any():
        raise ValueError(f"No rows with use == '{split_name}' found in the dataset.")

# Missing notes would otherwise reach the tokenizer as float NaN and crash it.
notes = data['note'].fillna('').astype(str)

# Fit the vocabulary on the training notes only, so that validation/test text
# does not influence which words are kept (avoids leaking held-out data).
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(notes[train_mask])
sequences = tokenizer.texts_to_sequences(notes)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')


# Encode labels
label_map = {'yes': 1, 'no': 0}
labels = data['glaucoma']
# Only map while the column still holds the raw string labels; re-running this
# cell on an already-encoded column must not turn every label into NaN.
if not pd.api.types.is_numeric_dtype(labels):
    labels = labels.map(label_map)
if labels.isna().any():
    raise ValueError(
        "Column 'glaucoma' contains missing values or labels other than "
        f"{sorted(label_map)}; cannot encode them as 0/1."
    )
data['glaucoma'] = labels.astype('int64')

label_values = data['glaucoma'].to_numpy()

X_train, y_train = padded_sequences[train_mask], label_values[train_mask]
X_val, y_val = padded_sequences[validation_mask], label_values[validation_mask]
X_test, y_test = padded_sequences[test_mask], label_values[test_mask]

race_test = data['race'].to_numpy()[test_mask]  # For per-race evaluation on the test split


# 4. Define Models
def build_lstm_model():
    """Build and compile the LSTM binary classifier.

    Architecture: Input(MAX_SEQ_LENGTH) -> Embedding(MAX_WORDS, 128) ->
    LSTM(64) -> Dense(64, relu) -> Dense(1, sigmoid).

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    model = tf.keras.Sequential([
        # The input shape is declared with an explicit Input layer; the
        # Embedding `input_length` argument is deprecated in recent Keras.
        tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,)),
        # Sequences are post-padded with 0, so mask index 0: without the mask
        # the LSTM's final state is computed after running over the padding.
        tf.keras.layers.Embedding(MAX_WORDS, 128, mask_zero=True),
        tf.keras.layers.LSTM(64, return_sequences=False),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_cnn_model():
    """Build and compile the 1D-CNN binary classifier.

    Architecture: Input(MAX_SEQ_LENGTH) -> Embedding(MAX_WORDS, 128) ->
    Conv1D(128 filters, kernel size 5, relu) -> GlobalMaxPooling1D ->
    Dense(64, relu) -> Dense(1, sigmoid).

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    model = tf.keras.Sequential([
        # The input shape is declared with an explicit Input layer; the
        # Embedding `input_length` argument is deprecated in recent Keras.
        tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,)),
        tf.keras.layers.Embedding(MAX_WORDS, 128),
        tf.keras.layers.Conv1D(128, 5, activation='relu'),
        # Keep the strongest response of each filter across the sequence.
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_transformer_model():
    """Build and compile the attention-based binary classifier.

    Token ids of length MAX_SEQ_LENGTH are embedded into 128-dimensional
    vectors, passed through a single multi-head self-attention layer
    (4 heads, key_dim 128, dropout 0.1), averaged over the sequence axis and
    fed to Dense(64, relu) -> Dense(1, sigmoid).

    Note: the model contains only the attention layer — no positional
    encoding, residual connection, layer normalization or feed-forward
    sub-block is added here.

    Returns:
        A compiled ``tf.keras.Model`` (Adam optimizer, binary cross-entropy
        loss, accuracy metric).
    """
    token_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,))
    token_vectors = tf.keras.layers.Embedding(MAX_WORDS, 128)(token_ids)

    # Self-attention: the embedded sequence serves as both query and value.
    self_attention = tf.keras.layers.MultiHeadAttention(
        num_heads=4, key_dim=128, dropout=0.1
    )
    attended = self_attention(token_vectors, token_vectors)

    # Collapse the sequence axis by averaging, then classify.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(attended)
    hidden = tf.keras.layers.Dense(64, activation='relu')(pooled)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.Model(inputs=token_ids, outputs=probability)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# 5. Training and Evaluation
def _safe_auc(y_true, y_score):
    """Return the ROC AUC, or NaN when it is undefined for this subset.

    ``roc_auc_score`` raises when ``y_true`` is empty or contains a single
    class, which can happen for a small demographic subgroup; NaN lets the
    evaluation of the remaining groups and models continue.
    """
    if np.unique(y_true).size < 2:
        return float('nan')
    return roc_auc_score(y_true, y_score)


models = {
    'LSTM': build_lstm_model(),
    '1D CNN': build_cnn_model(),
    'Transformer': build_transformer_model()
}

history = {}    # model name -> Keras History object returned by fit()
aucs = {}       # model name -> overall test ROC AUC
race_aucs = {}  # model name -> {race: test ROC AUC within that race}

RACE_GROUPS = ('asian', 'black', 'white')
MAX_EPOCHS = 120             # upper bound; early stopping normally ends training sooner
EARLY_STOPPING_PATIENCE = 5  # epochs without val_loss improvement before stopping

for model_name, model in models.items():
    print(f"Training {model_name}...")
    # Stop once validation loss stops improving and roll back to the best
    # epoch, so the test metrics are not computed from overfit weights and
    # compute is not spent on epochs that only increase val_loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
    )
    history[model_name] = model.fit(
        X_train, y_train, validation_data=(X_val, y_val),
        epochs=MAX_EPOCHS, batch_size=32, verbose=1,
        callbacks=[early_stopping],
    )

    # Compute Overall AUC
    y_pred = model.predict(X_test).ravel()
    aucs[model_name] = roc_auc_score(y_test, y_pred)
    print(f"Overall AUC for {model_name}: {aucs[model_name]}")

    # Compute AUC per racial group
    race_aucs[model_name] = {}
    for race in RACE_GROUPS:
        race_mask = race_test == race
        race_auc = _safe_auc(y_test[race_mask], y_pred[race_mask])
        race_aucs[model_name][race] = race_auc
        if np.isnan(race_auc):
            print(
                f"  {race} AUC for {model_name}: undefined "
                f"({int(race_mask.sum())} test samples, fewer than two classes)"
            )
        else:
            print(f"  {race} AUC for {model_name}: {race_auc}")

# 6. Visualization
# Draw one validation-loss curve per trained model on shared axes.
for name in history:
    val_losses = history[name].history['val_loss']
    plt.plot(val_losses, label=f'{name} Loss')
plt.title('Validation Loss Comparison')
plt.legend()
plt.show()

# 7. Summarize Results
# Dump the collected metrics: overall test AUC per model, then the per-race breakdown.
print("Overall AUC Scores:", aucs)
print("Race AUC Scores:", race_aucs)


Mounted at /content/gdrive
Training Transformer...
Epoch 1/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m639s[0m 3s/step - accuracy: 0.5048 - loss: 0.6949 - val_accuracy: 0.5310 - val_loss: 0.7171
Epoch 2/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m674s[0m 3s/step - accuracy: 0.6887 - loss: 0.5794 - val_accuracy: 0.7760 - val_loss: 0.4592
Epoch 3/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m626s[0m 3s/step - accuracy: 0.8090 - loss: 0.4233 - val_accuracy: 0.7770 - val_loss: 0.4479
Epoch 4/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 3s/step - accuracy: 0.8426 - loss: 0.3607 - val_accuracy: 0.7650 - val_loss: 0.5163
Epoch 5/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m636s[0m 3s/step - accuracy: 0.8712 - loss: 0.2941 - val_accuracy: 0.7870 - val_loss: 0.5169
Epoch 6/200
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m679s[0m 3s/step - accuracy: 0.8893 - loss: 0.2534 - val_a