*Importing libraries*

In [1]:
import numpy as np
import os
import glob
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
from collections import Counter
from imblearn.over_sampling import SMOTE

*Data Ingestion*

In [2]:
#loading all .npz files from the directory
def load_data(directory):
    images, labels, genders = [], [], []
    npz_files= glob.glob(os.path.join(directory, '*.npz'))
    for file in npz_files:
        data= np.load(file)
        images.append(data['slo_fundus'])
        labels.append(data['dr_class'])
        genders.append(data['male'])
    return np.array(images), np.array(labels), np.array(genders)

In [3]:
#paths to the datasets
train_data_dir= "C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train"
test_data_dir= "C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/test"
val_data_dir= "C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/val"

In [4]:
#loading the datasets
train_images, train_labels, train_genders= load_data(train_data_dir)
test_images, test_labels, test_genders= load_data(test_data_dir)
val_images, val_labels, val_genders= load_data(val_data_dir)

In [5]:
#checking the shape of the datasets
print("Shape of train images: ", train_images.shape)
print("Shape of test images: ", test_images.shape)
print("Shape of validation images: ", val_images.shape)

Shape of train images:  (4476, 200, 200, 3)
Shape of test images:  (1914, 200, 200, 3)
Shape of validation images:  (641, 200, 200, 3)


In [98]:
#normalizing images
train_images= train_images/255.0
test_images= test_images/255.0
val_images= val_images/255.0

In [99]:
#checking for class imbalance
data_dir= "C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train"
#getting paths to the .npz files
npz_files= glob.glob(os.path.join(data_dir, '*.npz'))
#checking values for a few sample files
sample_files= npz_files[:5]
for file in sample_files:
    data= np.load(file)
    print(f"file:{file}")
    print("Diabetic Retinopathy Class ('dr_class') value:", data['dr_class'])
    print("Gender ('male') value:", data['male'])

file:C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train\1005_right.npz
Diabetic Retinopathy Class ('dr_class') value: 0
Gender ('male') value: 0
file:C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train\1006_left.npz
Diabetic Retinopathy Class ('dr_class') value: 0
Gender ('male') value: 1
file:C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train\1006_right.npz
Diabetic Retinopathy Class ('dr_class') value: 0
Gender ('male') value: 1
file:C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1

In [100]:
#initializing counters for 'male' and 'dr_class'
gender_counter= Counter()
label_counter= Counter()

#looping through all .npz to count for occurences of values
for file in npz_files:
    data= np.load(file)
    gender_counter[int(data['male'])] +=1
    label_counter[int(data['dr_class'])] +=1

#print counts for genders and labels
print("Counts for 'male' (gender):", gender_counter)
print("Counts for 'dr_class' (label):", label_counter)

Counts for 'male' (gender): Counter({1: 2390, 0: 2086})
Counts for 'dr_class' (label): Counter({0: 3358, 1: 1118})


In [101]:
#calculating class weights to handle imbalance
class_weights= class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
class_weight_dict= {0: class_weights[0], 1: class_weights[1]}
print("Class weights:", class_weight_dict)

Class weights: {0: 0.6664681357951161, 1: 2.001788908765653}


**Modeling 1**

In [102]:
#model architecture (ResNet50)
def build_model():
    base_model= ResNet50(weights='imagenet', include_top=False, input_shape=(200, 200, 3))
    model= Sequential(
        [base_model,
        GlobalAveragePooling2D(),
        Dropout(0.5),
        Dense(1, activation='sigmoid')]
    )
    return model

#compiling and training the model
model= build_model()
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['AUC'])

history= model.fit(
    train_images, train_labels, 
    validation_data=(val_images, val_labels),
    epochs=10, 
    batch_size=32, 
    class_weight=class_weight_dict
)
#evaluating the model
results= model.evaluate(test_images, test_labels)
print("Test AUC:", results[1])

Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m734s[0m 5s/step - AUC: 0.5354 - loss: 1.2067 - val_AUC: 0.4978 - val_loss: 158.3888
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m708s[0m 5s/step - AUC: 0.5009 - loss: 1.1780 - val_AUC: 0.5166 - val_loss: 0.7089
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 5s/step - AUC: 0.5210 - loss: 0.9815 - val_AUC: 0.5000 - val_loss: 0.7011
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m693s[0m 5s/step - AUC: 0.5177 - loss: 0.8693 - val_AUC: 0.5779 - val_loss: 1.2307
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 5s/step - AUC: 0.5043 - loss: 0.9096 - val_AUC: 0.5643 - val_loss: 0.6983
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m697s[0m 5s/step - AUC: 0.5111 - loss: 0.9150 - val_AUC: 0.5000 - val_loss: 4022.0129
Epoch 7/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0

In [103]:
#calculating AUC using sklearn for comparison
test_predictions = model.predict(test_images).ravel()

#overall AUC score
overall_auc= roc_auc_score(test_labels, test_predictions)
print("Overall AUC:", overall_auc)

#separating AUC for Male and Female groups
female_indices= np.where(test_genders ==0)[0] #assuming 0= female, 1= male
male_indices= np.where(test_genders == 1)[0]

female_auc= roc_auc_score(test_labels[female_indices], test_predictions[female_indices])
male_auc= roc_auc_score(test_labels[male_indices], test_predictions[male_indices])

print("Female AUC:", female_auc)
print("Male AUC:", male_auc)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 1s/step
Overall AUC: 0.4703142914469158
Female AUC: 0.4237088000218414
Male AUC: 0.5105547661251337


**Modeling 2**

In [7]:
#loading and preprocessing the data
def preprocess_data(images, labels):
    images= images/255.0
    return images, labels

#synthetic oversampling using SMOTE
def apply_smote(images, labels):
    #reshape images to 2D for SMOTE
    n_samples, height, width, channels= images.shape
    flat_images= images.reshape(n_samples, -1)

    #applying SMOTE
    smote= SMOTE(random_state=42)
    oversampled_images, oversampled_labels= smote.fit_resample(flat_images, labels)
    
    #reshaping images back to 3D
    oversampled_images= oversampled_images.reshape(-1, height, width, channels)
    return oversampled_images, oversampled_labels

#building EfficientNetB0 model
def build_efficientnet_model():
    base_model= EfficientNetB0(weights='imagenet', include_top=False, input_shape=(200, 200, 3))
    model= Sequential(
        [base_model,
        GlobalAveragePooling2D(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')]
    )
    return model

#loading the datasets
train_images, train_labels, train_genders = load_data("C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/train")
val_images, val_labels, val_genders = load_data("C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/val")
test_images, test_labels, test_genders = load_data("C:/Users/eutomi/Downloads/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos-20241202T032045Z-001/Problem_1_Diabetic_Retinopathy_Detection_using_Color_Fundus_Photos/ODIR_Data/test")

#preprocessing the data
train_images, train_labels= preprocess_data(train_images, train_labels)
val_images, val_labels= preprocess_data(val_images, val_labels)
test_images, test_labels= preprocess_data(test_images, test_labels)

#applying SMOTE to balance the training data
oversampled_images, oversampled_labels = apply_smote(train_images, train_labels)

#building and compiling the model
model= build_efficientnet_model()
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

#training the model using the validation set
history= model.fit(
    oversampled_images, oversampled_labels,
    validation_data=(val_images, val_labels),
    epochs=10,
    batch_size=16
)

#evluating the model on the test set
results= model.evaluate(test_images, test_labels)
print("Test AUC:", results[1])

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step
Epoch 1/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 1s/step - AUC: 0.7210 - loss: 0.6201 - val_AUC: 0.5869 - val_loss: 1.4616
Epoch 2/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m542s[0m 1s/step - AUC: 0.8717 - loss: 0.4477 - val_AUC: 0.6961 - val_loss: 0.6140
Epoch 3/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m546s[0m 1s/step - AUC: 0.9086 - loss: 0.3796 - val_AUC: 0.7484 - val_loss: 0.6242
Epoch 4/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 1s/step - AUC: 0.9479 - loss: 0.2920 - val_AUC: 0.7285 - val_loss: 0.7764
Epoch 5/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 1s/step - AUC: 0.9690 - loss: 0.2253 - val_AUC: 0.6674 - val_loss: 0.9649
Epoch 6/10
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [8]:
#metrics for the model
test_predictions= model.predict(test_images).ravel()

#overall AUC score
overall_auc= roc_auc_score(test_labels, test_predictions)
print("Overall AUC:", overall_auc)

#gender-based AUC scores
female_indices= np.where(test_genders == 0)[0]
male_indices= np.where(test_genders == 1)[0]

female_auc= roc_auc_score(test_labels[female_indices], test_predictions[female_indices])
male_auc= roc_auc_score(test_labels[male_indices], test_predictions[male_indices])

print("Female AUC:", female_auc)
print("Male AUC:", male_auc)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 482ms/step
Overall AUC: 0.7071679373996789
Female AUC: 0.6729051061012484
Male AUC: 0.7361747945446289
