In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import RMSprop, Adam

In [7]:
# Read the label table; the 'image' column holds the image ids and the
# 'level' column holds the class label of each image.
labels_df = pd.read_csv('aptos-2015-training.csv')

# Turn every image id into the path of its JPEG inside 'train_resized'.
image_paths = [
    os.path.join('train_resized', image_id + '.jpg')
    for image_id in labels_df['image']
]
labels = labels_df['level'].tolist()

In [8]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Work on a NumPy array so the labels can be counted in vectorised form.
labels = np.array(labels)

# Unique classes and the number of samples in each, obtained in one
# vectorised pass. (The builtin sum() over a boolean NumPy array iterates
# element by element in Python, once per class.)
unique_classes, class_counts = np.unique(labels, return_counts=True)
class_distribution = class_counts.tolist()

# 'balanced' weights are n_samples / (n_classes * count(class)), so the rarer
# a class is, the larger its weight.
weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=labels)

# .tolist() yields native Python values, so the mapping prints the same way
# regardless of the installed NumPy version and can be serialised as-is.
class_weights = dict(zip(unique_classes.tolist(), weights.tolist()))

print("Class weights: ", class_weights)


Class weights:  {0: 0.27218907400232467, 1: 2.875644699140401, 2: 1.3275132275132275, 3: 8.047193585337915, 4: 9.922598870056497}


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. Stratifying on the labels keeps
# the class proportions the same in both splits; the fixed random_state makes
# the split repeatable.
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    image_paths,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [10]:
def preprocess_image(image_path, label):
    """Load one JPEG from disk and prepare it as ResNet50V2 input.

    The file is decoded as a 3-channel image, resized to 224x224 and passed
    through ``resnet_v2.preprocess_input``. No augmentation (flips, zoom,
    contrast) is performed here.

    Args:
        image_path: Path of the JPEG file to read.
        label: Label belonging to the image; returned unchanged.

    Returns:
        An ``(image, label)`` tuple.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    # preprocess_input takes care of the pixel scaling, so there is no
    # separate manual 1/255 rescaling step.
    model_input = tf.keras.applications.resnet_v2.preprocess_input(resized)
    return model_input, label

# Training-time augmentation: random horizontal/vertical flips, a random
# zoom-in of up to 10% and random contrast jitter.
#
# create_dataset applies this pipeline to images that have already been through
# resnet_v2.preprocess_input, i.e. with values in [-1, 1]. RandomContrast clips
# its output to [0, 255], which would set every negative pixel to 0 and feed
# the network training images that look nothing like the validation images.
# The images are therefore mapped to [0, 255] for the augmentation and mapped
# back to [-1, 1] afterwards.
data_augmentation = tf.keras.Sequential([
    layers.Rescaling(scale=127.5, offset=127.5),        # [-1, 1] -> [0, 255]
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomZoom((-0.1, 0), width_factor=None, fill_mode='reflect'),
    layers.RandomContrast(0.2),
    layers.Rescaling(scale=1.0 / 127.5, offset=-1.0),   # [0, 255] -> [-1, 1]
])

2024-03-03 01:53:24.457433: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-03 01:53:24.457944: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-03 01:53:24.458399: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [11]:
# Synchronous data-parallel training across the GPUs TensorFlow can see.
strategy = tf.distribute.MirroredStrategy()

# Shuffle buffer size (in elements) referenced by create_dataset.
BUFFER_SIZE = 1000
# Number of samples each replica processes per training step.
BATCH_SIZE_PER_REPLICA = 100
# Global batch size produced by the input pipeline: per-replica size times the
# number of replicas.
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA


def create_dataset(image_paths, labels, BATCH_SIZE, is_training=True):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        image_paths: Sequence of image file paths.
        labels: Labels aligned element-wise with ``image_paths``.
        BATCH_SIZE: Number of samples per emitted batch.
        is_training: When True, the samples are reshuffled on every iteration
            and passed through ``data_augmentation``. When False they are only
            loaded and preprocessed, in their original order.

    Returns:
        A ``tf.data.Dataset`` yielding batches of preprocessed images and
        their labels.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    if is_training:
        # Shuffle the (path, label) pairs *before* decoding. The buffer then
        # holds short strings instead of decoded float images, so it can span
        # the whole training set (a uniform shuffle) at negligible memory cost.
        # BUFFER_SIZE serves as a lower bound so the buffer size is never zero.
        shuffle_buffer = max(BUFFER_SIZE, len(image_paths))
        dataset = dataset.shuffle(buffer_size=shuffle_buffer, reshuffle_each_iteration=True)

    dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

    if is_training:
        # training=True makes the random layers active inside Dataset.map.
        dataset = dataset.map(
            lambda image, label: (data_augmentation(image, training=True), label),
            num_parallel_calls=tf.data.AUTOTUNE,
        )

    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')


In [12]:
# Training pipeline: shuffled and augmented.
train_ds = create_dataset(
    image_paths=train_image_paths,
    labels=train_labels,
    BATCH_SIZE=BATCH_SIZE,
    is_training=True,
)

# Validation pipeline: no shuffling and no augmentation.
val_ds = create_dataset(
    image_paths=val_image_paths,
    labels=val_labels,
    BATCH_SIZE=BATCH_SIZE,
    is_training=False,
)


In [13]:
# Reuse the strategy that BATCH_SIZE was derived from rather than creating a
# second MirroredStrategy, so the global batch size and the replicas consuming
# it always agree. The mirrored_strategy name is kept for later cells.
mirrored_strategy = strategy
learning_rate = 0.001

with mirrored_strategy.scope():
    # The model and optimizer are created inside the scope so that their
    # variables are mirrored across the replicas.
    base_model = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

    # Partial fine-tuning: only the last 50 layers of the backbone are
    # trainable, everything before them stays frozen at the ImageNet weights.
    base_model.trainable = True
    for layer in base_model.layers[:-50]:
        layer.trainable = False

    # Classification head: two Dense -> BatchNorm -> ReLU stages (dropout after
    # the first) followed by a 5-way softmax.
    model = tf.keras.Sequential([
        base_model,
        layers.Flatten(),
        layers.Dense(1024),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dropout(0.5),
        layers.Dense(256),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dense(5, activation='softmax'),
    ])

    # The labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:te

In [14]:


callbacks = [
    # Stop when val_loss has not improved for 30 consecutive epochs.
    # NOTE(review): training runs for EPOCHS = 30, so this patience can never
    # be exhausted; lower it if early stopping is actually wanted.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30),
    # Save the model only when val_loss improves, overwriting the previous
    # best. (A ';' used to follow this call, which is a SyntaxError inside a
    # list literal.)
    tf.keras.callbacks.ModelCheckpoint(
        '2_Distributed_Omicron_best_model',
        monitor='val_loss',
        save_best_only=True,
    ),
]

In [15]:
EPOCHS = 30

# Keep the returned History object: history.history holds the per-epoch loss
# and metric values for later plotting or inspection. Binding it to a name also
# stops the cell from ending with a bare object repr.
# class_weight re-weights the loss per class to offset the class imbalance.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    class_weight=class_weights,
    callbacks=callbacks,
)

Epoch 1/30
INFO:tensorflow:Collective all_reduce tensors: 56 all_reduces, num_devices = 5, group_size = 5, implementation = CommunicationImplementation.NCCL, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 56 all_reduces, num_devices = 5, group_size = 5, implementation = CommunicationImplementation.NCCL, num_packs = 1


2024-03-03 01:54:02.563702: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-03-03 01:54:02.600092: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-03-03 01:54:02.624195: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-03-03 01:54:02.644803: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-03-03 01:54:02.670858: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-03-03 01:54:18.919452: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f11a3123260 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-03 01:54:18.919485: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-03-03 01:54:18.919492: I external/local_xla/xla/service/service.



INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 2/30
Epoch 3/30


INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 4/30
Epoch 5/30
Epoch 6/30


INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30


INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30


INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 27/30
Epoch 28/30


INFO:tensorflow:Assets written to: 2_Distributed_Omicron_best_model/assets


Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f34f18d6a70>