In [1]:
import os
import time

from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import torch

import torch_utils  # local file import
import uncertainty_baselines as ub
import utils  # local file import
from tensorboard.plugins.hparams import api as hp

# Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
# PyTorch crash with a CUDA OoM error.
tf.config.experimental.set_visible_devices([], 'GPU')

# Default value for `FLAGS.train_epochs`.
DEFAULT_NUM_EPOCHS = 90

2022-10-02 12:32:05.109577: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-02 12:32:05.492932: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-02 12:32:05.492965: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-02 12:32:05.572791: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-02 12:32:06.756536: W tensorflow/stream_executor/platform/de

In [2]:
# Data load / output flags.

class FLAGS:
    """Notebook-wide configuration namespace, read as `FLAGS.<name>`.

    The two directory settings can be overridden without editing the notebook
    by exporting `UB_RETINOPATHY_OUTPUT_DIR` / `UB_RETINOPATHY_DATA_DIR` before
    starting the kernel; when unset, the original hard-coded locations are used.
    """

    # Directory created at startup and logged as the checkpoint location.
    output_dir = os.environ.get(
        'UB_RETINOPATHY_OUTPUT_DIR',
        '/media/jishnu/Data/ub_retinopathy_outputs')
    # Directory handed to `ub.datasets.get(..., data_dir=...)` for every split.
    data_dir = os.environ.get(
        'UB_RETINOPATHY_DATA_DIR',
        '/media/jishnu/Data/ub_retinopathy')
    # True: keep the validation split separate and load it with the eval batch
    # size. False: load it in training mode and concatenate it onto the
    # training dataset.
    use_validation = True

    # Optimisation hyperparameters. They are not referenced in the cells
    # visible here; presumably consumed by a later training cell -- TODO confirm.
    base_learning_rate = 4e-4
    one_minus_momentum = 0.1
    lr_warmup_epochs = 20
    l2 = 5e-5
    dropout_rate = 0.1

    # Seed applied to the TensorFlow, NumPy and PyTorch global RNGs.
    seed = 42

    train_epochs = DEFAULT_NUM_EPOCHS
    train_batch_size = 16
    # The per-step eval batch actually used is
    # `eval_batch_size // num_dropout_samples_eval` (32 // 10 == 3).
    eval_batch_size = 32
    num_dropout_samples_eval = 10

    # Not referenced in the cells visible here.
    checkpoint_interval = 25
    num_bins = 15

    # Request CUDA; it is only used when `torch.cuda.is_available()` agrees.
    use_gpu = True

In [3]:
# Make sure the output directory exists before anything tries to write there.
tf.io.gfile.makedirs(FLAGS.output_dir)
logging.info('Saving checkpoints at %s', FLAGS.output_dir)

# Seed the global RNG of every framework in use with the configured value.
for seed_fn in (tf.random.set_seed, np.random.seed, torch.manual_seed):
    seed_fn(FLAGS.seed)

# Resolve the compute device: first CUDA device when it is both requested and
# available, otherwise the CPU.
if not (FLAGS.use_gpu and torch.cuda.is_available()):
    print('Running model on CPU.')
    device = 'cpu'
else:
    print('Running model with CUDA.')
    device = 'cuda:0'

# The eval batch is the configured eval batch size integer-divided by the
# number of dropout samples drawn per example at evaluation time.
train_batch_size = FLAGS.train_batch_size
eval_batch_size = FLAGS.eval_batch_size // FLAGS.num_dropout_samples_eval


Running model with CUDA.


In [4]:
# Batch sizes used by the loaders below. The eval batch is the configured size
# integer-divided by the number of dropout samples per example.
train_batch_size = FLAGS.train_batch_size
eval_batch_size = FLAGS.eval_batch_size // FLAGS.num_dropout_samples_eval

# As per the Kaggle challenge, we have split sizes:
# train: 35,126
# validation: 10,906
# test: 42,670
ds_info = tfds.builder('diabetic_retinopathy_detection').info
# Whole batches per pass over each split (floor division).
steps_per_epoch = ds_info.splits['train'].num_examples // train_batch_size
steps_per_validation_eval = (
    ds_info.splits['validation'].num_examples // eval_batch_size
)
steps_per_test_eval = ds_info.splits['test'].num_examples // eval_batch_size

data_dir = FLAGS.data_dir

# Training split.
dataset_train_builder = ub.datasets.get(
    'ub_diabetic_retinopathy_detection',
    split='train',
    data_dir=data_dir,
)
dataset_train = dataset_train_builder.load(batch_size=train_batch_size)

# Validation split: kept as a held-out evaluation set when
# `FLAGS.use_validation` is set, otherwise loaded in training mode so it can
# be appended to the training data.
dataset_validation_builder = ub.datasets.get(
    'ub_diabetic_retinopathy_detection',
    split='validation',
    data_dir=data_dir,
    is_training=not FLAGS.use_validation,
)
if FLAGS.use_validation:
    validation_batch_size = eval_batch_size
else:
    validation_batch_size = train_batch_size
dataset_validation = dataset_validation_builder.load(
    batch_size=validation_batch_size,
)
if not FLAGS.use_validation:
    # Note that this will not create any mixed batches of train and validation
    # images.
    dataset_train = dataset_train.concatenate(dataset_validation)

# Test split.
dataset_test_builder = ub.datasets.get(
    'ub_diabetic_retinopathy_detection',
    split='test',
    data_dir=data_dir,
)
dataset_test = dataset_test_builder.load(batch_size=eval_batch_size)


2022-10-02 12:32:22.541906: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".
2022-10-02 12:32:23.859763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Building Kaggle DR dataset with decision threshold: moderate.


2022-10-02 12:32:24.151828: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2022-10-02 12:32:24.231985: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2022-10-02 12:32:24.495720: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2022-10-02 12:32:24.504562: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2022-10-02 12:32:24.573514: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2022-10-02 12:32:24.582766: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


Building Kaggle DR dataset with decision threshold: moderate.
Building Kaggle DR dataset with decision threshold: moderate.


In [9]:
# One iterator per split; the cells below pull batches from these with next().
train_iterator, val_iterator, test_iterator = map(
    iter, (dataset_train, dataset_validation, dataset_test))

In [10]:
# Check dataset stats

def get_torch_inputs(inputs, batch_size, image_h, image_w, device):
    """Turn one batch from the TF pipeline into torch tensors on `device`.

    Args:
        inputs: Mapping with a 'features' and a 'labels' entry; each value
            must provide `_numpy()` returning a NumPy array.
        batch_size: Leading dimension the image tensor is viewed with.
        image_h: Image height used for the view.
        image_w: Image width used for the view.
        device: Target passed to `Tensor.to`.

    Returns:
        A pair `(images, labels)`: `images` viewed as
        `(batch_size, 3, image_h, image_w)`, and `labels` cast with `.float()`.
    """
    feature_tensor = inputs['features']
    label_tensor = inputs['labels']

    # NOTE(review): `view` only reinterprets the existing memory order; it does
    # not move the channel axis. If the features arrive channels-last this is
    # not a true NCHW transpose -- confirm against the dataset layout.
    target_shape = (batch_size, 3, image_h, image_w)
    images = torch.from_numpy(feature_tensor._numpy())
    images = images.view(target_shape).to(device)

    labels = torch.from_numpy(label_tensor._numpy())
    labels = labels.to(device).float()
    return images, labels

# Sanity-check the first batch of each split (train, validation, test): pull a
# single batch, convert it to torch tensors, and print its shapes, value range
# and distinct values. The three splits previously had three copy-pasted loops;
# they only differ in iterator, batch size and step count, so they share one
# loop here and print exactly the same values in the same order.
image_h = 512
image_w = 512
for split_iterator, split_batch_size, split_steps in (
        (train_iterator, train_batch_size, steps_per_epoch),
        (val_iterator, eval_batch_size, steps_per_validation_eval),
        (test_iterator, eval_batch_size, steps_per_test_eval),
):
    # A split with zero whole batches is skipped, matching the original
    # `for step in range(n): ...; break` which never entered its body then.
    if split_steps <= 0:
        continue
    inputs = next(split_iterator)
    images, labels = get_torch_inputs(
        inputs, split_batch_size, image_h, image_w, device)
    print(images.shape)
    print(labels.shape)
    print(images.min())
    print(images.max())
    print(images.unique())
    print(labels.unique())


torch.Size([16, 3, 512, 512])
torch.Size([16])
tensor(0., device='cuda:0')
tensor(1., device='cuda:0')
tensor([0.0000e+00, 1.0771e-06, 1.8849e-06,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00], device='cuda:0')
tensor([0., 1.], device='cuda:0')
torch.Size([3, 3, 512, 512])
torch.Size([3])
tensor(0., device='cuda:0')
tensor(1., device='cuda:0')
tensor([0.0000e+00, 5.5651e-06, 3.3390e-05,  ..., 9.9991e-01, 9.9997e-01,
        1.0000e+00], device='cuda:0')
tensor([0., 1.], device='cuda:0')
torch.Size([3, 3, 512, 512])
torch.Size([3])
tensor(0., device='cuda:0')
tensor(1., device='cuda:0')
tensor([0.0000e+00, 2.5132e-05, 5.5769e-05,  ..., 9.9995e-01, 9.9997e-01,
        1.0000e+00], device='cuda:0')
tensor([0.], device='cuda:0')


In [13]:
from tqdm import tqdm

# Walk through a full pass of each split (train, validation, test) and convert
# every batch to torch tensors; nothing else is done with the batches, the
# progress bars just report how fast the input pipeline can be consumed.
#
# NOTE(review): this keeps pulling from the iterators created in an earlier
# cell, so the result depends on how far they have already been advanced. If
# the evaluation datasets are finite, re-running this cell may raise
# StopIteration -- confirm, and re-create the iterators first if so.
for split_iterator, split_batch_size, split_steps in (
        (train_iterator, train_batch_size, steps_per_epoch),
        (val_iterator, eval_batch_size, steps_per_validation_eval),
        (test_iterator, eval_batch_size, steps_per_test_eval),
):
    for step in tqdm(range(split_steps)):
        inputs = next(split_iterator)
        images, labels = get_torch_inputs(
            inputs, split_batch_size, image_h, image_w, device)

100%|████████████████████████████████████████████████| 2195/2195 [01:10<00:00, 31.05it/s]
100%|███████████████████████████████████████████████| 3635/3635 [00:12<00:00, 279.76it/s]
100%|█████████████████████████████████████████████| 14223/14223 [01:13<00:00, 194.67it/s]
