## Training on TPU

### Install packages

In [2]:
%%capture
# Install/upgrade the `efficientnet` package that the imports cell relies on
# (`efficientnet.tfkeras`). %%capture hides pip's output for this cell.
!pip install -U efficientnet

### Imports

In [3]:
from pathlib import Path
from functools import partial

import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
import wandb
from wandb.keras import WandbCallback
from sklearn import model_selection

import efficientnet.tfkeras as efn
from kaggle_datasets import KaggleDatasets

## WandB Login

In [4]:
!wandb login f137298421da563b24639d1287dd3ce5da537814

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [5]:
# Free-text description attached to this W&B run; records how the validation
# split was made for this experiment.
notes = "1 out of 16 tfrec files used for validation set"
# Start a run in the "kaggle-melanoma" project.
wandb.init(project="kaggle-melanoma", notes=notes)

[34m[1mwandb[0m: Wandb version 0.8.36 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


W&B Run: https://app.wandb.ai/nisarahamedk/kaggle-melanoma/runs/319xv0vh

## TPU Config

Detect hardware and return appropriate distribution strategy

In [6]:
# Try to resolve a TPU cluster. If the resolver raises ValueError, fall back to
# `tpu = None` so the next cell can pick a non-TPU strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # no argument needed when the TPU_NAME env variable is set, which is the case on Kaggle
    print("Running on TPU: ", tpu.master())
except ValueError:
    tpu = None

Running on TPU:  grpc://10.0.0.2:8470


In [7]:
if not tpu:
    # No TPU resolver available: use the default strategy for whatever
    # hardware is present.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the
    # TPU distribution strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  8


## Dataset from GCS for TPU

In [8]:
# GCS location of the competition dataset; the TFRecord globs below are built from it.
GCS_DS_PATH = KaggleDatasets().get_gcs_path("siim-isic-melanoma-classification")

In [9]:
# Display the resolved bucket path.
GCS_DS_PATH

'gs://kds-0b2c68d2b2fa4692fcffc1029c606b32dd6a88de8d6da08fcd30d0c4'

In [None]:
# Shell out to gsutil to inspect the contents of the dataset bucket.
!gsutil ls $GCS_DS_PATH # list the GCS bucket

In [10]:
# Collect the train/test TFRecord file paths under <bucket>/tfrecords/.
train_files = tf.io.gfile.glob(GCS_DS_PATH + "/tfrecords/train*")
test_files = tf.io.gfile.glob(GCS_DS_PATH + "/tfrecords/test*")

In [11]:
# Number of train and test TFRecord files found.
len(train_files), len(test_files)

(16, 16)

### Train, Valid split

In [None]:
# Local (mounted) copy of the competition data; used for reading train.csv.
LOCAL_DS_PATH = Path("/kaggle/input/siim-isic-melanoma-classification")

In [None]:
# Load the training metadata table; the fold-assignment cell reads its
# "target" column and the fold-dict cell reads "image_name".
train_df = pd.read_csv(LOCAL_DS_PATH / "train.csv")
train_df.head()

Assign a fold id to each image using StratifiedKFold

In [None]:
# Sentinel value: any row still at -1 after the loop was not assigned a fold.
train_df["kfold"] = -1

y = train_df["target"].values

# shuffle=True without a seed yields different folds on every run, which makes
# experiments impossible to compare; pin random_state for reproducible splits.
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the held-out indices of each split are needed. The loop variable is
# deliberately NOT called `fold`: that name is used later for the
# image_name -> fold dict, and re-running this cell must not clobber it.
# NOTE: .loc with positional indices relies on train_df having its default
# RangeIndex (as produced by pd.read_csv).
for fold_id, (_, valid_idx) in enumerate(skf.split(X=train_df, y=y)):
    train_df.loc[valid_idx, "kfold"] = fold_id

train_df.head()

This way, when we run training with fold=1, images with column "kfold=1" will be kept in validation set, others in training set.

In [None]:
# Number of images assigned to each fold.
train_df["kfold"].value_counts()

### tf.Dataset pipeline 

Building the complete tfrecord -> model data pipeline

In [None]:
# Raw (unparsed) record datasets, used below to inspect what each record contains.
train_dataset = tf.data.TFRecordDataset(train_files, num_parallel_reads=tf.data.experimental.AUTOTUNE)
test_dataset = tf.data.TFRecordDataset(test_files, num_parallel_reads=tf.data.experimental.AUTOTUNE)

* Checking the feature description of the tfrecord files

We have features: "image", "image_name" and "target"

In [None]:
# test set
for item in test_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(item.numpy())
    print(example)

We have "image" and "image_name" for the test dataset

#### Feature description


In [12]:
# Schema used by tf.io.parse_single_example for training records:
# every feature is a scalar (shape []).
train_feature_desc = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "image_name": tf.io.FixedLenFeature([], tf.string), # used to filter the images that belong to the validation set
    "target": tf.io.FixedLenFeature([], tf.int64),
}

# Schema for test records: same as training but without "target".
test_feature_desc = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "image_name": tf.io.FixedLenFeature([], tf.string),
}

Using the above feature description:
* Let's load the dataset from the tfrecord files.
* Process it using the feature description.
* Decode each sample into an image.
* Return (image, target) pairs.

*Read from bottom to top*

In [24]:
# Side length that decode_image reshapes each decoded image to before resizing.
TPU_IMAGE_SIZE = 1024
# Side length images are resized to; also used as the model's input_shape.
INPUT_IMAGE_SIZE = 512

In [25]:
# image_name -> fold id mapping, consumed by train_filter_fn / valid_filter_fn.
# Requires the cells that load train_df and add its "kfold" column to have run.
fold = train_df.set_index("image_name")["kfold"].to_dict()
fold

NameError: name 'train_df' is not defined

In [26]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB image of INPUT_IMAGE_SIZE x INPUT_IMAGE_SIZE.

    The decoded pixels are cast to float32 and divided by 255, given an explicit
    [TPU_IMAGE_SIZE, TPU_IMAGE_SIZE, 3] shape, and then resized.
    """
    side = TPU_IMAGE_SIZE
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    pixels = tf.cast(pixels, tf.float32) / 255.0
    # An explicit static shape is needed for TPU.
    pixels = tf.reshape(pixels, [side, side, 3])
    return tf.image.resize(pixels, [INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE])
                           
def parse_test_example(example):
    """Parse one serialized test record into a feature dict using test_feature_desc."""
    return tf.io.parse_single_example(example, test_feature_desc)

def parse_train_example(example):
    """Parse one serialized train record into a feature dict using train_feature_desc."""
    return tf.io.parse_single_example(example, train_feature_desc)

def process_train_example(example):
    """Turn a parsed train record into an (image, label) pair.

    The image comes from decode_image; the label is "target" cast to int32.
    """
    return decode_image(example["image"]), tf.cast(example["target"], tf.int32)

def process_test_example(example):
    """Turn a parsed test record into an (image, image_name) pair."""
    return decode_image(example["image"]), example["image_name"]

def _build_folds_table():
    """Convert the global `fold` dict (image_name -> fold id) into a TF lookup table.

    Names missing from the dict look up to -1.
    """
    img_names = tf.constant(list(fold.keys()))
    fold_idx = tf.constant(list(fold.values()))
    folds_init = tf.lookup.KeyValueTensorInitializer(img_names, fold_idx)
    return tf.lookup.StaticHashTable(folds_init, -1)


def train_filter_fn(example, valid_fold=1):
    """Predicate for Dataset.filter: keep examples that are NOT in the validation fold.

    Args:
        example: parsed record dict containing "image_name".
        valid_fold: fold id held out for validation (default 1, the value that
            used to be hard-coded).

    Returns:
        Boolean tensor, True when the example belongs to the training split.
        Images absent from `fold` look up to -1 and are therefore kept.
    """
    return _build_folds_table().lookup(example["image_name"]) != valid_fold


def valid_filter_fn(example, valid_fold=1):
    """Predicate for Dataset.filter: keep only examples in the validation fold.

    Args:
        example: parsed record dict containing "image_name".
        valid_fold: fold id held out for validation (default 1, the value that
            used to be hard-coded).

    Returns:
        Boolean tensor, True when the example's fold id equals `valid_fold`.
    """
    return _build_folds_table().lookup(example["image_name"]) == valid_fold
    
def load_dataset_from_tfrecord(filenames, ds_type="train", ordered=False):
    """Build a parsed and decoded tf.data pipeline from TFRecord files.

    Args:
        filenames: list of TFRecord file paths to read.
        ds_type: "train" or "valid" yields (image, label) pairs; any other
            value is treated as the test set and yields (image, image_name).
        ordered: when False, deterministic ordering is switched off so records
            can be consumed as soon as they arrive from any file.

    Returns:
        An unbatched tf.data.Dataset.
    """
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=tf.data.experimental.AUTOTUNE)

    # We read from multiple files and usually do not care about the order,
    # so allow non-deterministic reads unless the caller asks for ordering.
    if not ordered:
        ignore_order = tf.data.Options()
        ignore_order.experimental_deterministic = False
        # with_options returns a NEW dataset; the previous code discarded the
        # result, so the option was silently never applied.
        dataset = dataset.with_options(ignore_order)

    # parse each example with feature description
    if ds_type in ["train", "valid"]:
        dataset = dataset.map(parse_train_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # Fold-based filtering is currently disabled:
        # dataset = dataset.filter(train_filter_fn if ds_type=="train" else valid_filter_fn)
        dataset = dataset.map(process_train_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    else:
        dataset = dataset.map(parse_test_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.map(process_test_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return dataset
    

Data augmentation for the image

In [27]:
def data_augment(image, label):
    """Randomly flip the image left/right; the label passes through unchanged."""
    # https://www.wouterbulten.nl/blog/tech/data-augmentation-using-tensorflow-data-dataset/
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label

Finally, the data pipeline functions that put all of these together

In [28]:
# Global batch size: 16 examples for each replica in the strategy.
batch_size = 16 * strategy.num_replicas_in_sync


def get_training_dataset():
    """Training pipeline: all train files except the first, augmented, shuffled, batched, prefetched."""
    return (
        load_dataset_from_tfrecord(train_files[1:], ds_type="train")
        .map(data_augment, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

In [29]:
def get_validation_dataset():
    """Validation pipeline: the first train TFRecord file, batched and prefetched.

    No augmentation and no shuffling are applied. Validation metrics should be
    computed on the unmodified images so that they are deterministic and
    comparable between epochs and runs, and sample order does not affect
    metrics aggregated over the whole set.
    """
    dataset = load_dataset_from_tfrecord([train_files[0]], ds_type="valid")
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

In [30]:
def get_test_dataset(ordered=False):
    """Test pipeline yielding batches of (image, image_name).

    Args:
        ordered: passed through to load_dataset_from_tfrecord; use True when
            predictions must be matched back to image names.
    """
    dataset = load_dataset_from_tfrecord(test_files, ds_type="test", ordered=ordered)
    dataset = dataset.batch(batch_size)
    # prefetch returns a new dataset; the result was previously discarded,
    # so prefetching never took effect.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

Sanity check

In [31]:
# Pull three batches from each pipeline and print their shapes, to confirm the
# pipelines produce the expected batch layout.
print("*** Training set shapes *****")
for image, label in get_training_dataset().take(3):
    print(image.numpy().shape, label.numpy().shape)
    
# `label` still holds the last batch seen in the loop above.
print("*** Training set labels: ", label.numpy())

print("*** Validation set shapes *****")
for image, label in get_validation_dataset().take(3):
    print(image.numpy().shape, label.numpy().shape)
    
print("*** Validation set labels: ", label.numpy())


print("*** Test set shape ***")
for image, image_name in get_test_dataset().take(3):
    print(image.numpy().shape, image_name.numpy().shape)
# Convert the names of the last batch to unicode strings for display.
print("*** Test set image_name: ", image_name.numpy().astype("U"))

*** Training set shapes *****
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
*** Training set labels:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
*** Validation set shapes *****
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
*** Validation set labels:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
*** Test set shape ***
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
(128, 512, 512, 3) (128,)
*** Test set image_name:  ['ISIC_3009035' 'ISIC_1579773' 'ISIC_6082685' 'ISIC_1263999'
 'ISIC_4348477' 'I

## Model - Efficient Net 

In [32]:
# Build the network inside the strategy scope so that its variables are
# created under the selected distribution strategy.
with strategy.scope():
    # ImageNet-pretrained EfficientNet-B7 without its classification top.
    backbone = efn.EfficientNetB7(
        input_shape=[INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3],
        weights="imagenet",
        include_top=False,
    )

    # Classification head: pooling, four Dense+Dropout stages, sigmoid output.
    head = [keras.layers.GlobalAveragePooling2D()]
    for units, drop_rate in [(1024, 0.3), (512, 0.2), (256, 0.2), (128, 0.1)]:
        head.append(keras.layers.Dense(units, activation="relu"))
        head.append(keras.layers.Dropout(drop_rate))
    head.append(keras.layers.Dense(1, activation="sigmoid"))

    model = keras.Sequential([backbone, *head])

In [33]:
# Compile inside the same strategy scope the model was built in: the AUC metric
# (and the optimizer state) create variables, and those must be created under
# the distribution strategy to be usable on the replicas.
with strategy.scope():
    model.compile(
        optimizer="adam",
        # Binary cross-entropy with label smoothing instead of the plain
        # "binary_crossentropy" loss.
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.1),
        metrics=["accuracy", keras.metrics.AUC()]
    )

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnet-b7 (Model)      (None, 16, 16, 2560)      64097680  
_________________________________________________________________
global_average_pooling2d_1 ( (None, 2560)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              2622464   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)              

## Training

In [None]:
# Train for 10 epochs, validating after each one; WandbCallback is attached so
# the run started by wandb.init receives the training logs.
history = model.fit(get_training_dataset(), validation_data=get_validation_dataset(), epochs=10, callbacks=[WandbCallback()])

Epoch 1/10


## Submission

In [None]:
# ordered=True: the test set is iterated again later to collect image names,
# so both passes must see the records in the same order.
test_ds = get_test_dataset(ordered=True)

print("Computing predictions...")
# Keep only the images; the model does not take the image names as input.
test_image_ds = test_ds.map(lambda image, image_name: image)
# Flatten the model output into a 1-D array of probabilities.
probs = model.predict(test_image_ds).flatten()
print(probs)

In [None]:
import re
def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain "-<count>." (e.g. "flowers00-230.tfrec"
    holds 230 items); the counts of all files are added together.
    """
    pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
print("Generating submission file")
# Total number of test images, taken from the counts embedded in the file names.
num_test_images = count_data_items(test_files)
# Second pass over the test set, keeping only the image names (one per element).
test_image_names_ds = test_ds.map(lambda image, image_name: image_name).unbatch()

# Gather every name in a single batch and convert the byte strings to unicode.
test_image_names = next(iter(test_image_names_ds.batch(num_test_images))).numpy().astype('U') # all in one batch
# Write "image_name,target" rows; names and probs are paired by position, which
# assumes both passes over test_ds yielded the records in the same order.
np.savetxt('submission.csv', np.rec.fromarrays([test_image_names, probs]), fmt=['%s', '%f'], delimiter=',', header='image_name,target', comments='')
!head submission.csv