In [None]:
# With LN it learns, but very slowly; let's complete the run with BN and look into LN later.

%tensorflow_version 2.x
import tensorflow as tf
# Confirm which TensorFlow build the runtime picked up.
print("Tensorflow version " + tf.__version__)
from google.colab import drive
# Mount Google Drive; the log and checkpoint paths used later in this notebook
# live under /content/gdrive.
drive.mount('/content/gdrive')

In [None]:
import numpy as np
import warnings
from imgaug import augmenters as ia
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from keras.preprocessing import image
import keras.backend as K
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.engine.topology import get_source_inputs

from tensorflow.keras.datasets import cifar10
from datetime import datetime
from tensorflow.python.keras.utils.data_utils import Sequence
import random
!pip install --upgrade imgaug

In [None]:
# Sanity-check the runtime: TensorFlow version and whether eager execution is on.
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))
# Print the imgaug version that is actually loaded in this kernel.
import imgaug
print (imgaug.__version__)

In [None]:
### class Augmentations:

class Augmentations:
    """Random augmentation operators used to build two views of every image.

    Three operators are defined: random crop (with flips), colour distortion
    and Gaussian blur. `apply_operators` samples two of them and returns both
    augmented copies of the batch, interleaved so that rows 2k and 2k+1 of the
    result come from the same source image.
    """

    def __init__(self, p=None, severity=2):
        """
        # Arguments
            p: sampling weights for (random_crop, color_distortion,
                gauss_blur). Defaults to [0.4, 0.4, 0.2].
            severity: stored on the instance; no operator in this class
                reads it.
        """
        self.severity = severity
        # Copy the weights so instances never share one list (or alias the
        # caller's list).
        self.p = [0.4, 0.4, 0.2] if p is None else list(p)

    def get_operators(self):
        """Draw two operators, weighted by `self.p`.

        Sampling is with replacement, so the same operator may be returned
        twice.
        """
        operators = [self.random_crop, self.color_distortion, self.gauss_blur]
        return random.choices(operators, weights=self.p, k=2)

    def get_opertors(self):
        """Backward-compatible alias for `get_operators` (original spelling)."""
        return self.get_operators()

    @staticmethod
    def _to_unit_float(images):
        """Cast augmented images to float32 and scale them by 1/255."""
        return np.asarray(images, dtype=np.float32) / 255.

    def random_crop(self, image):
        """Crop, randomly flip and resize a batch back to 32x32.

        # Returns
            float32 array scaled by 1/255.
        """
        seq = ia.Sequential([
                ia.Crop(px=(0, 10)),
                ia.Sometimes(0.8, ia.Fliplr(0.5)),
                ia.Sometimes(0.8, ia.Flipud(0.5)),
                ia.Resize({'height': 32, 'width': 32})])
        return self._to_unit_float(seq(images=image))

    def color_distortion(self, image):
        """Jitter brightness, contrast and saturation; sometimes grayscale.

        # Returns
            float32 array scaled by 1/255.
        """
        # The batch is cast to uint8 before the colour augmenters run.
        image = image.astype(np.uint8)
        seq = ia.Sequential([ia.MultiplyBrightness((0.5, 1.5)),
                             ia.LinearContrast((0.4, 1.6)),
                             ia.MultiplySaturation((0.5, 1.5)),
                             ia.Sometimes(0.2, ia.Grayscale(alpha=(0.0, 1.0))),
                             ia.Resize({'height': 32, 'width': 32})])
        return self._to_unit_float(seq(images=image))

    def gauss_blur(self, image):
        """Blur a batch with a random Gaussian sigma and resize to 32x32.

        # Returns
            float32 array scaled by 1/255.
        """
        seq = ia.Sequential([ia.GaussianBlur(sigma=(0, 3.0)),
                             ia.Resize({'height': 32, 'width': 32})])
        return self._to_unit_float(seq(images=image))

    def apply_operators(self, image):
        """Return two augmented views of every image, interleaved.

        # Arguments
            image: batch of images with shape (N, H, W, C).
        # Returns
            Array of shape (2N, ...): rows 2k and 2k+1 are the two views of
            input image k.
        """
        # Go through the original method name so subclasses that override
        # either spelling are still honoured.
        ops = self.get_opertors()
        first_view = ops[0](image)
        second_view = ops[1](image)
        # Stack on a new axis 1 -> (N, 2, ...), then merge the first two axes
        # so the views of one image end up adjacent.
        stack = np.stack((first_view, second_view), axis=1)
        merged_shape = (stack.shape[0] * stack.shape[1],) + stack.shape[2:]
        return stack.reshape(merged_shape)


In [None]:
#### DATA GENERATOR PIPELINE

class CIFAR10Sequence(Sequence):
    """Keras `Sequence` that yields image batches, optionally augmented.

    Only inputs are produced (no labels). With `aug=True` each batch is passed
    through `Augmentations.apply_operators`, so the returned array holds two
    views per image and has twice `batch_size` rows.
    """

    def __init__(self, x_set, batch_size, shuffle=True, aug=False):
        """
        # Arguments
            x_set: array of images, indexed along axis 0.
            batch_size: number of source images per batch.
            shuffle: if True, the sample order is shuffled at construction
                and after every epoch; if False, the original order is kept.
            aug: if True, return augmented pairs instead of the raw images.
        """
        self.x = x_set
        self.batch_size = batch_size
        self.aug = aug
        self.shuffle = shuffle
        self.indices = np.arange(self.x.shape[0])
        # Honour the flag: previously the indices were shuffled unconditionally.
        if self.shuffle:
            np.random.shuffle(self.indices)
        self.augment = Augmentations()

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch `idx` as an array, augmented when `self.aug` is set."""
        start = idx * self.batch_size
        inds = self.indices[start:start + self.batch_size]
        batch_x = np.array(self.x[inds])
        if self.aug:
            return self.augment.apply_operators(batch_x)
        return batch_x

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is on."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
# with 'he_normal'
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Bottleneck residual block whose shortcut is the unmodified input.

    # Arguments
        input_tensor: input tensor
        kernel_size: kernel size of the middle conv layer on the main path
        filters: three integers, the filter counts of the three conv layers
            on the main path
        stage: integer stage label, used to build layer names
        block: block label ('a', 'b', ...), used to build layer names
    # Returns
        Output tensor for the block.
    """
    n_first, n_middle, n_last = filters
    bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
    suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + suffix
    bn_prefix = 'bn' + suffix

    def conv_bn(tensor, n_filters, kernel, tag, padding='valid'):
        # One he_normal / L2-regularised convolution followed by batch norm.
        tensor = layers.Conv2D(n_filters, kernel,
                               padding=padding,
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l=1e-4),
                               name=conv_prefix + tag)(tensor)
        return layers.BatchNormalization(axis=bn_axis,
                                         name=bn_prefix + tag)(tensor)

    # Main path: 1x1 -> kernel_size (same padding) -> 1x1.
    out = conv_bn(input_tensor, n_first, (1, 1), '2a')
    out = layers.Activation('relu')(out)

    out = conv_bn(out, n_middle, kernel_size, '2b', padding='same')
    out = layers.Activation('relu')(out)

    out = conv_bn(out, n_last, (1, 1), '2c')

    # Residual connection: add the untouched input, then activate.
    out = layers.add([out, input_tensor])
    return layers.Activation('relu')(out)


In [None]:
def conv_block(input_tensor,
               kernel_size,
               filters,
               stage,
               block,
               strides=(2, 2)):
    """Bottleneck residual block with a projection (conv) shortcut.

    # Arguments
        input_tensor: input tensor
        kernel_size: kernel size of the middle conv layer on the main path
        filters: three integers, the filter counts of the three conv layers
            on the main path
        stage: integer stage label, used to build layer names
        block: block label ('a', 'b', ...), used to build layer names
        strides: strides of the first main-path conv layer; the shortcut
            conv uses the same strides so both branches keep matching shapes.
    # Returns
        Output tensor for the block.
    """
    n_first, n_middle, n_last = filters
    bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
    suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + suffix
    bn_prefix = 'bn' + suffix

    def conv_bn(tensor, n_filters, kernel, tag, strides=(1, 1), padding='valid'):
        # One he_normal / L2-regularised convolution followed by batch norm.
        tensor = layers.Conv2D(n_filters, kernel,
                               strides=strides,
                               padding=padding,
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l=1e-4),
                               name=conv_prefix + tag)(tensor)
        return layers.BatchNormalization(axis=bn_axis,
                                         name=bn_prefix + tag)(tensor)

    # Main path: strided 1x1 -> kernel_size (same padding) -> 1x1.
    out = conv_bn(input_tensor, n_first, (1, 1), '2a', strides=strides)
    out = layers.Activation('relu')(out)

    out = conv_bn(out, n_middle, kernel_size, '2b', padding='same')
    out = layers.Activation('relu')(out)

    out = conv_bn(out, n_last, (1, 1), '2c')

    # Projection shortcut, built after the main path as in the original.
    shortcut = conv_bn(input_tensor, n_last, (1, 1), '1', strides=strides)

    out = layers.add([out, shortcut])
    return layers.Activation('relu')(out)

In [None]:
def ResNet50(include_top=True, input_tensor=None, input_shape=(32, 32, 3), batch_size=10):
    """Build a ResNet50 encoder with an optional projection head.

    The stem is a zero padding of 3 followed by a stride-1 3x3 convolution
    with no max pooling, so a 32x32 input enters stage 2 as a 36x36 map.

    # Arguments
        include_top: if True (default), append the non-linear projection head
            (Dense 1024 with relu, then Dense 128). If False, the model
            outputs the globally average-pooled features.
        input_tensor: optional tensor to use as the model input.
        input_shape: shape of one input image.
        batch_size: static batch size given to `layers.Input`; only used when
            `input_tensor` is None.
    # Returns
        A Keras `Model` named 'resnet50_he'.
    """
    if input_tensor is None:
        img_input = layers.Input(shape=input_shape, batch_size=batch_size)
    elif not K.is_keras_tensor(input_tensor):
        img_input = layers.Input(tensor=input_tensor, shape=input_shape)
    else:
        img_input = input_tensor
    bn_axis = 3 if K.image_data_format() == 'channels_last' else 1

    x = layers.ZeroPadding2D((3, 3))(img_input)
    x = layers.Conv2D(64, (3, 3), strides=(1, 1), kernel_initializer='he_normal', kernel_regularizer=l2(l=1e-4), name='conv1')(x)
    x = layers.BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
    x = layers.Activation('relu')(x)

    # (stage, filters, block labels, strides of the stage's conv_block).
    # Each stage is one conv_block ('a') followed by identity blocks.
    stages = (
        (2, [64, 64, 256], 'abc', (1, 1)),
        (3, [128, 128, 512], 'abcd', (2, 2)),
        (4, [256, 256, 1024], 'abcdef', (2, 2)),
        (5, [512, 512, 2048], 'abc', (2, 2)),
    )
    for stage, filters, blocks, strides in stages:
        x = conv_block(x, 3, filters, stage=stage, block=blocks[0], strides=strides)
        for block in blocks[1:]:
            x = identity_block(x, 3, filters, stage=stage, block=block)

    x = layers.GlobalAveragePooling2D(name='global_avg_pool')(x)

    if include_top:
        # Non-linear projection head.
        x = layers.Dense(1024, activation='relu', name='fc1024')(x)
        x = layers.Dense(128, name='fc128')(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    return Model(inputs=inputs, outputs=x, name='resnet50_he')

In [None]:
from tensorflow.keras import backend as K

## CUSTOM LOSS
def test_custom_cosine_an(x, tau=0.5, epsilon=1e-8):
    """NT-Xent contrastive loss over a batch of interleaved positive pairs.

    Rows 2k and 2k+1 of `x` are treated as the two views of the same image.
    For every row i the loss term is

        -log( exp(s(i, partner(i)) / tau) / sum_{k != i} exp(s(i, k) / tau) )

    where `s` is cosine similarity; the returned value is the mean of these
    terms over all rows. The partner indices are built with tensor ops, so the
    batch size does not have to be known statically.

    # Arguments
        x: 2-D tensor of embeddings, one row per view.
        tau: temperature.
        epsilon: added to the product of norms to avoid division by zero.
    # Returns
        Scalar loss tensor.
    # Raises
        ValueError: if the static batch size is known and odd.
    """
    def cosine_score(x):
        # Pairwise cosine similarity. Only a lower guard is applied to the
        # norm product: clipping it from above would push the "cosine" past 1.
        vec_norm = tf.norm(x, axis=1, keepdims=True)
        magnitude = tf.multiply(vec_norm, tf.transpose(vec_norm))
        dot_product = tf.matmul(x, x, transpose_b=True)
        return dot_product / (magnitude + epsilon)

    def create_indices(x):
        # Partner of every row: even i -> i + 1, odd i -> i - 1.
        ids = tf.range(K.shape(x)[0])
        return ids + 1 - 2 * (ids % 2)

    n_rows = x.shape[0]
    if n_rows is not None and n_rows % 2 != 0:
        raise ValueError(
            "expected an even number of rows (interleaved pairs), got {}".format(n_rows))

    # temperature scaling  exp(score / tau)
    exp_cosine = tf.exp(cosine_score(x) / tau)

    # Numerator: similarity of every row with its partner. Gathering the
    # partner rows puts exp_cosine[partner(i), i] on the diagonal.
    ids = create_indices(exp_cosine)
    positives = tf.linalg.diag_part(tf.gather(exp_cosine, ids))

    # Denominator: all similarities of a row except the one with itself. The
    # real diagonal is subtracted instead of the constant exp(1 / tau), which
    # is only right when the self-similarity is exactly 1.
    deno = tf.reduce_sum(exp_cosine, axis=0) - tf.linalg.diag_part(exp_cosine)

    # Mean of the per-row negative log-likelihoods (not the log of the mean).
    return tf.reduce_mean(-tf.math.log(positives / deno))


In [None]:
## CUSTOM LOSS
def test_custom_cosine(x, tau=0.5, epsilon=1e-8):
    """NT-Xent contrastive loss over a batch of interleaved positive pairs.

    Rows 2k and 2k+1 of `x` are treated as the two views of the same image.
    For every row i the loss term is

        -log( exp(s(i, partner(i)) / tau) / sum_{k != i} exp(s(i, k) / tau) )

    where `s` is cosine similarity; the returned value is the mean of these
    terms over all rows. The partner indices are built with NumPy, so the
    batch size must be known statically.

    # Arguments
        x: 2-D tensor of embeddings, one row per view.
        tau: temperature.
        epsilon: added to the product of norms to avoid division by zero.
    # Returns
        Scalar loss tensor.
    # Raises
        ValueError: if the batch size is unknown or odd.
    """
    def cosine_score(x):
        # Pairwise cosine similarity. Only a lower guard is applied to the
        # norm product: clipping it from above would push the "cosine" past 1.
        vec_norm = tf.norm(x, axis=1, keepdims=True)
        magnitude = tf.multiply(vec_norm, tf.transpose(vec_norm))
        dot_product = tf.matmul(x, x, transpose_b=True)
        return dot_product / (magnitude + epsilon)

    # temperature scaling  exp(score / tau)
    exp_cosine = tf.exp(cosine_score(x) / tau)

    n_rows = exp_cosine.shape[0]
    if n_rows is None or n_rows % 2 != 0:
        raise ValueError(
            "expected a static, even number of rows (interleaved pairs), got {}".format(n_rows))

    # Partner of every row: flipping the lowest bit swaps 0<->1, 2<->3, ...
    ids = np.arange(n_rows) ^ 1

    # Numerator: similarity of every row with its partner. Gathering the
    # partner rows puts exp_cosine[partner(i), i] on the diagonal.
    positives = tf.linalg.diag_part(tf.gather(exp_cosine, ids))

    # Denominator: all similarities of a row except the one with itself. The
    # real diagonal is subtracted instead of the constant exp(1 / tau), which
    # is only right when the self-similarity is exactly 1.
    deno = tf.reduce_sum(exp_cosine, axis=0) - tf.linalg.diag_part(exp_cosine)

    # Mean of the per-row negative log-likelihoods (not the log of the mean).
    return tf.reduce_mean(-tf.math.log(positives / deno))


In [None]:
import os
from datetime import datetime

# Directory handed to the TensorBoard callback (on the mounted Google Drive).
log_dir= "/content/gdrive/My Drive/logs_he_regu/"
# Filename template handed to ModelCheckpoint; "{epoch}" is a placeholder
# inside the path (presumably filled in by Keras with the epoch number).
checkpoint_path = "/content/gdrive/My Drive/models_he_regu/model_{epoch}.ckpt"

In [None]:
# Learning-rate schedule: inverse-time decay starting from 1e-5.
# NOTE(review): decay_steps looks like 500 steps/epoch x 600 epochs — confirm.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
      1e-5,
      decay_steps=500 * 600,
      decay_rate=1,
      staircase=False)

# Static batch size of the model input. The training generator is created with
# batch_size=100 and augmentation on, which doubles each batch to 200 rows.
batch_size = 200

# reload pretrained weights
reload_checkpoint_path = '/content/gdrive/My Drive/models_he_regu/'
latest = tf.train.latest_checkpoint(reload_checkpoint_path)
print (latest)
# latest_checkpoint returns None when the directory holds no checkpoint; fail
# here with a clear message instead of inside load_weights.
if latest is None:
    raise FileNotFoundError(
        "No checkpoint found in " + reload_checkpoint_path)

model = ResNet50(batch_size=batch_size)
model.load_weights(latest)

output = model.layers[-1].output

# add loss function: the contrastive loss is computed from the model output
# alone, so it is attached with add_loss and compile() gets no loss argument.
myloss = test_custom_cosine(output)
model.add_loss(myloss)

# compile model
optimizer = tf.keras.optimizers.Adam(lr_schedule)
model.compile(optimizer=optimizer)
#tf.keras.utils.plot_model(model, "/content/gdrive/My Drive/resnet_he.png", show_shapes=True)
#new_model.summary()


In [None]:
from sklearn.model_selection import train_test_split

# Load CIFAR-10. Only x_train is used below; the generator yields inputs only.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print ("### Found train samples {} test samples {}".format(x_train.shape[0], x_test.shape[0]))

# With aug=True every batch of 100 images becomes 200 rows (two views per
# image), which has to match the static batch size the model was built with.
train_gen = CIFAR10Sequence(x_train, batch_size=100, shuffle=True, aug=True)

In [None]:
# TensorBoard logging into log_dir, with histograms every epoch and profiling
# of batch 3.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=3)

# Weight-only checkpoints, written only when the monitored training loss
# improves (save_best_only=True, monitor='loss', mode='min').
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_best_only=True,
        save_weights_only=True,
        monitor='loss',
        mode='min',
        verbose=1)]
callbacks.append(tensorboard_callback)

#training_history = model.fit(train_datagen.flow(X_train, Y_train, batch_size=128, shuffle=True), epochs=300, validation_data=valid_dataset, callbacks=callbacks)
# Resume training from a hard-coded epoch counter.
# NOTE(review): initial_epoch=1228 presumably matches the reloaded checkpoint — confirm.
training_history = model.fit(train_gen, initial_epoch=1228, epochs=2000, callbacks=callbacks)

In [None]:
import matplotlib.pyplot as plt
def display_training_curves(training, title, subplot):
  """Plot one training curve on the given subplot of the current figure.

  # Arguments
      training: sequence of per-epoch values to plot.
      title: metric name; used for the y-axis label and the Axes title.
      subplot: subplot specifier passed to `plt.subplot` (e.g. 211).
  """
  axes = plt.subplot(subplot)
  axes.plot(training)
  axes.set(title='model '+ title, ylabel=title, xlabel='epoch')
  axes.legend(['training'])

# Open a bare figure: display_training_curves adds its own Axes through
# plt.subplot, so plt.subplots() would leave an extra, empty Axes underneath.
plt.figure(figsize=(10, 10))
display_training_curves(training_history.history['loss'], 'loss', 211)
# Adjust the layout once the Axes has been drawn.
plt.tight_layout()

In [None]:
# NOTE(review): this rebinds `checkpoint_path` (earlier the ModelCheckpoint
# filename template) to a directory, and reads from 'models/' rather than
# 'models_he_regu/' — confirm both are intended.
checkpoint_path = '/content/gdrive/My Drive/models/'
latest = tf.train.latest_checkpoint(checkpoint_path)
print (latest)
# latest_checkpoint returns None when the directory holds no checkpoint; fail
# here with a clear message instead of inside load_weights.
if latest is None:
    raise FileNotFoundError("No checkpoint found in " + checkpoint_path)

new_model = ResNet50(batch_size=batch_size)
new_model.load_weights(latest)