In [1]:
# Import packages
from __future__ import print_function
import keras
from keras.datasets import cifar100
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os
import matplotlib.pyplot as plt
%matplotlib inline

import keras.backend as K
import tensorflow as tf

import json
import time


## Define custom loss functions and metrics

In [2]:
def superclass1_loss(y_true,y_pred):
    # This is the cross entropy loss at the superclass level (20 superclasses)
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    part1 = [[4, 30, 55, 72, 95],[1, 32, 67, 73, 91],[54, 62, 70, 82, 92],
             [9, 10, 16, 28, 61],[0, 51, 53, 57, 83],[22, 39, 40, 86, 87],
             [5, 20, 25, 84, 94],[6, 7, 14, 18, 24], [3, 42, 43, 88, 97],
             [12, 17, 37, 68, 76], [23, 33, 49, 60, 71], [15, 19, 21, 31, 38],
             [34, 63, 64, 66, 75], [26, 45, 77, 79, 99], [2, 11, 35, 46, 98],
             [27, 29, 44, 78, 93], [36, 50, 65, 74, 80], [47, 52, 56, 59, 96],
             [8, 13, 48, 58, 90], [41, 69, 81, 85, 89]]
    loss1=0
    for sublist in part1:
        ytt_temp = K.sum(tf.gather(y_true,sublist,axis=1),axis=-1)
        ypt_temp = K.sum(tf.gather(y_pred,sublist,axis=1),axis=-1)
        loss1+=ytt_temp*(-1)*K.log(ypt_temp)
    return loss1

In [3]:
def superclass2_loss(y_true,y_pred):
    # This is the cross entropy loss at the level 1 up from superclass (8 choices)
    # 4 animals (human, mammal, water, other)
    # plant, landscape, and 2 objects (vehicle, other)
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    part2 = [[4, 30, 55, 72, 95, 1, 32, 67, 73, 91],
             [54, 62, 70, 82, 92, 0, 51, 53, 57, 83, 47, 52, 56, 59, 96],
             [6, 7, 14, 18, 24, 26, 45, 77, 79, 99, 27, 29, 44, 78, 93],
             [3, 42, 43, 88, 97, 15, 19, 21, 31, 38, 34, 63, 64, 66, 75, 36, 50, 65, 74, 80],
             [2, 11, 35, 46, 98],
             [23, 33, 49, 60, 71, 12, 17, 37, 68, 76],
             [9, 10, 16, 28, 61, 22, 39, 40, 86, 87, 5, 20, 25, 84, 94],
             [8, 13, 48, 58, 90, 41, 69, 81, 85, 89]]

    loss2=0
    for sublist in part2:
        ytt_temp = K.sum(tf.gather(y_true,sublist,axis=1),axis=-1)
        ypt_temp = K.sum(tf.gather(y_pred,sublist,axis=1),axis=-1)
        loss2+=ytt_temp*(-1)*K.log(ypt_temp)
    return loss2

In [4]:
def superclass3_loss(y_true,y_pred):
    # This is the cross entropy loss at the level 2 up from superclass (4 choices)
    # animal, plant, landscape, object (manmade)
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    part3 = [[4, 30, 55, 72, 95, 1, 32, 67, 73, 91, 6, 7, 14, 18, 24, 26, 45, 77, 79,
              99, 27, 29, 44, 78, 93, 3, 42, 43, 88, 97, 15, 19, 21, 31, 38, 34, 63, 64,
              66, 75, 36, 50, 65, 74, 80, 2, 11, 35, 46, 98],
             [54, 62, 70, 82, 92, 0, 51, 53, 57, 83, 47, 52, 56, 59, 96],
             [23, 33, 49, 60, 71, 12, 17, 37, 68, 76],
             [9, 10, 16, 28, 61, 22, 39, 40, 86, 87, 5, 20, 25, 84, 94, 8, 13, 48, 58,
              90, 41, 69, 81, 85, 89]]
    loss3=0
    for sublist in part3:
        ytt_temp = K.sum(tf.gather(y_true,sublist,axis=1),axis=-1)
        ypt_temp = K.sum(tf.gather(y_pred,sublist,axis=1),axis=-1)
        loss3+=ytt_temp*(-1)*K.log(ypt_temp)
    return loss3

In [5]:
def struct1_loss(y_true,y_pred):
    # This is an equal blend of superclass_loss and "normal" (class-level) loss
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    loss0 = K.sum(y_true*(-1)*K.log(y_pred), axis=-1)
    loss1 = superclass1_loss(y_true, y_pred)
    return .5*loss0+.5*loss1

In [6]:
def struct2_loss(y_true,y_pred):
    # This is an equal blend of superclass2_loss, superclass1_loss 
    # and "normal" (class-level) loss
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    loss0 = K.sum(y_true*(-1)*K.log(y_pred), axis=-1)
    loss1 = superclass1_loss(y_true, y_pred)
    loss2 = superclass2_loss(y_true, y_pred)
    return (1/3)*loss0+(1/3)*loss1+(1/3)*loss2

In [7]:
def struct3_loss(y_true,y_pred):
    # This is an equal blend of superclass3_loss,superclass2_loss, 
    # superclass1_loss and "normal" (class-level) loss
    y_true = tf.clip_by_value(y_true, 1e-16, 1-(1e-16))
    y_pred = tf.clip_by_value(y_pred, 1e-16, 1-(1e-16))
    loss0 = K.sum(y_true*(-1)*K.log(y_pred), axis=-1)
    loss1 = superclass1_loss(y_true, y_pred)
    loss2 = superclass2_loss(y_true, y_pred)
    loss3 = superclass3_loss(y_true, y_pred)
    return .25*loss0+.25*loss1+.25*loss2+.25*loss3

In [8]:
def super1_acc(y_true, y_pred):
    table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant([4, 30, 55, 72, 95, 1, 32, 67, 73, 91, 54, 62, 70, 82, 92, 9, 10, 16, 28, 61, 0, 51, 53, 57, 83, 
                          22, 39, 40, 86, 87, 5, 20, 25, 84, 94, 6, 7, 14, 18, 24, 3, 42, 43, 88, 97, 12, 17, 37, 68, 76, 
                          23, 33, 49, 60, 71, 15, 19, 21, 31, 38, 34, 63, 64, 66, 75, 26, 45, 77, 79, 99, 2, 11, 35, 46, 98,
                          27, 29, 44, 78, 93, 36, 50, 65, 74, 80, 47, 52, 56, 59, 96, 8, 13, 48, 58, 90, 41, 69, 81, 85, 89]),
        values=tf.constant([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
                            7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 
                            13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
                            15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19]),
    ),
    default_value=tf.constant(-1),
    name="superclass_index"
    )
    yt_hard = tf.argmax(y_true, axis=1)
    yp_hard = tf.argmax(y_pred, axis=1)
    
    ytc = table.lookup(tf.cast(yt_hard, tf.int32))
    ypc = table.lookup(tf.cast(yp_hard, tf.int32))
    return K.sum(tf.cast(tf.equal(ytc,ypc),tf.int32))/tf.shape(ytc)

In [9]:
def super2_acc(y_true, y_pred):
    table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant([4, 30, 55, 72, 95, 1, 32, 67, 73, 91, 54, 62, 70, 82, 92, 9, 10, 16, 28, 61, 0, 51, 53, 57, 83, 
                          22, 39, 40, 86, 87, 5, 20, 25, 84, 94, 6, 7, 14, 18, 24, 3, 42, 43, 88, 97, 12, 17, 37, 68, 76, 
                          23, 33, 49, 60, 71, 15, 19, 21, 31, 38, 34, 63, 64, 66, 75, 26, 45, 77, 79, 99, 2, 11, 35, 46, 98,
                          27, 29, 44, 78, 93, 36, 50, 65, 74, 80, 47, 52, 56, 59, 96, 8, 13, 48, 58, 90, 41, 69, 81, 85, 89]),
        values=tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
                            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
                            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]),
    ),
    default_value=tf.constant(-1),
    name="superclass2_index"
    )
    yt_hard = tf.argmax(y_true, axis=1)
    yp_hard = tf.argmax(y_pred, axis=1)
    
    ytc = table.lookup(tf.cast(yt_hard, tf.int32))
    ypc = table.lookup(tf.cast(yp_hard, tf.int32))
    return K.sum(tf.cast(tf.equal(ytc,ypc),tf.int32))/tf.shape(ytc)

In [10]:
def super3_acc(y_true, y_pred):
    table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant([4, 30, 55, 72, 95, 1, 32, 67, 73, 91, 54, 62, 70, 82, 92, 9, 10, 16, 28, 61, 0, 51, 53, 57, 83, 
                          22, 39, 40, 86, 87, 5, 20, 25, 84, 94, 6, 7, 14, 18, 24, 3, 42, 43, 88, 97, 12, 17, 37, 68, 76, 
                          23, 33, 49, 60, 71, 15, 19, 21, 31, 38, 34, 63, 64, 66, 75, 26, 45, 77, 79, 99, 2, 11, 35, 46, 98,
                          27, 29, 44, 78, 93, 36, 50, 65, 74, 80, 47, 52, 56, 59, 96, 8, 13, 48, 58, 90, 41, 69, 81, 85, 89]),
        values=tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
                            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
    ),
    default_value=tf.constant(-1),
    name="superclass2_index"
    )
    yt_hard = tf.argmax(y_true, axis=1)
    yp_hard = tf.argmax(y_pred, axis=1)
    
    ytc = table.lookup(tf.cast(yt_hard, tf.int32))
    ypc = table.lookup(tf.cast(yp_hard, tf.int32))
    return K.sum(tf.cast(tf.equal(ytc,ypc),tf.int32))/tf.shape(ytc)

## Begin Training loop

In [11]:
import numpy as np
np.random.seed(0)
random_order = np.random.choice(size=100, a=np.arange(100), replace=False)
random_order

array([26, 86,  2, 55, 75, 93, 16, 73, 54, 95, 53, 92, 78, 13,  7, 30, 22,
       24, 33,  8, 43, 62,  3, 71, 45, 48,  6, 99, 82, 76, 60, 80, 90, 68,
       51, 27, 18, 56, 63, 74,  1, 61, 42, 41,  4, 15, 17, 40, 38,  5, 91,
       59,  0, 34, 28, 50, 11, 35, 23, 52, 10, 31, 66, 57, 79, 85, 32, 84,
       14, 89, 19, 29, 49, 97, 98, 69, 20, 94, 72, 77, 25, 37, 81, 46, 39,
       65, 58, 12, 88, 70, 87, 36, 21, 83,  9, 96, 67, 64, 47, 44])

In [12]:
scramble_dict = {i:random_order[i] for i in range(100)}
scramble_dict

{0: 26,
 1: 86,
 2: 2,
 3: 55,
 4: 75,
 5: 93,
 6: 16,
 7: 73,
 8: 54,
 9: 95,
 10: 53,
 11: 92,
 12: 78,
 13: 13,
 14: 7,
 15: 30,
 16: 22,
 17: 24,
 18: 33,
 19: 8,
 20: 43,
 21: 62,
 22: 3,
 23: 71,
 24: 45,
 25: 48,
 26: 6,
 27: 99,
 28: 82,
 29: 76,
 30: 60,
 31: 80,
 32: 90,
 33: 68,
 34: 51,
 35: 27,
 36: 18,
 37: 56,
 38: 63,
 39: 74,
 40: 1,
 41: 61,
 42: 42,
 43: 41,
 44: 4,
 45: 15,
 46: 17,
 47: 40,
 48: 38,
 49: 5,
 50: 91,
 51: 59,
 52: 0,
 53: 34,
 54: 28,
 55: 50,
 56: 11,
 57: 35,
 58: 23,
 59: 52,
 60: 10,
 61: 31,
 62: 66,
 63: 57,
 64: 79,
 65: 85,
 66: 32,
 67: 84,
 68: 14,
 69: 89,
 70: 19,
 71: 29,
 72: 49,
 73: 97,
 74: 98,
 75: 69,
 76: 20,
 77: 94,
 78: 72,
 79: 77,
 80: 25,
 81: 37,
 82: 81,
 83: 46,
 84: 39,
 85: 65,
 86: 58,
 87: 12,
 88: 88,
 89: 70,
 90: 87,
 91: 36,
 92: 21,
 93: 83,
 94: 9,
 95: 96,
 96: 67,
 97: 64,
 98: 47,
 99: 44}

In [13]:
# Set some parameters
batch_size = 32
num_classes = 100
epochs = 1000
data_augmentation = True
train_set_size = 5000
num_trials = 10
loss_fn_to_use = struct3_loss
lr = 0.00005
timestamp = int(time.time())

In [14]:

for i in range(1,num_trials):
    # The data, split between train and test sets:
    (x_train_full, y_train_full), (x_test, y_test_simple) = cifar100.load_data()
    print('x_train shape:', x_train_full.shape)
    
    x_train = x_train_full[i*train_set_size:(i+1)*train_set_size,:,:,:]
    y_train_simple = y_train_full[i*train_set_size:(i+1)*train_set_size,:]
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    y_train_simple_scrambled = np.array([scramble_dict[i] for i in y_train_simple.reshape(-1)]
                                       ).reshape(-1,1)
    y_test_simple_scrambled = np.array([scramble_dict[i] for i in y_test_simple.reshape(-1)]
                                       ).reshape(-1,1)
    # Convert class vectors to binary class matrices.
    y_train = keras.utils.to_categorical(y_train_simple_scrambled, num_classes)
    y_test = keras.utils.to_categorical(y_test_simple_scrambled, num_classes)

    model = 0
    # Define model structure
    model = Sequential()
    model.add(Conv2D(32, (3, 3), padding='same',
                     input_shape=x_train.shape[1:]))
    model.add(Activation('relu'))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(64, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    # initiate RMSprop optimizer
    opt = keras.optimizers.RMSprop(learning_rate=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss=loss_fn_to_use,
                  optimizer=opt,
                  metrics=['accuracy',super1_acc,super2_acc, super3_acc,
                           'categorical_crossentropy',superclass1_loss, superclass2_loss,superclass3_loss,
                           struct1_loss, struct2_loss, struct3_loss])

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    if not data_augmentation:
        print('Not using data augmentation.')
        history=model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            zca_epsilon=1e-06,  # epsilon for ZCA whitening
            rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            shear_range=0.,  # set range for random shear
            zoom_range=0.,  # set range for random zoom
            channel_shift_range=0.,  # set range for random channel shifts
            # set mode for filling points outside the input boundaries
            fill_mode='nearest',
            cval=0.,  # value used for fill_mode = "constant"
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False,  # randomly flip images
            # set rescaling factor (applied before any other transformation)
            rescale=None,
            # set function that will be applied on each input
            preprocessing_function=None,
            # image data format, either "channels_first" or "channels_last"
            data_format=None,
            # fraction of images reserved for validation (strictly between 0 and 1)
            validation_split=0.0)

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        history=model.fit(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            workers=4, shuffle=True, verbose=2)



    str_name = 'CIFAR100_results'+'_'+str(timestamp)+'_loss_str3_scram_ts_'+str(train_set_size)+'_trial_'+str(i)
    str_name

    with open(str_name, 'w') as outfile:
        json.dump(history.history, outfile)


x_train shape: (50000, 32, 32, 3)
5000 train samples
10000 test samples
Using real-time data augmentation.
Epoch 1/1000
157/157 - 13s - loss: 2.7072 - accuracy: 0.0112 - super1_acc: 0.0502 - super2_acc: 0.1176 - super3_acc: 0.2767 - categorical_crossentropy: 4.6090 - superclass1_loss: 2.9952 - superclass2_loss: 2.0131 - superclass3_loss: 1.2115 - struct1_loss: 3.8021 - struct2_loss: 3.2058 - struct3_loss: 2.7072 - val_loss: 2.7050 - val_accuracy: 0.0125 - val_super1_acc: 0.0553 - val_super2_acc: 0.1091 - val_super3_acc: 0.2110 - val_categorical_crossentropy: 4.6027 - val_superclass1_loss: 2.9943 - val_superclass2_loss: 2.0155 - val_superclass3_loss: 1.2076 - val_struct1_loss: 3.7985 - val_struct2_loss: 3.2041 - val_struct3_loss: 2.7050
Epoch 2/1000
157/157 - 13s - loss: 2.7034 - accuracy: 0.0154 - super1_acc: 0.0583 - super2_acc: 0.1465 - super3_acc: 0.3240 - categorical_crossentropy: 4.5979 - superclass1_loss: 2.9926 - superclass2_loss: 2.0122 - superclass3_loss: 1.2108 - struct1_loss