In [1]:
%load_ext autoreload
%autoreload 2
import os
import random

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds
import keras
from keras import layers, optimizers, losses, metrics, callbacks, ops
from PIL import Image

from swin_transformer import SwinTransformer

2024-01-11 08:06:35.206434: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
plt.style.use("seaborn-v0_8")

2024-01-11 08:06:37.246510: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 08:06:37.270455: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 08:06:37.270713: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

## Dataset

In [3]:
DATASET_PATH = "/mnt/dl/datasets/Oxford102FlowersSplits/"
LABELS = {i: k.strip() for i, k in enumerate(open(os.path.join(DATASET_PATH, "names.txt")))}
batch_size = 64
img_size = 224
SIZE = 128
batch_size = 32
num_classes = len(LABELS)
patch_size = 16
num_patches = img_size ** 2 / patch_size **2

In [4]:
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

In [5]:
def load_dataset(split):
    
    def load_img(img_fname):
        img_bytes = tf.io.read_file(img_fname)
        img = tf.io.decode_jpeg(img_bytes)
        img = tf.image.resize(img, (img_size, img_size))
        img = tf.cast(img, tf.float32)
        return img
        
    path = os.path.join(DATASET_PATH, split, )
    img_files = os.listdir(os.path.join(path, "jpeg"))
    img_files = sorted(img_files, key=lambda x: int(x.replace(".jpeg", "")))
    img_files = list(img_files)[:SIZE]
    
    labels = list(open(os.path.join(path, "label", "label.txt"),))
    labels = [int(l.strip()) for l in labels][:SIZE]
    
    img_files = [os.path.join(path, "jpeg", name) for name in img_files]
    
    img_ds = tf.data.Dataset.from_tensor_slices(img_files).map(load_img).cache()
    label_ds = tf.data.Dataset.from_tensor_slices(labels).cache()
    ds = tf.data.Dataset.zip((img_ds, label_ds))
    return ds

In [6]:
train_ds = load_dataset("train")
validation_ds = load_dataset("validation")
test_ds = load_dataset("test")

2024-01-11 08:06:37.462376: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 08:06:37.462625: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 08:06:37.462781: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [7]:
aug_layers = [layers.RandomRotation(0.1), layers.RandomFlip()]

def preprocess(img, label, training):
    if training:
        for aug in aug_layers:
            img = aug(img)
    return tf.cast(img, tf.float32), label
    
train_ds = train_ds.shuffle(buffer_size=2048).map(lambda img, label: preprocess(img, label, training=True), num_parallel_calls=5).batch(batch_size)
validation_ds = validation_ds.map(lambda img, label: preprocess(img, label, training=False), num_parallel_calls=5).batch(batch_size)
test_ds = test_ds.map(lambda img, label: preprocess(img, label, training=False), num_parallel_calls=5).batch(batch_size)

## MLP Mixer

In [8]:
class PatchLayer(keras.Layer):
    
    def __init__(self,  patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        
    
    def build(self, input_shape):
        img_size = input_shape[1]
        num_patches = img_size // self.patch_size
        assert num_patches * self.patch_size == img_size
        idx = ops.arange(img_size)
        idx_splits = ops.split(idx, num_patches)
        idx_splits = ops.stack(idx_splits)

        indices = []
        for col in range(num_patches):
            for row in range(num_patches):
                patch_idx = ops.meshgrid(idx_splits[row], idx_splits[col], indexing="xy")
                patch_idx = ops.stack(patch_idx, axis=2)
                indices.append(patch_idx)
        assert len(indices) == num_patches ** 2
        indices = tf.stack(indices, 0)
        self.indices = indices
        return super().build(input_shape)

    def compute_output_shape(self, input_shape):
        bz, h, w, c = input_shape
        num_patches = h // self.patch_size
        return (bz, num_patches ** 2, self.patch_size, self.patch_size, c)
        

    def gather_patches(self, arg):
        x, idx = arg
        return tf.gather_nd(x, 
                            idx,
                            batch_dims=0
                            )
    
    def call(self, x):
        bz, *_ = ops.shape(x)
        pz, h, w, *_ = self.indices.shape
        idx = ops.reshape(self.indices, (-1, 2))
        idx = ops.expand_dims(idx, 0)
        x = ops.vectorized_map(self.gather_patches, (x, idx) )
        x = ops.reshape(x, (bz, pz, h, w, -1))
        return x


In [9]:
class MLPMixer(keras.Layer):
    
    def __init__(self, dc, ds, **kwargs):
        super().__init__(**kwargs)
        self.dc = dc
        self.ds = ds
    
    def build(self, input_shape):   
        bz, sz, c =  input_shape     
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()
        
        self.mlp1 = keras.Sequential([
            layers.Dense(self.dc, activation="gelu"),
            layers.Dropout(0.1),
            layers.Dense(sz),
        ])
        
        self.mlp2 = keras.Sequential([
            layers.Dense(self.ds, activation="gelu"),
            layers.Dropout(0.1),
            layers.Dense(c),
        ])

    def call(self, x):
        prev_x = x
        x = self.ln1(x)
        x = ops.transpose(x, (0, 2, 1))  
        x = self.mlp1(x)
        x = ops.transpose(x, (0, 2, 1))
        x = ops.add(x, prev_x)
        
        prev_x = x
        x = self.ln2(x)
        x = self.mlp2(x)
        x = ops.add(x, prev_x)
        
        return x
        

def build_mlp_vision_model(input):
    x = layers.Rescaling(1/255.0)(input)
    
    x = PatchLayer(patch_size)(x)
    
    bz, sz, h, w, c = ops.shape(x)
    x = layers.Reshape((sz, h * w * c))(x)
    x = layers.Dense(512, activation="gelu")(x)
    x = layers.Rescaling( 1 / np.sqrt(512.))(x)
    
    for i in range(8):
        x = MLPMixer(1024, 256)(x)
    
    x = layers.GlobalAveragePooling1D()(x)
    
    x = keras.Sequential([
        layers.Dense(512),
        layers.Activation("gelu"),
        layers.Dropout(0.1),
        layers.Dense(num_classes)
    ])(x)
    
    return x

In [10]:
input = keras.Input(shape=(img_size, img_size, 3))
mlp_mixer_out = build_mlp_vision_model(input)
mlp_mixer_model = keras.Model(input, mlp_mixer_out)
mlp_mixer_model.summary()


In [11]:
mlp_mixer_model.compile(optimizer=optimizers.Adam(1e-3),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[metrics.SparseCategoricalAccuracy("acc"),
                       ]
              )


In [12]:
history = mlp_mixer_model.fit(train_ds, epochs=20, validation_data=validation_ds)

Epoch 1/20


2024-01-09 18:53:50.716531: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2024-01-09 18:53:50.716863: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype int32 and shape [128]
	 [[{{node Placeholder/_6}}]]
2024-01-09 18:53:50.845654: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'pfor/Reshape/functional_18_1/patch_layer_1/Max' with

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:27[0m 29s/step - acc: 0.0000e+00 - loss: 4.9758

2024-01-09 18:54:19.834770: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - acc: 0.0189 - loss: 4.9422 

2024-01-09 18:54:20.250143: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2024-01-09 18:54:20.250412: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2024-01-09 18:54:20.285700: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'pfor/Reshape/functional_18_1/patch_layer_1/Max' wit

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1s/step - acc: 0.0198 - loss: 4.9296 - val_acc: 0.0156 - val_loss: 4.7133
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 156ms/step - acc: 0.0146 - loss: 4.5363 - val_acc: 0.0078 - val_loss: 4.8333
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - acc: 0.0052 - loss: 4.3864 - val_acc: 0.0078 - val_loss: 4.9633
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step - acc: 0.0427 - loss: 4.3509 - val_acc: 0.0156 - val_loss: 5.0164
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 151ms/step - acc: 0.0260 - loss: 4.4185 - val_acc: 0.0312 - val_loss: 5.2508
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 156ms/step - acc: 0.0146 - loss: 4.3928 - val_acc: 0.0156 - val_loss: 5.5297
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step - acc: 0.0292 - loss: 4.3989 

## Conv Mixer

In [13]:
patch_size = 7

In [14]:


class PatchEmbedding(layers.Layer):
    
    def __init__(self, filters, **kwargs):
        super().__init__(**kwargs)
        self.patch = PatchLayer(patch_size)
        self.conv = layers.Conv2D(filters, patch_size, strides=patch_size, activation="gelu")
        self.bn = layers.BatchNormalization()
           
    def call(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class ConvMixerLayer(layers.Layer):
    
    def __init__(self,kernel, **kwargs):
        super().__init__(**kwargs)
        self.kernel = kernel
    
    def build(self, input_shape):
        self.conv1 = layers.DepthwiseConv2D(self.kernel, padding="same")
        self.conv2 = layers.Conv2D(input_shape[-1], 1, )
        
        self.act1 = keras.Sequential([
            layers.Activation("gelu"),
            layers.BatchNormalization()
        ])
        
        self.act2 = keras.Sequential([
            layers.Activation("gelu"),
            layers.BatchNormalization()
        ])
    
    def call(self, x):
        prev_x = x
        x = self.conv1(x)
        x = self.act1(x)
        x = ops.add(x, prev_x)
        
        x = self.conv2(x)
        x = self.act2(x)
        return x
        
    
    
class ConvMixer(layers.Layer):
    
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def build(self, input_shape):
        self.rescaling = layers.Rescaling(1 / 255.0)
        self.patch_embedding = PatchEmbedding(512)
        self.act1 = keras.Sequential([
            layers.Activation("gelu"),
            layers.BatchNormalization()
        ])
        
        self.conv_mix_net = keras.Sequential([ConvMixerLayer(patch_size)
                                              for _ in range(8)
                                              ])
        self.pooling = layers.GlobalAveragePooling2D()
        
        self.classifier = keras.Sequential([
                            layers.Dense(1024),
                            layers.Activation("gelu"),
                            layers.Dropout(0.1),
                            layers.Dense(num_classes)
              ])
    
    def call(self, x):
        x = self.rescaling(x)
        x = self.patch_embedding(x)
        x = self.act1(x)
        x = self.conv_mix_net(x)
        x = self.pooling(x)
        x = self.classifier(x)
        
        return x
        

In [15]:
input_shape = keras.Input((img_size, img_size, 3))
cm_out = ConvMixer()(input)
conv_mixer = keras.Model(input, cm_out)
conv_mixer.summary()



In [16]:
conv_mixer.compile(optimizer=optimizers.Adam(1e-3),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[metrics.SparseCategoricalAccuracy("acc"),
                       ]
              )
history = conv_mixer.fit(train_ds, epochs=20, validation_data=validation_ds)

Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 729ms/step - acc: 0.0365 - loss: 4.6271 - val_acc: 0.0078 - val_loss: 4.6280
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 222ms/step - acc: 0.1823 - loss: 3.8852 - val_acc: 0.0078 - val_loss: 4.6326
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 235ms/step - acc: 0.2750 - loss: 2.8799 - val_acc: 0.0078 - val_loss: 4.6403
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 226ms/step - acc: 0.3906 - loss: 2.2654 - val_acc: 0.0078 - val_loss: 4.6533
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 235ms/step - acc: 0.5292 - loss: 1.6624 - val_acc: 0.0078 - val_loss: 4.6727
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step - acc: 0.5781 - loss: 1.3991 - val_acc: 0.0078 - val_loss: 4.6942
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 243ms/step - acc: 0.7437 -