In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import layers
from tensorflow import keras
import tensorflow_addons as tfa

In [14]:
# Batch and image geometry shared by both generators.
batch_size = 128
img_height = 256
img_width = 256

# Both subsets are read from the same directory and separated by validation_split.
DATA_DIR = '/kaggle/input/intel-image-classification/seg_train/seg_train'
VALIDATION_SPLIT = 0.2

# Training pipeline: rescale pixel values to [0, 1] and apply random augmentation.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    # NOTE(review): Keras documents rotation_range in degrees, so 0.1 is almost
    # a no-op. Confirm whether a larger value (e.g. 10-20) was intended.
    rotation_range=0.1,
    zoom_range=0.2,
    validation_split=VALIDATION_SPLIT)  # set validation split

# Validation pipeline: same rescale and same split fraction, but NO augmentation.
# Reusing train_datagen here would randomly flip/rotate/zoom the validation
# images and make the validation metrics noisy and not comparable across epochs.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT)

train_generator = train_datagen.flow_from_directory(
    DATA_DIR,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    subset='training')  # set as training data

validation_generator = val_datagen.flow_from_directory(
    DATA_DIR,  # same directory as training data
    target_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    subset='validation')  # set as validation data


Found 11230 images belonging to 6 classes.
Found 2804 images belonging to 6 classes.


In [15]:
def mlp(x, size, dropoutrate):
    """Apply one GELU-activated Dense layer followed by Dropout.

    Args:
        x: Input Keras tensor.
        size: Number of units in the Dense layer.
        dropoutrate: Rate passed to the Dropout layer.

    Returns:
        The tensor produced by Dropout(Dense(x)).
    """
    dense = layers.Dense(size, activation=tf.nn.gelu)
    dropout = layers.Dropout(dropoutrate)
    return dropout(dense(x))

In [16]:
class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        **kwargs: Standard Keras Layer arguments (e.g. ``name``), forwarded to
            the base class so the layer can be rebuilt from its config.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return patches shaped (batch, num_patches, flattened_patch_values)."""
        batch_size = tf.shape(images)[0]
        # Stride equals patch size, so patches do not overlap. The result is
        # (batch, patch_rows, patch_cols, flattened_patch_values).
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )

        patch_dims = patches.shape[-1]

        # Derive the patch count from the extracted grid instead of hard-coding
        # 256, which was only correct for 256x256 inputs with patch_size=16.
        # Fall back to -1 (inferred at run time) when the grid is not static.
        rows, cols = patches.shape[1], patches.shape[2]
        if rows is not None and cols is not None:
            num_patches = rows * cols
        else:
            num_patches = -1

        # Collapse the (rows, cols) grid into a single sequence axis.
        patches = tf.reshape(patches, [batch_size, num_patches, patch_dims])

        return patches

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

In [17]:
import cv2

# Load one sample image and add a batch axis to inspect its shape.
sample_path = '/kaggle/input/intel-image-classification/seg_train/seg_train/buildings/10006.jpg'
img = cv2.imread(sample_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read;
    # fail here with a clear message rather than an AttributeError further down.
    raise FileNotFoundError(f'Could not read image: {sample_path}')
# (height, width, channels) -> (1, height, width, channels)
img = img[np.newaxis, :, :, :]
img.shape

(1, 150, 150, 3)

In [18]:
class PatchEncoder(layers.Layer):
    """Linearly project each patch and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output width of both the Dense projection and the
            position embedding.
        **kwargs: Standard Keras Layer arguments (e.g. ``name``), forwarded to
            the base class so the layer can be rebuilt from its config.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept so get_config can report it.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus the embedding of each patch index."""
        # One index per patch position: 0 .. num_patches - 1.
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        # The (num_patches, projection_dim) embedding broadcasts over the batch.
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

In [19]:
def create_vit(num_classes=6, projection_dim=64, num_layers=8, num_heads=6):
    """Build a Vision-Transformer-style Keras classifier that outputs logits.

    The input is fixed at 256x256x3 and is cut into 16x16 patches, i.e.
    (256 // 16) ** 2 = 256 patches per image. Calling with no arguments
    reproduces the previously hard-coded architecture.

    Args:
        num_classes: Units in the final Dense layer (one logit per class).
        projection_dim: Width of the patch embedding; also used as the
            attention key_dim and as the output width of each transformer MLP.
        num_layers: Number of transformer blocks.
        num_heads: Attention heads per block.

    Returns:
        An uncompiled keras.Model mapping images to unnormalized class logits.
    """
    image_size = 256
    patch_size = 16
    num_patches = (image_size // patch_size) ** 2

    inputs = layers.Input(shape=(image_size, image_size, 3))

    # Create patches.
    patches = Patches(patch_size)(inputs)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Transformer blocks: pre-norm attention and pre-norm MLP, each wrapped in
    # a residual (skip) connection.
    for _ in range(num_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Multi-head self-attention (query and value are the same tensor).
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP: expand to twice the width, then project back so the residual
        # Add below sees matching shapes.
        x3 = mlp(x3, size=projection_dim * 2, dropoutrate=0.1)
        x4 = mlp(x3, size=projection_dim, dropoutrate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x4, x2])

    # Flatten every patch representation into a single feature vector.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)

    # Classification head.
    features = mlp(representation, size=2048, dropoutrate=0.5)
    features = mlp(features, size=1024, dropoutrate=0.5)
    # No activation: the model emits logits.
    logits = layers.Dense(num_classes)(features)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model

In [20]:
# Build the model returned by create_vit() and print its layer-by-layer summary.
model = create_vit()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 256, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 patches_1 (Patches)            (None, 256, 768)     0           ['input_2[0][0]']                
                                                                                                  
 patch_encoder_1 (PatchEncoder)  (None, 256, 64)     65600       ['patches_1[0][0]']              
                                                                                                  
 layer_normalization_17 (LayerN  (None, 256, 64)     128         ['patch_encoder_1[0][0]']  

In [21]:
# AdamW: Adam with decoupled weight decay (from TensorFlow Addons).
optimizer = tfa.optimizers.AdamW(
    learning_rate=0.001, weight_decay=0.0001
)

model.compile(
    optimizer=optimizer,
    # The model outputs raw logits, so the loss applies the softmax itself.
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    # The string 'Accuracy' (capital A) resolves to keras.metrics.Accuracy,
    # which counts exact element-wise equality between y_true and y_pred. For
    # one-hot labels versus float logits that is essentially always 0.
    # CategoricalAccuracy compares the argmax of each, which is what is wanted.
    metrics=[keras.metrics.CategoricalAccuracy(name='accuracy')],
)

In [22]:
# Batches per epoch. Integer division drops the final partial batch of each subset.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = validation_generator.n // validation_generator.batch_size

# Model.fit accepts generators directly. Model.fit_generator is deprecated and
# has been removed from newer Keras releases.
history = model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=validation_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=10,
)

  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: 

In [13]:
# Save the current `model` under the relative path 'test-vision-encoder'.
# NOTE(review): presumably resolves to /kaggle/working/test-vision-encoder,
# which the archive step zips; confirm the working directory.
model.save('test-vision-encoder')

In [15]:
from IPython.display import FileLink
import shutil
# Zip the contents of /kaggle/working/test-vision-encoder into
# test-vision-encoder-output.zip; make_archive returns the archive's path.
shutil.make_archive('test-vision-encoder-output', 'zip', '/kaggle/working/test-vision-encoder')


'/kaggle/working/test-vision-encoder-output.zip'

In [19]:
# Display a link to the zip archive so it can be downloaded from the notebook.
FileLink('/kaggle/working/test-vision-encoder-output.zip')

In [23]:
# Load a saved model from the Kaggle input dataset and print its summary.
# NOTE(review): this rebinds `model`, replacing whatever model object was
# previously held in that name in this session.
# NOTE(review): the model contains the custom Patches/PatchEncoder layers;
# depending on the saved format, load_model may need `custom_objects` - confirm.
model = keras.models.load_model('/kaggle/input/vision-encoder-model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 patches (Patches)              (None, 256, 768)     0           ['input_1[0][0]']                
                                                                                                  
 patch_encoder (PatchEncoder)   (None, 256, 64)      65600       ['patches[0][0]']                
                                                                                                  
 layer_normalization (LayerNorm  (None, 256, 64)     128         ['patch_encoder[0][0]']      