<a href="https://colab.research.google.com/github/olley102/CommunityDetectionURSS/blob/main/autoencoder_image_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autoencoder image segmentation
This notebook attempts to use autoencoders to encode the local spatial information of an image, so that clustering can then be applied to the resulting encodings.

In [None]:
import math
import numpy as np
import keras.backend as K
import tensorflow as tf

In [None]:
from keras import Sequential, Input, Model
from keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose
from keras.layers import Activation, Flatten, Dense, Reshape

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [None]:
class WindowAE:
  """Dense autoencoder over image windows augmented with pixel positions.

  Each input to the network is a stack of shape
  ``window_size + (num_channels + 2,)``: the image channels plus two extra
  channels holding the row and column index of every pixel.

  Attributes:
    model: Full autoencoder (input window -> reconstructed window), or None
      until `make` has built it.
    encoder: Encoder half sharing layers with `model`, or None until built.
    max, min: Value range recorded by `fit_transform`, used for normalization.
    window_size: (height, width) of the local window.
    num_channels: Number of image channels (excluding the two position
      channels).
    encoder_sizes, decoder_sizes: Iterables of Dense layer widths.
  """

  def __init__(self, window_size=(7, 7), num_channels=1, encoder_sizes=None,
               decoder_sizes=None):
    self.model = None
    self.encoder = None
    self._built = False
    self.max = 0.0
    self.min = 0.0
    self.window_size = window_size
    self.num_channels = num_channels
    self.encoder_sizes = encoder_sizes
    self.decoder_sizes = decoder_sizes

  def make(self):
    """Build `model` and `encoder` once.

    Does nothing if the models were already built, or if either
    `encoder_sizes` or `decoder_sizes` is still None.

    The last entry of `decoder_sizes` must equal the number of elements in
    one input stack (window height * window width * (num_channels + 2)),
    otherwise the final Reshape cannot be applied.
    """
    if (
        self._built or
        self.encoder_sizes is None or
        self.decoder_sizes is None
    ):
      return

    stack_size = tuple(self.window_size) + (self.num_channels + 2,)
    input_window = Input(stack_size)

    x = Flatten()(input_window)
    for s in self.encoder_sizes:
      x = Dense(s, activation='relu')(x)

    # Identity layer that gives the bottleneck its own output tensor.
    encoded = Activation('linear')(x)

    # NOTE(review): every decoder layer is ReLU, so reconstructions are
    # non-negative, while the position channels of padded pixels are
    # negative - confirm this is intended.
    for s in self.decoder_sizes:
      x = Dense(s, activation='relu')(x)

    # The Reshape layer has to be *called* on x; passing the bare layer
    # object to Model() is not a valid output tensor.
    decoded = Reshape(stack_size)(x)

    self.model = Model(input_window, decoded)
    self.encoder = Model(input_window, encoded)
    # Record the build so that repeated calls do not recreate the models.
    self._built = True

  def compile(self):
    """Compile the autoencoder with MSE loss and the Adam optimizer.

    `make` must have built the model first.
    """
    self.model.compile(loss='mse', optimizer='adam', metrics=['mse'])

  def fit_transform(self, x):
    """Record the value range of `x` and return its transformed stack.

    Args:
      x: Image array, see `transform_x`.

    Returns:
      The result of `transform_x(x, train=True)`.
    """
    self.max = np.max(x)
    self.min = np.min(x)
    return self.transform_x(x, train=True)

  def transform_x(self, x, train=False):
    """Normalize, zero-pad and append position channels to an image.

    Args:
      x: Array of shape (H, W, C); a 2-D (H, W) array is treated as a
        single-channel image.
      train: Currently unused.

    Returns:
      Float array of shape (H + 2*pi, W + 2*pj, C + 2), where
      pi = floor(window height / 2) and pj = floor(window width / 2).
      Channels 0..C-1 hold the normalized image surrounded by zero padding;
      channel C holds the row index and channel C+1 the column index of each
      pixel in original-image coordinates (negative inside the leading
      padding).
    """
    x = np.asarray(x, dtype='float')
    if x.ndim == 2:
      x = x[..., np.newaxis]

    # Normalize. A zero range (constant image, or no prior fit_transform)
    # would divide by zero, so in that case only the offset is removed.
    span = self.max - self.min
    x_norm = x - self.min
    if span != 0:
      x_norm = x_norm / span

    # Zero padding on the two spatial axes only; np.pad needs one
    # (before, after) pair per axis.
    pad_i, pad_j = (math.floor(s / 2) for s in self.window_size)
    x_pad = np.pad(x_norm, ((pad_i, pad_i), (pad_j, pad_j), (0, 0)))

    # Stack array with positional information. Both position planes cover
    # the padded extent so that they line up with x_pad.
    pos_i = np.arange(-pad_i, x.shape[0] + pad_i, dtype='float')
    pos_j = np.arange(-pad_j, x.shape[1] + pad_j, dtype='float')
    x_i, x_j = np.meshgrid(pos_i, pos_j, indexing='ij')
    x_full = np.dstack((x_pad, x_i, x_j))

    # TODO: For each pixel make windows. Need to be done in a for loop. If done
    # with numpy, RAM will fill up very quickly. If window_size=(7,7) and x
    # is size 1000x1000 with 3 channels then you are looking at full data of
    # size (7,7,5,1003*1003), which is too big to store. Train and prediction
    # processes must involve storing and discarding a number of small stacks,
    # say (7,7,5,100), and number of epochs in training determines number of
    # stacks.
    # Remember: padding pixels cannot be central.

    return x_full

  def transform_y(self, y):
    """Transform a target image exactly like an input image."""
    return self.transform_x(y, train=True)

  def encode(self, x):
    """Run the encoder on the transformed image.

    NOTE(review): `transform_x` does not yet cut the stack into windows (see
    its TODO), so the array handed to the encoder is the whole padded stack
    rather than a batch of `window_size` windows - confirm before relying on
    this method.
    """
    x1 = self.transform_x(x, train=False)
    return self.encoder.predict(x1)