# t-SNE Grid for California Data
## Qualitative Evaluation of SimCLR Pretraining
## Irrigation Capstone - Fall 2020
### TP Goter

This notebook first computes the latent-space vectors for a set of California images. These latent-space vectors are then projected into two-dimensional space using t-SNE, and each image is plotted at its resulting 2-D coordinates. This shows how different images are arranged in latent space. If the pretraining worked, geographically similar images should cluster together in the same regions of the plot.

In [None]:
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import os
import tensorflow as tf
import numpy as np
from pprint import pprint
from tqdm import tqdm
import sklearn
from sklearn.manifold import TSNE
import sys
from PIL import Image
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, output_file
import bokeh
import skimage

# Make the project-local `utils` module importable from this notebook.
# NOTE(review): this is a hard-coded absolute path to one user's checkout;
# it must be edited before the notebook can run on another machine.
sys.path.append('/Users/tom/Desktop/MIDS_TPG/W210/capstone_fall20_irrigation/')
import utils

# Record the library versions used for this run in the notebook output.
print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'sci-kit learn version: {sklearn.__version__}')
print(f'Tensorflow version: {tf.__version__}')

# Route Bokeh output to the notebook so figures render inline.
output_notebook()

In [None]:
# Directory holding the saved models, relative to the notebook's working directory.
model_path = '../BigEarthData/models'

# Show the saved models whose file name contains 'simclr_100'
# (the final BigEarthNet pretrained models).
bigearthnet_models = [name for name in os.listdir(model_path) if 'simclr_100' in name]
pprint(bigearthnet_models)

# Show the saved models whose file name contains 'ca_simclr_'
# (the final California pretrained models).
california_models = [name for name in os.listdir(model_path) if 'ca_simclr_' in name]
pprint(california_models)

In [None]:
# Per-band normalisation statistics used when `ca_flag` is False (the
# BigEarthNet run). `generate_tsne_grid` uses them to undo normalisation for
# display: pixel * std + mean.
# Each row is (band, mean, std).
# NOTE(review): band names look like Sentinel-2 band IDs - confirm.
_BAND_STATS_ROWS = (
    ('B01', 340.76769064, 554.81258967),
    ('B02', 429.9430203, 572.41639287),
    ('B03', 614.21682446, 582.87945694),
    ('B04', 590.23569706, 675.88746967),
    ('B05', 950.68368468, 729.89827633),
    ('B06', 1792.46290469, 1096.01480586),
    ('B07', 2075.46795189, 1273.45393088),
    ('B08', 2218.94553375, 1365.45589904),
    ('B8A', 2266.46036911, 1356.13789355),
    ('B09', 2246.0605464, 1302.3292881),
    ('B11', 1594.42694882, 1079.19066363),
    ('B12', 1009.32729131, 818.86747235),
)

# Same layout as before: {'mean': {band: value}, 'std': {band: value}}.
BAND_STATS = {
    'mean': {band: mean for band, mean, _ in _BAND_STATS_ROWS},
    'std': {band: std for band, _, std in _BAND_STATS_ROWS},
}

# Per-band normalisation statistics used when `ca_flag` is True (the
# California run). Same layout as BAND_STATS, but without entries for
# bands B01 and B09.
# Each row is (band, mean, std).
_BAND_STATS_CA_ROWS = (
    ('B02', 725.193505986188, 416.6137845190807),
    ('B03', 1028.5459669514032, 499.6087245377614),
    ('B04', 1258.9655400619445, 693.5558604814064),
    ('B05', 1597.8028399130633, 640.6865473157832),
    ('B06', 2170.0459291641573, 676.3993986790316),
    ('B07', 2434.1251301748134, 795.1209667456519),
    ('B08', 2613.2817721668257, 839.6670833859841),
    ('B8A', 2672.539516996118, 821.8303575104553),
    ('B11', 2833.482510348869, 975.7944412326585),
    ('B12', 2104.7903924463503, 928.1875779697522),
)

# {'mean': {band: value}, 'std': {band: value}}
BAND_STATS_CA = {
    'mean': {band: mean for band, mean, _ in _BAND_STATS_CA_ROWS},
    'std': {band: std for band, _, std in _BAND_STATS_CA_ROWS},
}


In [None]:
def generate_tsne_grid(model_path, saved_model, files, num_images, batch_size, ca_flag, bokeh_flag, label, output):
    """Embed images with a saved model, project them with t-SNE, and plot the tiles.

    Batches are read from ``utils.get_batched_dataset``, passed through the
    loaded Keras model to obtain activation vectors, reduced to 2-D with
    t-SNE, and each image is then drawn at its (min-max normalised) 2-D
    position.

    Args:
        model_path: Directory containing the saved model file.
        saved_model: File name of the model, joined onto ``model_path`` and
            loaded with ``tf.keras.models.load_model``.
        files: Passed through to ``utils.get_batched_dataset`` (the callers in
            this notebook pass a TFRecord glob pattern).
        num_images: Batches are consumed until at least this many images have
            been collected, counting ``batch_size`` images per batch.
        batch_size: Batch size requested from the dataset.
        ca_flag: Forwarded as ``ca=`` to ``utils.get_batched_dataset`` and
            selects ``BAND_STATS_CA`` (True) or ``BAND_STATS`` (False) for
            de-normalising images for display.
        bokeh_flag: If True, render an interactive Bokeh figure saved to
            ``../images/{output}.html``; otherwise paste tiles onto a
            4000x3000 PIL canvas and show it with matplotlib.
        label: Title of the generated HTML page (Bokeh path only).
        output: Base name of the generated HTML file (Bokeh path only).

    Returns:
        None. The plot is displayed (and, for Bokeh, written to disk).

    Raises:
        ValueError: If the dataset yields no batches.
        RuntimeError: If a de-normalised image is not an (n, m, 3) array.
    """
    SCALE_FACTOR = 3000

    # The statistics depend only on ca_flag, so pick them once rather than
    # on every image.
    band_stats = BAND_STATS_CA if ca_flag else BAND_STATS

    def denorm_img(img):
        """Undo per-band normalisation and scale to displayable [0, 1] floats.

        Channels 0, 1, 2 are de-normalised with the B04, B03, B02 statistics
        respectively.
        NOTE(review): this assumes the first three channels are ordered
        B04/B03/B02 (R/G/B) - confirm against utils.get_batched_dataset.
        """
        channels = [
            (img[:, :, idx] * band_stats['std'][band] + band_stats['mean'][band]) / SCALE_FACTOR
            for idx, band in enumerate(('B04', 'B03', 'B02'))
        ]
        # Clip before any uint8 conversion: values above SCALE_FACTOR (or
        # below zero) would otherwise wrap around when cast to 8 bits and
        # show up as speckled, wrongly coloured pixels.
        return np.clip(np.stack(channels, axis=2), 0.0, 1.0)

    def rgb_to_rgba32(img):
        """
        Convert a normalised RGB image to a 32 bit-encoded RGBA image.
        """
        rgb = denorm_img(img)
        # Ensure it has three channels
        if rgb.ndim != 3 or rgb.shape[2] != 3:
            raise RuntimeError('Input image is not RGB.')

        # Get image shape
        n, m, _ = rgb.shape

        # Convert to 8-bit, which is expected for viewing
        im_8 = np.uint8(rgb * 255)

        # Add a fully opaque alpha channel, which is expected by Bokeh
        im_rgba = np.dstack((im_8, np.full((n, m), 255, dtype=np.uint8)))

        # Pack the four bytes of each pixel into one 32-bit value.
        # Must flip up/down for proper orientation.
        return np.flipud(im_rgba.view(dtype=np.int32).reshape(n, m))

    # Get the data
    data = utils.get_batched_dataset(files, batch_size, ca=ca_flag)

    loaded_model = tf.keras.models.load_model(os.path.join(model_path, saved_model))
    loaded_model.summary()

    # Loop over the batches and grab the latent vectors and image vectors
    count = 0
    preds = []
    images = []
    for image_batch, _ in data:
        count += batch_size
        preds.append(loaded_model.predict(image_batch))
        images.append(image_batch)
        if count >= num_images:
            break

    if not preds:
        # np.concatenate([]) would raise a far less helpful ValueError.
        raise ValueError(f'No image batches were read from {files!r}.')

    X = np.concatenate(preds)
    print(f'Activation Vector Shape: {X.shape}')
    images = np.concatenate(images)
    print(f'Image Vector Shape: {images.shape}')

    tsne = TSNE(n_components=2, learning_rate=150, perplexity=30, angle=0.2, verbose=2).fit_transform(X)

    # Min-max normalise each t-SNE axis to [0, 1] so it can be mapped onto
    # canvas coordinates.
    tx, ty = tsne[:, 0], tsne[:, 1]
    tx = (tx - np.min(tx)) / (np.max(tx) - np.min(tx))
    ty = (ty - np.min(ty)) / (np.max(ty) - np.min(ty))

    if bokeh_flag:
        p_width = 800
        p_height = 600
        # NOTE(review): plot_height/plot_width appear to have been renamed to
        # height/width in Bokeh 3 - confirm against the installed version.
        p = figure(plot_height=p_height, plot_width=p_width,
                   x_range=[0, p_width], y_range=[0, p_height],
                   tools='pan,box_zoom,wheel_zoom,reset')
        for img, x, y in zip(images, tx, ty):
            im_disp = rgb_to_rgba32(img)
            n, m = im_disp.shape
            # Each tile is drawn at one fifth of its pixel size; the y
            # coordinate is inverted relative to the normalised t-SNE y.
            p.image_rgba(image=[im_disp],
                         x=int((p_width - m) * x),
                         y=int(p_height - n - (p_height - n) * y),
                         dw=m / 5, dh=n / 5)
        output_file(f"../images/{output}.html", title=label)
        bokeh.io.show(p)

    else:
        width = 4000
        height = 3000
        max_dim = 100

        full_image = Image.new('RGBA', (width, height))
        for img, x, y in zip(images, tx, ty):
            tile = Image.fromarray(np.uint8(denorm_img(img) * 255))
            # Shrink (never enlarge) so the longest side is at most max_dim.
            rs = max(1, tile.width / max_dim, tile.height / max_dim)
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            tile = tile.resize((int(tile.width / rs), int(tile.height / rs)), Image.LANCZOS)
            full_image.paste(tile, (int((width - max_dim) * x), int((height - max_dim) * y)), mask=tile.convert('RGBA'))

        plt.figure(figsize=(16, 12))
        plt.imshow(full_image)
        

## Run for BigEarthNet

In [None]:
# BigEarthNet run: SimCLR model file and the TFRecord glob for the training images.
file = 'simclr_100_t3_s50_10.h5'
train_files = '../BigEarthData/tfrecords/train-part-*'

# Every argument is passed by keyword so the long signature reads unambiguously.
generate_tsne_grid(
    model_path=model_path,
    saved_model=file,
    files=train_files,
    num_images=1024,
    batch_size=32,
    ca_flag=False,
    bokeh_flag=True,
    label='BigEarthNet 1024 - SimCLR 10 Epochs',
    output='bigearthnet_simclr_e10',
)

## Run for California

In [None]:
# California run: SimCLR model file and the TFRecord glob for the training images.
file = 'ca_simclr_s50_t1_50.h5'
train_files = '../CaliforniaData/tfrecords/train_ca*'

# ca_flag=True switches both the dataset loader and the display statistics
# to the California variants.
generate_tsne_grid(
    model_path=model_path,
    saved_model=file,
    files=train_files,
    num_images=1024,
    batch_size=32,
    ca_flag=True,
    bokeh_flag=True,
    label='California - SimCLR 50 Epochs',
    output='california_simclr_e50_t1',
)