In [1]:
import logging
import time
import sys
import numpy as np
import matplotlib.pyplot as plt
import cgc
import dask.array as da
import json
import hashlib

from pathlib import Path
from osgeo import gdal
from dask.distributed import Client, SSHCluster
from cgc.kmeans import Kmeans
from cgc.coclustering import Coclustering

# Report the installed CGC version so the run can be traced back to it
cgc_version_banner = 'CGC version: {}'.format(cgc.__version__)
print(cgc_version_banner)

CGC version: 0.1.1


In [2]:
# ---- Manual input ----
# Project identifier, built as <dataset name>_<region name>
project_name = 'Leaf_Europe'
# Root folder under which the log, cache and results folders are created
dir_output = Path('/data/local/home/parrot/ouku/Leaf/Europe')

# ---- Input data ----
# Folder containing the GeoTIFF files
dir_tiff = Path('/data/local/files/Leaf/Europe')
# '<start_year>.tif' in dir_tiff is opened as the reference raster
start_year = 1950
# Glob pattern selecting the GeoTIFF files to load from dir_tiff
load_pattern = '*.tif'
# Zero-based index of the band to read, i.e. the 4th band
band_id = 3

# ---- Co-clustering ----
# Number of clusters in rows
k = 70
# Number of clusters in columns
l = 5
# Total number of batches
n_batch = 1
# Number of runs per batch; this depends on the available memory
nruns_per_batch = 16
# Settings handed to the co-clustering run
errobj = 1e-5
niters = 20
# NOTE(review): 10e-8 equals 1e-7 - confirm that 1e-8 was not intended
epsilon = 10e-8

# ---- Kmeans ----
kmean_n_clusters = 3
kmean_max_iter = 500
k_range = range(2, 25)
variance_threshold = 8.


In [3]:
# Setup environment: caching IDs, time stamp, output folders and logging


def _short_digest(*values):
    """Return the first 7 hex digits of the SHA-256 of the given values.

    Each value is converted with str(), UTF-8 encoded and fed to the hash
    back to back, without any separator.
    """
    digest = hashlib.sha256()
    for value in values:
        digest.update(str(value).encode('utf-8'))
    return digest.hexdigest()[:7]


# NOTE(review): because the values are concatenated without a separator,
# different settings can yield the same ID (e.g. k=7, l=51 and k=75, l=1).

# input cache
hash_input = _short_digest(dir_tiff, start_year, load_pattern, band_id)
print('Input data caching ID: {}'.format(hash_input))

# coclustering cache
hash_cc = _short_digest(k, l, n_batch, nruns_per_batch, errobj, niters, epsilon)
print('CoClustering caching ID: {}'.format(hash_cc))

# logging
timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
print('Time stamp: {}'.format(timestamp))

# Path
logdir = dir_output / 'log'
resultdir = dir_output / 'results' / 'results_{}'.format(timestamp)
cachedir = dir_output / 'cache'
for folder in (resultdir, logdir, cachedir):
    folder.mkdir(parents=True, exist_ok=True)

# Send log records both to a fresh per-run file and to the notebook output
log_file = logdir / '{}_{}.log'.format(project_name, timestamp)
log_handlers = [
    logging.FileHandler(log_file, mode='w'),
    logging.StreamHandler(stream=sys.stdout),
]
logging.basicConfig(
    format='%(asctime)s | %(levelname)s : %(message)s',
    level=logging.DEBUG,
    handlers=log_handlers,
)

Input data caching ID: 2ae4dd1
CoClustering caching ID: aaf9d5a
Time stamp: 20200904070949


In [4]:
# Load all the geotiffs into Z: one row per pixel, one column per file
# (files taken in sorted order). The assembled array is cached as .npy so
# that later runs can skip reading the rasters.
fname = cachedir/('{}_Z_{}.npy'.format(project_name, hash_input))
if not fname.exists():
    # The '<start_year>.tif' raster defines the expected number of pixels
    ref_path = dir_tiff.as_posix() + '/{}.tif'.format(start_year)
    h_tif = gdal.Open(ref_path)
    if h_tif is None:
        # gdal.Open returns None instead of raising unless GDAL exceptions are enabled
        raise OSError('GDAL could not open: {}'.format(ref_path))
    n_pixels = h_tif.RasterXSize*h_tif.RasterYSize

    tiff_paths = sorted(dir_tiff.glob(load_pattern))
    # Allocate the full array once and fill it column by column. Growing it
    # with np.append copies everything loaded so far on every iteration.
    Z = np.empty((n_pixels, len(tiff_paths)), dtype='float64')
    for i_col, f_tiff in enumerate(tiff_paths):
        print('loading {}'.format(f_tiff))
        h_tif = gdal.Open(f_tiff.as_posix())
        if h_tif is None:
            raise OSError('GDAL could not open: {}'.format(f_tiff))
        img = h_tif.ReadAsArray(0, 0, h_tif.RasterXSize, h_tif.RasterYSize)[band_id]
        if img.size != n_pixels:
            # Guard explicitly: a column assignment could otherwise broadcast silently
            raise ValueError('{} has {} pixels, expected {}'.format(f_tiff, img.size, n_pixels))
        Z[:, i_col] = img.reshape(-1)
    print('Saving loaded tiffs to: {}'.format(fname.as_posix()))
    np.save(fname, Z)
else:
    print('loading cached data: {}'.format(fname.as_posix()))
    Z = np.load(fname)

loading cached data: /data/local/home/parrot/ouku/Leaf/Europe/cache/Leaf_Europe_Z_2ae4dd1.npy


In [5]:
# Mask out every row that contains a NaN. A NaN anywhere in a row makes the
# row sum NaN, so `mask` ends up holding the indices of the rows to keep.
mask = np.flatnonzero(~np.isnan(Z.sum(axis=1)))

# Apply mask
Znp = Z[mask, :]
del Z

# Sanity check: no NaN may be left after masking
assert not np.isnan(Znp).any()

In [None]:
# Co-clustering: compute the row/column cluster labels on the Dask cluster,
# or read them back from the JSON file of a previous run with the same
# caching ID.
fname = cachedir/('./{}_coclustering_{}.json'.format(project_name, hash_cc))
if not fname.exists():
    Z = da.from_array(Znp)
    print(Z)

    client = Client("parrot0:9091")
    try:
        print(client)

        Z = client.persist(Z)

        # presumably CGC writes its results to output_filename, which is the
        # file the cached branch below reads - TODO confirm
        cc = Coclustering(Z, k, l, errobj, niters, nruns_per_batch, epsilon, output_filename=fname)
        for _ in range(n_batch):
            cc.run_with_dask(client, low_memory=True)
        # Convert to arrays so both branches hand the same type to later cells
        row_clusters = np.asarray(cc.results.row_clusters)
        col_clusters = np.asarray(cc.results.col_clusters)
    finally:
        # Release the connection to the scheduler even if the run fails
        client.close()
else:
    with open(fname, 'r') as f:
        print('loading cached data: {}'.format(fname.as_posix()))
        data = json.load(f)
    row_clusters = np.asarray(data['row_clusters'])
    col_clusters = np.asarray(data['col_clusters'])


In [None]:
# Kmean: run CGC's Kmeans on the co-clustering result, save the elbow plot
# and display the resulting cluster-mean centroids
elbow_plot_path = resultdir / (project_name + '_kmean_elbow_plot')

km = Kmeans(
    Z=Znp,
    row_clusters=row_clusters,
    col_clusters=col_clusters,
    n_row_clusters=k,
    n_col_clusters=l,
    k_range=k_range,
    kmean_max_iter=kmean_max_iter,
    var_thres=variance_threshold,
)
km.compute()
km.plot_elbow_curve(elbow_plot_path.as_posix())
km.cl_mean_centroids

In [None]:
# Export Plots
# Temporal cluster: column-cluster label plotted against the column position
column_positions = np.arange(len(col_clusters))
plt.plot(column_positions, col_clusters)
plt.xlabel('Years')
plt.ylabel('Cluster')
temporal_fig_path = resultdir / (project_name + '_temporal_clusters')
plt.savefig(temporal_fig_path.as_posix())


In [None]:
# Spatial cluster: put the row-cluster label of every kept pixel back at its
# position in the raster grid; masked-out pixels stay NaN
h_tif = gdal.Open(dir_tiff.as_posix() +'/{}.tif'.format(start_year))
n_x, n_y = h_tif.RasterXSize, h_tif.RasterYSize
spatial_cl = np.full(n_x*n_y, np.nan)
spatial_cl[mask] = row_clusters
spatial_cl = spatial_cl.reshape(n_y, n_x)
plt.imshow(spatial_cl)
plt.xlabel('Xaxis')
plt.ylabel('Yaxis')
spatial_fig_path = resultdir / (project_name + '_spatial_clusters')
plt.savefig(spatial_fig_path.as_posix())


In [None]:
# Export Group average co-clustering
# CoCavg[i, j] is the mean of Znp over the rows in row cluster i and the
# columns in column cluster j.
# Coerce the labels to arrays so that `== i` compares element-wise.
row_labels = np.asarray(row_clusters)
col_labels = np.asarray(col_clusters)

CoCavg = np.zeros((k, l))
row_idx = [np.flatnonzero(row_labels == i) for i in range(k)]
col_idx = [np.flatnonzero(col_labels == i) for i in range(l)]
for ir in range(k):
    for ic in range(l):
        if row_idx[ir].size == 0 or col_idx[ic].size == 0:
            # empty clusters won't be used - the entry keeps its 0 placeholder
            # and no mean of an empty selection is computed
            continue
        # np.ix_ selects the row/column block without building full index grids
        block = Znp[np.ix_(row_idx[ir], col_idx[ic])]
        CoCavg[ir, ic] = np.nan_to_num(block.mean())

# The reference raster only provides the grid size, so open it once
h_tif = gdal.Open(dir_tiff.as_posix() +'/{}.tif'.format(start_year))
n_x, n_y = h_tif.RasterXSize, h_tif.RasterYSize

for f in range(l):
    # Export png: map of the group averages for column cluster f;
    # masked-out pixels stay NaN
    band_1 = np.full(n_x*n_y, np.nan)
    band_1[mask] = CoCavg[row_labels, f]
    band_1 = band_1.reshape(n_y, n_x)
    plt.figure()
    plt.title('temp_clust_' + str(f))
    plt.ylabel('Yaxis')
    plt.xlabel('Xaxis')
    plt.imshow(band_1)
    plt.savefig((resultdir/(project_name+'_spatial_clusters_temp_clust_' + str(f))).as_posix())

In [None]:
# Kmean visualization: one map per column cluster, showing for every kept
# pixel the Kmeans centroid value of its row cluster; masked-out pixels
# stay NaN

# The reference raster only provides the grid size, so open it once
h_tif = gdal.Open(dir_tiff.as_posix() +'/{}.tif'.format(start_year))
n_x, n_y = h_tif.RasterXSize, h_tif.RasterYSize
# Look the centroid table up once; it does not change inside the loop
centroids = km.cl_mean_centroids

for f in range(l):
    # Export png
    band_1 = np.full(n_x*n_y, np.nan)
    band_1[mask] = centroids[row_clusters, f]
    band_1 = band_1.reshape(n_y, n_x)
    plt.figure()
    plt.title('temp_clust_' + str(f))
    plt.ylabel('Yaxis')
    plt.xlabel('Xaxis')
    plt.imshow(band_1)
    plt.savefig((resultdir/(project_name+'_kmean_spatial_clusters_temp_clust_' + str(f))).as_posix())