In [1]:
import numpy as np
import os
import math
from PIL import Image
import pandas as pd
import cv2

# Clustering
from sklearn.cluster import AgglomerativeClustering # 1) Agglomerative-Hierarchical
from sklearn.cluster import KMeans                  # 2) K-Means
from sklearn.mixture import GaussianMixture         # 3) Gaussian Mixture Models
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
# Evaluation
from sklearn import metrics
from skimage.external import tifffile
from skimage.measure import compare_mse
from skimage.metrics import structural_similarity as ssim

from sklearn.decomposition import PCA, KernelPCA               # 2) PCA
from sklearn.manifold import TSNE

# Load Autoencoder                                  # 3) Autoencoder (Deep dimensionality reduction)
from keras.models import load_model
from keras.models import Model

import matplotlib.pyplot as plt 
%matplotlib inline

import utilities as myutils # Software package utilities

Using TensorFlow backend.


In [2]:
import shutil

def symlink_rel(src, dst):
    rel_path_src = os.path.relpath(src, os.path.dirname(dst))
    os.symlink(rel_path_src, dst)
    
def clusterintoDirectories(labels, path, imagenamesList):
    # remove existing subdirectories first to avoid overlap
    sub_directories = [str(i) for i in range(8)]

    for cluster in sub_directories:
        if (cluster in os.listdir(path)) and (os.path.isdir(os.path.join(path , cluster))):
            shutil.rmtree(os.path.join(path , cluster))

    for filename in os.listdir(path):
        if filename.endswith('.tif'):
            for cluster in sub_directories: # count of distinct elements = no. of clusters
                os.makedirs(path + '/{}'.format(cluster) , exist_ok=True)
    
    
    for i in range(len(imagenamesList)):
        # if there isnt already a symlink of this image in the coressponding subdirectory
        if imagenamesList[i] not in os.listdir(path + '/' + sub_directories[labels[i]]): 
            symlink_rel(path + '/{}'.format(imagenamesList[i]) , 
                       path + '/{}'.format(labels[i]) + '/' + imagenamesList[i])

In [3]:
# Load training data

train_directory = "../../Data/Kather_all_train"

X_train = []
train_image_names = []

for filename in os.listdir(train_directory):
    if filename.endswith('tif'):
        image = cv2.imread(os.path.join(train_directory, filename), cv2.IMREAD_UNCHANGED)
        resized_image = cv2.resize(image, (144, 144)) 
        X_train.append(np.asarray( resized_image, dtype="uint8" ))
        train_image_names.append(filename)

y_train = myutils.loadLabelsFromsubdirectoryindex(train_image_names, "../../Data/Kather_original_train")

# Load testing data

test_directory = "../../Data/Kather_all_test"

X_test = []
test_image_names = []

for filename in os.listdir(test_directory):
    if filename.endswith('tif'):
        image = cv2.imread(os.path.join(test_directory, filename), cv2.IMREAD_UNCHANGED)
        resized_image = cv2.resize(image, (144, 144)) 
        X_test.append(np.asarray( resized_image, dtype="uint8" ))
        test_image_names.append(filename)

y_test = myutils.loadLabelsFromsubdirectoryindex(test_image_names, "../../Data/Kather_original_test")

In [4]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Normalise
X_train = X_train.astype('float32') / 255.
X_test = X_test.astype('float32') / 255.

In [5]:
CAER_MSE_KATHER = load_model('../Autoencoders/SavedModels/weights/finalists/CAER_MSE_KATHER.h5', compile=False) # 6 x 6 x 16
# autoencoder.summary()

In [7]:
layer_name = 'conv2d_43' # 6 x 6 x 32
CAER_MSE_KATHER_encoder = Model(inputs=CAER_MSE_KATHER.input, outputs=CAER_MSE_KATHER.get_layer(layer_name).output)

In [9]:
encoder = CAER_MSE_KATHER_encoder

In [10]:
X_train_enc = encoder.predict(X_train)
print(X_train_enc.shape)

X_test_enc = encoder.predict(X_test)
print(X_test_enc.shape)

X_train_enc = X_train_enc.reshape(X_train_enc.shape[0] , -1) # Reshape for scaling
X_train_enc = StandardScaler().fit_transform(X_train_enc) # Scale

X_test_enc = X_test_enc.reshape(X_test_enc.shape[0] , -1) # Reshape for scaling
X_test_enc = StandardScaler().fit_transform(X_test_enc) # Scale

# Evaluate on testing data using grid search  cv
parameters = {'covariance_type':('full', 'spherical', 'diag', 'tied'), 'n_components':[8],'random_state':[0,19,42]}
clf = GridSearchCV(GaussianMixture(), parameters, scoring=make_scorer(metrics.completeness_score))

clf.fit(X_train_enc, y_train)
print(clf.best_estimator_)
print(clf.best_score_)

parameters = {'init':('k-means++', 'random'), 'n_clusters':[8],'random_state':[0,19,42]}
clf2 = GridSearchCV(KMeans(), parameters, scoring=make_scorer(metrics.completeness_score))

clf2.fit(X_train_enc, y_train)
print(clf2.best_estimator_)
clf2.best_score_

(4200, 9, 9, 32)
(800, 9, 9, 32)
GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=8, n_init=1, precisions_init=None,
                random_state=42, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)
0.7193166927292138
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=8,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=0,
       tol=0.0001, verbose=0)


0.5545186268787916

In [12]:
# PCA 
# Reduce Dimensions
# PCA 
pca1 = PCA(n_components=0.96)
transformed_train = pca1.fit_transform(X_train_enc)

pca2 = PCA(n_components=0.96)
transformed_test = pca2.fit_transform(X_test_enc)

# print(sum(pca1.explained_variance_ratio_))
# print(sum(pca2.explained_variance_ratio_))
print(pca1.n_components_)
pca2.n_components_

261


193