In [6]:
import numpy as np
import os
import math
from PIL import Image
import pandas as pd
import cv2

# Clustering
from sklearn.cluster import AgglomerativeClustering # 1) Agglomerative-Hierarchical
from sklearn.cluster import KMeans                  # 2) K-Means
from sklearn.mixture import GaussianMixture         # 3) Gaussian Mixture Models
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
# Evaluation
from sklearn import metrics
from skimage.external import tifffile
from skimage.measure import compare_mse
from skimage.metrics import structural_similarity as ssim

from sklearn.decomposition import PCA, KernelPCA               # 2) PCA
from sklearn.manifold import TSNE

# Load Autoencoder                                  # 3) Autoencoder (Deep dimensionality reduction)
from keras.models import load_model
from keras.models import Model

import matplotlib.pyplot as plt 
%matplotlib inline

import newscripts as myutils

In [7]:
def pltPathologyClusters(labels, path):
    sub_directories = [str(cluster) for cluster in set(labels)]
    displayImages = []
    fig = plt.figure(figsize=(14, 14))
    
    x=1
    
    for cluster in sub_directories:
        direct = path + '/{}'.format(cluster)
        if len(os.listdir(direct))-9 > 9: # if directory has less than 9 images set index to 0 else random index
            index = np.random.randint(9,len(os.listdir(direct))-9)
        else:
            index = 0 # pick the first 10 images
        clusterList = [] # reset the row
        for file in os.listdir(direct)[index:index+9]: # random sample of 9 images
            if file.endswith('.tif'):
                image = tifffile.imread(os.path.join(path, file))
                clusterList.append(image)
                displayImages.append(image) # list of ALL Images
                
        for i in range(1,9+1):
            if (len(clusterList) > i):
                img = clusterList[i-1]
            fig.add_subplot(8, 9, x)
            plt.imshow(img)
            plt.axis('off')
            plt.subplots_adjust(wspace=0.0, hspace=0.0)
            x+=1
    plt.show()
    

In [4]:
import shutil

def symlink_rel(src, dst):
    rel_path_src = os.path.relpath(src, os.path.dirname(dst))
    os.symlink(rel_path_src, dst)
    
def clusterintoDirectories(labels, path, imagenamesList):
    # remove existing subdirectories first to avoid overlap
    sub_directories = [str(i) for i in range(8)]

    for cluster in sub_directories:
        if (cluster in os.listdir(path)) and (os.path.isdir(os.path.join(path , cluster))):
            shutil.rmtree(os.path.join(path , cluster))

    for filename in os.listdir(path):
        if filename.endswith('.tif'):
            for cluster in sub_directories: # count of distinct elements = no. of clusters
                os.makedirs(path + '/{}'.format(cluster) , exist_ok=True)
    
    for i in range(len(imagenamesList)):
        if imagenamesList[i] not in os.listdir(path + '/' + sub_directories[labels[i]]): 
            symlink_rel(path + '/{}'.format(imagenamesList[i]) , 
                       path + '/{}'.format(labels[i]) + '/' + imagenamesList[i])

In [2]:
# Kather
# Load training data

train_directory = "../../Data/Kather_all_train"

X_train = []
train_image_names = []

for filename in os.listdir(train_directory):
    if filename.endswith('tif'):
        image = cv2.imread(os.path.join(train_directory, filename), cv2.IMREAD_UNCHANGED)
        resized_image = cv2.resize(image, (96, 96)) 
        X_train.append(np.asarray( resized_image, dtype="uint8" ))
        train_image_names.append(filename)

y_train = myutils.loadLabelsFromsubdirectoryindex(train_image_names, "../../Data/Kather_original_train")

# Load testing data

test_directory = "../../Data/Kather_all_test"

X_test = []
test_image_names = []

for filename in os.listdir(test_directory):
    if filename.endswith('tif'):
        image = cv2.imread(os.path.join(test_directory, filename), cv2.IMREAD_UNCHANGED)
        resized_image = cv2.resize(image, (96, 96)) 
        X_test.append(np.asarray( resized_image, dtype="uint8" ))
        test_image_names.append(filename)

y_test = myutils.loadLabelsFromsubdirectoryindex(test_image_names, "../../Data/Kather_original_test")

In [None]:
# Kaggle

In [3]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Normalise
X_train = X_train.astype('float32') / 255.
X_test = X_test.astype('float32') / 255.

In [8]:
def plotdiffTsne(X):
    fig, axs = plt.subplots(2, 2, figsize=(13, 7))

    tsne2 = TSNE(n_components=2, perplexity = 30).fit_transform(X)
    kmeans2 = KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=8,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=19,
       tol=0.0001, verbose=0).fit(tsne2)

    axs[0, 1].scatter(tsne2[:, 0], tsne2[:, 1], c=kmeans2.labels_, cmap='viridis')
    axs[0, 1].set_title('TSNE perplexity = 30')

    tsne4 = TSNE(n_components=2, perplexity = 90).fit_transform(X)
    kmeans4 = KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=8,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=19,
       tol=0.0001, verbose=0).fit(tsne4)

    axs[1, 1].scatter(tsne4[:, 0], tsne4[:, 1], c=kmeans4.labels_, cmap='viridis')
    axs[1, 1].set_title('TSNE perplexity = 90')
    
    plt.savefig('TSNE-CAE.png')
    plt.show()

In [25]:
CAESSIM_Kather_16 = load_model('../Autoencoders/SavedModels/weights/CAE-SSIM/23FebruarySSIM_epochs1000_batch16_adam.h5', compile=False)
encoder = Model(inputs=CAESSIM_Kather_16.input, outputs=CAESSIM_Kather_16.get_layer('conv2d_7').output)
# CAESSIM_Kather_16.summary()

In [28]:
CAER_MSE_Kaggle_32 = load_model('../Autoencoders/SavedModels/weights/CAER-MSE-Kaggle/epochs300_batch64_0.6023092042605083.h5')
encoder = Model(inputs=CAER_MSE_Kaggle_32.input, outputs=CAER_MSE_Kaggle_32.get_layer('conv2d_103').output)
# CAER_MSE_Kaggle_32.summary()

In [29]:
def evaluate_two_classes(model, data, labels_pred, truelabels):
    labels_pred = model.predict(data)[1]
    for i in range(len(labels_pred)):
        if labels_pred[i] < 0.5:
            labels_pred[i] = 0
        else:
            labels_pred[i] = 1

    print(metrics.classification_report(truelabels,labels_pred))

KeyboardInterrupt: 

In [None]:
autoencoder3 = load_model('../Autoencoders/SavedModels/new_mean_squared_error_epochs300_batch64.h5')
# encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer(layer_name).output)
autoencoder.summary()

In [5]:
X_train_enc = encoder.predict(X_train)
print(X_train_enc.shape)

(4200, 6, 6, 32)


In [6]:
X_test_enc = encoder.predict(X_test)
print(X_test.shape)

X_train_enc = X_train_enc.reshape(X_train_enc.shape[0] , -1) # Reshape for scaling
X_train_enc = StandardScaler().fit_transform(X_train_enc) # Scale

X_test_enc = X_test_enc.reshape(X_test_enc.shape[0] , -1) # Reshape for scaling
X_test_enc = StandardScaler().fit_transform(X_test_enc) # Scale

(800, 96, 96, 3)


In [7]:
# Evaluate on testing data using grid search  cv
parameters = {'covariance_type':('full', 'spherical', 'diag', 'tied'), 'n_components':[8],'random_state':[0,19,42]}
clf = GridSearchCV(GaussianMixture(), parameters, scoring=make_scorer(metrics.completeness_score))

clf.fit(X_train_enc, y_train)
print(clf.best_estimator_)
print(clf.best_score_)

parameters = {'init':('k-means++', 'random'), 'n_clusters':[8],'random_state':[0,19,42]}
clf2 = GridSearchCV(KMeans(), parameters, scoring=make_scorer(metrics.completeness_score))

clf2.fit(X_train_enc, y_train)
print(clf2.best_estimator_)
clf2.best_score_

GaussianMixture(covariance_type='diag', init_params='kmeans', max_iter=100,
                means_init=None, n_components=8, n_init=1, precisions_init=None,
                random_state=19, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)
0.542857335581645
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=8,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=19,
       tol=0.0001, verbose=0)


0.5259470165110727

In [15]:
# PCA 
# Reduce Dimensions
# PCA 
pca1 = PCA(n_components=150)
transformed_train = pca1.fit_transform(X_train_enc)

pca2 = PCA(n_components=150)
transformed_test = pca2.fit_transform(X_test_enc)

print(sum(pca1.explained_variance_ratio_))
print(sum(pca2.explained_variance_ratio_))
# print(pca1.n_components_)
# pca2.n_components_

0.902913327794522
0.9299359017750248


In [22]:
# PCA 
# Reduce Dimensions
# PCA 
pca1 = PCA(n_components=300)
transformed_train = pca1.fit_transform(X_train_enc)

pca2 = PCA(n_components=300)
transformed_test = pca2.fit_transform(X_test_enc)

print(sum(pca1.explained_variance_ratio_))
print(sum(pca2.explained_variance_ratio_))
# print(pca1.n_components_)
# pca2.n_components_

0.9701810850820038
0.9857587999867974


In [None]:
plotdiffTsne(transformed_test)