# Full Pipeline (on Asbestos) - Oct 2018
Created:  31 Oct 2018 <br>
Last update: 1 Nov 2018


### Goal: Run the full pipeline on the Asbestos set


<hr>
## 1. Imports

In [None]:
# this will remove warnings messages
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

# import
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

import imgutils

In [None]:
# Re-run this cell if you altered imgutils
import importlib
importlib.reload(imgutils)

<hr>
## 2. Data Definitions & Feature Specification

In [None]:
# Data:
datafolder = '../data/Asbestos_Aug30/SA_TileSet_Subset2_1K'
n_tiles_x = 2  # mostly for visualization
n_tiles_y = 2


# Features to use:
#feature_funcs = [imgutils.img_mean, imgutils.img_std, imgutils.img_median, 
#                 imgutils.img_mode,
#                 imgutils.img_kurtosis, imgutils.img_skewness]
feature_funcs = [imgutils.img_std, imgutils.img_relstd, imgutils.img_mean, 
                 imgutils.img_skewness,  imgutils.img_kurtosis, imgutils.img_mode, imgutils.img_range]
feature_names = imgutils.stat_names(feature_funcs)

# Size of the grid, specified as number of slices per image in x and y direction:
default_grid_x = 8
default_grid_y = default_grid_x

<hr>
## 3. Import Data & Extract Features

In [None]:
# image import:
print("Scanning for images in '{}'...".format(datafolder))
df_imgfiles = imgutils.scanimgdir(datafolder, '.tif')
imgfiles = list(df_imgfiles['filename'])
print("# of images: {} \n".format(len(imgfiles)))

# feature extraction:
print("Feature extraction...")
print("- Slicing up images in {} x {} patches. ".format(default_grid_y, default_grid_x))
print("- Extract statistics from each slice: {} ".format(', '.join(feature_names)))
print("...working...", end='\r')
df = imgutils.slicestats(imgfiles, default_grid_y, default_grid_x, feature_funcs)
print("# slices extracted: ", len(df))



<hr>
## 4. Machine Learning Pipeline

### Hyper parameters

In [None]:
# data hyper-parameters
default_n_clusters = 3

# algorithm hyper-parameters:
kmeans_n_init = 10

In [None]:
def run_ml_pipeline2(X, ml_name, ml_algorithm, standardize=True, use_pca=True, n_pca=None):
    """Cluster X through an sklearn Pipeline and score the labels against the raw X.

    The pipeline runs, in order: an optional StandardScaler, an optional PCA
    and finally the clustering estimator.

    Parameters:
        X: feature data handed to the pipeline and to the scorer.
        ml_name: step name of the estimator; also embedded in the scaler's step name.
        ml_algorithm: clustering estimator, used as the last pipeline step.
        standardize: when true, a StandardScaler step is placed first.
        use_pca: when true, a PCA step is placed after the (optional) scaler.
        n_pca: forwarded to PCA as n_components.

    Returns:
        (score, y): the silhouette score and the predicted labels. Note that
        the score is computed on the untransformed X, not on the scaled/PCA data.
    """
    # Assemble the steps directly in execution order: scaler -> pca -> estimator
    stages = []
    if standardize:
        stages.append(('scaling_{0}'.format(ml_name), StandardScaler()))
    if use_pca:
        stages.append(('pca', PCA(n_components=n_pca)))
    stages.append((ml_name, ml_algorithm))

    # fit_predict on the pipeline returns the labels of the final step
    labels = Pipeline(stages).fit_predict(X)

    return silhouette_score(X, labels), labels

In [None]:
def run_ml_pipelines2(df_data, feature_cols, n_clusters, standardize=True, use_pca=True, n_pca=None):
    """Run k-means and complete-linkage agglomerative clustering via run_ml_pipeline2.

    Parameters:
        df_data: DataFrame holding the feature columns; it is modified in place,
            receiving the label columns 'kmeans' and 'hierarchical'.
        feature_cols: labels of the columns in df_data used as features.
        n_clusters: number of clusters requested from both algorithms.
        standardize, use_pca, n_pca: forwarded unchanged to run_ml_pipeline2.

    Returns:
        None. Both scores are printed.

    Reads the notebook-level setting kmeans_n_init.
    """
    X = df_data.loc[:, feature_cols]

    # Setup ML clustering algorithms.
    # KMeans(algorithm='auto') and AgglomerativeClustering(affinity='euclidean')
    # are no longer accepted by current scikit-learn releases. Both were the
    # defaults, so they are simply left out; this works on old and new versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=kmeans_n_init, init='k-means++')
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')

    # run the pipelines
    print("Executing clustering pipelines...")
    score_kmeans, y_kmeans = run_ml_pipeline2(X, 'kmeans', kmeans, standardize=standardize,
                                              use_pca=use_pca, n_pca=n_pca)
    score_hier, y_hier = run_ml_pipeline2(X, 'hierarchical', agglomerative, standardize=standardize,
                                          use_pca=use_pca, n_pca=n_pca)
    print("Done\n")

    # collect data
    df_data['kmeans'] = y_kmeans
    df_data['hierarchical'] = y_hier

    # report results:
    print("\nClustering Scores:")
    print("K-means: ", score_kmeans)
    print("Hierarchical: ", score_hier)
       

In [None]:
run_ml_pipelines2(df, feature_names, default_n_clusters, standardize=True, use_pca=True)

In [None]:
imgutils.showimgset(imgfiles, 2,2, fig_size=(12, 8), relspacing=(0.05,0.05))

In [None]:
df['dummy'] = 0
imgutils.show_large_heatmap(df, 'dummy', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,10))

In [None]:
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,10))

In [None]:
imgfile1 = imgfiles[2]
#df_heats1 = df[df['filename']==imgfile1]['kmeans']
#heats = np.reshape(df_heats1.values, (default_grid_y, default_grid_x))

subimgs, heats = imgutils.getimgslices_fromdf(df, imgfile1, 'kmeans')
# Scale the cluster labels by their maximum for the colormap. When every slice
# of this image is in cluster 0 the maximum is 0; skip the division then, as
# 0/0 would turn the whole heat map into NaN.
max_heat = np.max(heats)
if max_heat > 0:
    heats = heats / max_heat
imgutils.showheatmap(subimgs, heats, cmapname='Set1', heatdepend_opacity = False)

In [None]:
n_patches = 20
df2 = imgutils.slicestats([imgfile1], n_patches, n_patches, feature_funcs)

In [None]:
run_ml_pipelines2(df2, ['img_std', 'img_range'], 3, standardize=True, use_pca=True)

In [None]:
imgutils.show_large_heatmap(df2, 'kmeans', [imgfile1], n_rows=1, n_cols=1, fig_size=(14,14))

In [None]:
imgutils.showimgset(imgfiles, 2,2, fig_size=(12, 8), relspacing=(0.05,0.05))

### For nice graphs, run it once more with two clusters and two levels of granularity

In [None]:
def change_clusternums(df, columnname, oldnew_dict):
    """Relabel cluster numbers in one column of df; df is modified in place.

    Parameters:
        df: DataFrame containing the label column.
        columnname: label of the column to relabel.
        oldnew_dict: mapping {old label: new label}; labels not in the
            mapping are left unchanged.
    """
    # Assign the result back instead of calling replace(inplace=True) on
    # df[columnname]: that chained in-place call acts on a temporary Series and
    # does not update df when pandas Copy-on-Write is active.
    df[columnname] = df[columnname].replace(oldnew_dict)
    
def swap_clusters(df, columnname, clust1, clust2):
    """Exchange two cluster labels in one column of df; df is modified in place.

    Parameters:
        df: DataFrame containing the label column.
        columnname: label of the column to relabel.
        clust1, clust2: the two labels to swap; all other labels are untouched.
    """
    oldnew_dict = {clust1: clust2, clust2: clust1}
    # Assign the result back instead of calling replace(inplace=True) on
    # df[columnname]: that chained in-place call acts on a temporary Series and
    # does not update df when pandas Copy-on-Write is active.
    df[columnname] = df[columnname].replace(oldnew_dict)

In [None]:
n_patches = 4
df_coarse = imgutils.slicestats([imgfile1], n_patches, n_patches, feature_funcs)
run_ml_pipelines2(df_coarse, feature_names, 2, standardize=True, use_pca=True)

In [None]:
swap_clusters(df_coarse, 'kmeans', 0, 1)

In [None]:
imgutils.show_large_heatmap(df_coarse, 'kmeans', [imgfile1], n_rows=1, n_cols=1, fig_size=(10,10))
imgutils.show_large_heatmap(df_coarse, 'kmeans', [imgfile1], n_rows=1, n_cols=1, fig_size=(10,10), heatdependent_opacity=True)

**Fine grained**

In [None]:
n_patches = 20
df_fine = imgutils.slicestats([imgfile1], n_patches, n_patches, feature_funcs)
run_ml_pipelines2(df_fine, feature_names, 2, standardize=True, use_pca=True)

In [None]:
#swap_clusters(df_fine, 'kmeans', 0, 1)

In [None]:
imgutils.show_large_heatmap(df_fine, 'kmeans', [imgfile1], n_rows=1, n_cols=1, fig_size=(10,10))
imgutils.show_large_heatmap(df_fine, 'kmeans', [imgfile1], n_rows=1, n_cols=1, fig_size=(10,10), heatdependent_opacity=True)

### Use other scoring

Adjusting the ml_pipeline to base the scoring on its last transformation:
(later renamed the other ones to run_xxx2 to preserve them)

In [None]:
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    # Older scikit-learn releases only ship the misspelled name.
    from sklearn.metrics import calinski_harabaz_score as calinski_harabasz_score
# Keep the name this cell originally imported available as well.
calinski_harabaz_score = calinski_harabasz_score


def run_ml_pipeline(X, ml_name, ml_algorithm, standardize=True, use_pca=True, n_pca=None):
    """Cluster X after optional scaling/PCA and score the labels on the transformed data.

    The steps are chained by hand (no sklearn Pipeline) because the
    transformed features are needed afterwards for the scoring.

    Parameters:
        X: feature data.
        ml_name: not used in the body; kept so the signature matches run_ml_pipeline2.
        ml_algorithm: clustering estimator providing fit_predict.
        standardize: when true, X is passed through StandardScaler first.
        use_pca: when true, PCA is applied after the (optional) scaling.
        n_pca: forwarded to PCA as n_components.

    Returns:
        (score, y): the Calinski-Harabasz score computed on the data actually
        fed to the estimator, and the predicted labels.
    """
    feat_data = X
    if standardize:
        feat_data = StandardScaler().fit_transform(feat_data)
    if use_pca:
        feat_data = PCA(n_components=n_pca).fit_transform(feat_data)

    # run the clustering; fit_predict returns the labels
    y = ml_algorithm.fit_predict(feat_data)

    # report score (silhouette_score(feat_data, y) would be the alternative)
    score = calinski_harabasz_score(feat_data, y)

    return score, y

In [None]:
def run_ml_pipelines(df_data, feature_cols, n_clusters, standardize=True, use_pca=True, n_pca=None):
    """Run k-means and complete-linkage agglomerative clustering via run_ml_pipeline.

    Parameters:
        df_data: DataFrame holding the feature columns; it is modified in place,
            receiving the label columns 'kmeans' and 'hierarchical'.
        feature_cols: labels of the columns in df_data used as features.
        n_clusters: number of clusters requested from both algorithms.
        standardize, use_pca, n_pca: forwarded unchanged to run_ml_pipeline.

    Returns:
        None. Both scores are printed.

    Reads the notebook-level setting kmeans_n_init.
    """
    X = df_data.loc[:, feature_cols]

    # Setup ML clustering algorithms.
    # KMeans(algorithm='auto') and AgglomerativeClustering(affinity='euclidean')
    # are no longer accepted by current scikit-learn releases. Both were the
    # defaults, so they are simply left out; this works on old and new versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=kmeans_n_init, init='k-means++')
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')

    # run the pipelines
    print("Executing clustering pipelines...")
    score_kmeans, y_kmeans = run_ml_pipeline(X, 'kmeans', kmeans, standardize=standardize,
                                             use_pca=use_pca, n_pca=n_pca)
    score_hier, y_hier = run_ml_pipeline(X, 'hierarchical', agglomerative, standardize=standardize,
                                         use_pca=use_pca, n_pca=n_pca)
    print("Done\n")

    # collect data
    df_data['kmeans'] = y_kmeans
    df_data['hierarchical'] = y_hier

    # report results:
    print("\nClustering Scores:")
    print("K-means: ", score_kmeans)
    print("Hierarchical: ", score_hier)

In [None]:
run_ml_pipelines(df, feature_names, default_n_clusters, standardize=True, use_pca=True)

This is more consistent with previous results, and imo it is indeed better to assess the algorithm on how well it clusters after all pre-processing.

## 5. Visualize results

In [None]:
imgutils.showimgset(imgfiles, 2,2, fig_size=(12, 8), relspacing=(0.05,0.05))

In [None]:
run_ml_pipelines(df, feature_names, default_n_clusters, standardize=True, use_pca=True)
s = (12,10)
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)
imgutils.show_large_heatmap(df, 'hierarchical', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)

**Here Hierarchical outperforms kmeans**

Run it again without PCA and/or normalization and compare the results

In [None]:
run_ml_pipelines(df, feature_names, default_n_clusters, standardize=True, use_pca=False)
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)
imgutils.show_large_heatmap(df, 'hierarchical', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)

In [None]:
run_ml_pipelines(df, feature_names, default_n_clusters, standardize=False, use_pca=False)
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)
imgutils.show_large_heatmap(df, 'hierarchical', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s)

### On this dataset, not much difference between hierarchical and pca, with or without normalization 


## 6. Combine import and pipeline:

In [None]:
def import_data(imagefolder):
    """Scan imagefolder for '.tif' files and return the 'filename' entries as a list."""
    return list(imgutils.scanimgdir(imagefolder, '.tif')['filename'])

def extract_features(imgfiles, feature_funcs, n_grid_rows, n_grid_cols):
    """Compute slice statistics for the images and return them with the feature names.

    Parameters:
        imgfiles: image files handed to imgutils.slicestats.
        feature_funcs: statistic functions to evaluate.
        n_grid_rows, n_grid_cols: grid size, forwarded to imgutils.slicestats
            in this order.

    Returns:
        (df, feature_names): the result of imgutils.slicestats and the names
        reported by imgutils.stat_names for feature_funcs.
    """
    slice_table = imgutils.slicestats(imgfiles, n_grid_rows, n_grid_cols, feature_funcs)
    return slice_table, imgutils.stat_names(feature_funcs)
    

In [None]:
def run_kmeans_pipeline(df_data, feature_cols, n_clusters, standardize=True, use_pca=True, n_pca= None):
    """Cluster the feature columns of df_data with k-means via run_ml_pipeline.

    Parameters:
        df_data: DataFrame holding the feature columns; it is modified in place,
            receiving the label column 'kmeans'.
        feature_cols: labels of the columns in df_data used as features.
        n_clusters: number of clusters.
        standardize, use_pca, n_pca: forwarded unchanged to run_ml_pipeline.

    Returns:
        The score reported by run_ml_pipeline.

    Reads the notebook-level setting kmeans_n_init.
    """
    ml_name = "kmeans"
    # algorithm='auto' is no longer accepted by current scikit-learn releases;
    # it was the default, so leaving it out works on old and new versions.
    ml_algorithm = KMeans(n_clusters=n_clusters, n_init=kmeans_n_init, init='k-means++')

    X = df_data.loc[:, feature_cols]
    score, y = run_ml_pipeline(X, ml_name, ml_algorithm, standardize=standardize,
                               use_pca=use_pca, n_pca=n_pca)
    df_data[ml_name] = y

    return score

def run_hierarchical_pipeline(df_data, feature_cols, n_clusters, standardize=True, use_pca=True, n_pca=None):
    """Cluster the feature columns of df_data with complete-linkage agglomerative clustering.

    Parameters:
        df_data: DataFrame holding the feature columns; it is modified in place,
            receiving the label column 'hierarchical'.
        feature_cols: labels of the columns in df_data used as features.
        n_clusters: number of clusters.
        standardize, use_pca, n_pca: forwarded unchanged to run_ml_pipeline.

    Returns:
        The score reported by run_ml_pipeline.
    """
    ml_name = "hierarchical"
    # affinity='euclidean' is no longer accepted by current scikit-learn
    # releases; euclidean is the default distance, so leaving the argument out
    # works on old and new versions.
    ml_algorithm = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')

    X = df_data.loc[:, feature_cols]
    score, y = run_ml_pipeline(X, ml_name, ml_algorithm, standardize=standardize,
                               use_pca=use_pca, n_pca=n_pca)
    df_data[ml_name] = y

    return score

In [None]:
def run_fullpipeline(imagefolder, n_image_rows, n_image_cols,
                     n_grid_rows, n_grid_cols, feature_funcs, n_clusters, fig_size=(8,6), return_df = False):
    """
    Import, extract features, cluster with k-means and hierarchical, then plot both.

    Parameters:
        imagefolder: folder handed to import_data.
        n_image_rows, n_image_cols: forwarded to the heat map as n_rows / n_cols.
        n_grid_rows, n_grid_cols: slicing grid, forwarded to extract_features.
        feature_funcs: statistic functions used as features.
        n_clusters: number of clusters for both algorithms.
        fig_size: forwarded to the heat map.
        return_df: when true, the DataFrame with features and labels is returned.

    Returns:
        The DataFrame if return_df is true, otherwise None.
    """
    print("Working...\r")
    tile_files = import_data(imagefolder)
    slices, cols = extract_features(tile_files, feature_funcs, n_grid_rows, n_grid_cols)
    print(cols)

    # k-means runs with scaling and PCA switched on, hierarchical with both off
    kmeans_score = run_kmeans_pipeline(slices, cols, n_clusters, standardize=True, use_pca=True)
    hier_score = run_hierarchical_pipeline(slices, cols, n_clusters, standardize=False, use_pca=False)

    print('Results:')
    print('Score k-means:', kmeans_score)
    print('Score hierarchical:', hier_score)

    print('Visualizing...')
    for label_col in ('kmeans', 'hierarchical'):
        imgutils.show_large_heatmap(slices, label_col, tile_files,
                                    n_rows=n_image_rows, n_cols=n_image_cols, fig_size=fig_size)

    return slices if return_df else None
    

In [None]:
def run_fullpipeline_kmeans(imagefolder, n_image_rows, n_image_cols,
                     n_grid_rows, n_grid_cols, feature_funcs, n_clusters, fig_size=(8,6), return_df = False):
    """
    Import, extract features, cluster with k-means only, then plot the result.

    Parameters:
        imagefolder: folder handed to import_data.
        n_image_rows, n_image_cols: forwarded to the heat map as n_rows / n_cols.
        n_grid_rows, n_grid_cols: slicing grid, forwarded to extract_features.
        feature_funcs: statistic functions used as features.
        n_clusters: number of clusters.
        fig_size: forwarded to the heat map.
        return_df: when true, the DataFrame with features and labels is returned.

    Returns:
        The DataFrame if return_df is true, otherwise None.
    """
    print("Working...\r")
    tile_files = import_data(imagefolder)
    slices, cols = extract_features(tile_files, feature_funcs, n_grid_rows, n_grid_cols)
    print(cols)

    # k-means runs with scaling and PCA switched on
    kmeans_score = run_kmeans_pipeline(slices, cols, n_clusters, standardize=True, use_pca=True)

    print('Results:')
    print('Score k-means:', kmeans_score)

    print('Visualizing...')
    imgutils.show_large_heatmap(slices, 'kmeans', tile_files,
                                n_rows=n_image_rows, n_cols=n_image_cols, fig_size=fig_size)

    return slices if return_df else None
    

In [None]:
def run_fullpipeline_hierarchical(imagefolder, n_image_rows, n_image_cols,
                     n_grid_rows, n_grid_cols, feature_funcs, n_clusters, fig_size=(8,6), return_df = False):
    """
    Import, extract features, cluster with hierarchical only, then plot the result.

    Parameters:
        imagefolder: folder handed to import_data.
        n_image_rows, n_image_cols: forwarded to the heat map as n_rows / n_cols.
        n_grid_rows, n_grid_cols: slicing grid, forwarded to extract_features.
        feature_funcs: statistic functions used as features.
        n_clusters: number of clusters.
        fig_size: forwarded to the heat map.
        return_df: when true, the DataFrame with features and labels is returned.

    Returns:
        The DataFrame if return_df is true, otherwise None.
    """
    print("Working...\r")
    tile_files = import_data(imagefolder)
    slices, cols = extract_features(tile_files, feature_funcs, n_grid_rows, n_grid_cols)
    print(cols)

    # hierarchical runs with scaling and PCA switched off
    hier_score = run_hierarchical_pipeline(slices, cols, n_clusters, standardize=False, use_pca=False)

    print('Results:')
    print('Score hierarchical:', hier_score)

    print('Visualizing...')
    imgutils.show_large_heatmap(slices, 'hierarchical', tile_files,
                                n_rows=n_image_rows, n_cols=n_image_cols, fig_size=fig_size)

    return slices if return_df else None
    

## 7. Try it out with different combinations of slices on a harder variant

In [None]:
datafolder = '../data/Asbestos_Aug30/SA_TileSet_Subset1_1K'

### 4x4 - 3 clusters

In [None]:
run_fullpipeline(datafolder, n_tiles_y, n_tiles_x, 4, 4, feature_funcs, 3)

### 8x8, 3 clusters

In [None]:
run_fullpipeline(datafolder, n_tiles_y, n_tiles_x, 8, 8, feature_funcs, 3)

### 20x20, 3 clusters

In [None]:
run_fullpipeline(datafolder, n_tiles_y, n_tiles_x, 20, 20, feature_funcs, 3)

<hr>

## 8. Try it out with different number of clusters

### 2 clusters (4x4 , 10x10, 20x20)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 4, 4, feature_funcs, 2)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 10, 10, feature_funcs, 2)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 20, 20, feature_funcs, 2)

Look again at hierarchical with scaling and pca on:

In [None]:
print(feature_funcs)

No difference; none of the results are good. Let's try dropping the mean and mode (to see whether the clustering then ignores the black tiles).

In [None]:
try_funcs =  [imgutils.img_std, imgutils.img_relstd, imgutils.img_skewness,imgutils.img_kurtosis]
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 8, 8, try_funcs , 4)

### 4 clusters (4x4, 10x10, 20x20)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 4, 4, feature_funcs, 4)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 10, 10, feature_funcs, 4)

In [None]:
run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 20, 20, feature_funcs, 4)

**With smaller grid and 4 clusters, it starts to make sense**

In [None]:
run_fullpipeline_hierarchical(datafolder, n_tiles_y, n_tiles_x, 10, 10, feature_funcs, 4)

**AGAIN, THE BLACK TILES NEGATIVELY IMPACT RESULTS**

# Try two-step pipeline
## step 1: filter out black tiles
## step 2: cluster remaining tiles

### Parametrize:

In [None]:
n_clusters_step1 = 3
n_clusters_step2_kmeans = 2
n_clusters_step2_hierarchical = 2

n_patches_x = 10
n_patches_y = 10

### Feature extract

In [None]:
def change_clusternums(df, columnname, oldnew_dict):
    """Relabel cluster numbers in one column of df; df is modified in place.

    Parameters:
        df: DataFrame containing the label column.
        columnname: label of the column to relabel.
        oldnew_dict: mapping {old label: new label}; labels not in the
            mapping are left unchanged.
    """
    # Assign the result back instead of calling replace(inplace=True) on
    # df[columnname]: that chained in-place call acts on a temporary Series and
    # does not update df when pandas Copy-on-Write is active.
    df[columnname] = df[columnname].replace(oldnew_dict)
    
def swap_clusters(df, columnname, clust1, clust2):
    """Exchange two cluster labels in one column of df; df is modified in place.

    Parameters:
        df: DataFrame containing the label column.
        columnname: label of the column to relabel.
        clust1, clust2: the two labels to swap; all other labels are untouched.
    """
    oldnew_dict = {clust1: clust2, clust2: clust1}
    # Assign the result back instead of calling replace(inplace=True) on
    # df[columnname]: that chained in-place call acts on a temporary Series and
    # does not update df when pandas Copy-on-Write is active.
    df[columnname] = df[columnname].replace(oldnew_dict)

In [None]:
# reset
df = df.drop(columns=['kmeans'])
df2 = None
df3 = None

In [None]:
imgfiles = import_data(datafolder)
df, feature_names = extract_features(imgfiles, feature_funcs, n_patches_y, n_patches_x)

In [None]:
imgutils.showimgset(imgfiles, 2,2, fig_size=(12, 8), relspacing=(0.05,0.05))

In [None]:
df['dummy']=0
imgutils.show_large_heatmap(df, 'dummy', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,12))
imgutils.show_large_heatmap(df, 'dummy', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,12), no_borders=True)

### Step 1: filter-out black tiles

In [None]:
_ = run_kmeans_pipeline(df, feature_names, n_clusters_step1, standardize=True, use_pca=True )

In [None]:
#swap_clusters(df,'kmeans', 2, 1)

In [None]:
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,12))

In [None]:
imgutils.show_large_heatmap(df, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,12), heatdependent_opacity=True, no_borders=True)

In [None]:
df['kmeans'].value_counts()

In [None]:
# cat_select = 1  
# we know for this it's the biggest set
i_max_count = df['kmeans'].value_counts()
cat_select = i_max_count.index[0]
print(cat_select)

In [None]:
# Take an explicit copy of the selected rows: df2 gets label columns assigned
# in the following cells, and assigning into a filtered slice of df triggers
# pandas' SettingWithCopyWarning (hidden here by the global warnings filter).
df2 = df[df['kmeans']==cat_select].copy()
df2.head(3)

In [None]:
score_kmeans = run_kmeans_pipeline(df2, feature_names, n_clusters_step2_kmeans, standardize=True, use_pca=True )
score_hierarch = run_hierarchical_pipeline(df2, feature_names, n_clusters_step2_hierarchical, standardize=False, use_pca=False )

In [None]:
df2['kmeans'].value_counts()

In [None]:
df2['hierarchical'].value_counts()

In [None]:
df2=df2.rename(columns = {'kmeans':'kmeans2', 'hierarchical':'hierarchical2'})

In [None]:
df2.head(3)

In [None]:
df3 = df.merge(df2, 'left')

In [None]:
df3.head(3)

In [None]:
# Rows without a step-2 label (NaN after the left merge) get -1.
# Assigned back rather than fillna(inplace=True) on df3['kmeans2'], which does
# not update df3 when pandas Copy-on-Write is active.
df3['kmeans2'] = df3['kmeans2'].fillna(value=-1)

In [None]:
# Rows without a step-2 label (NaN after the left merge) get -1.
# Assigned back rather than fillna(inplace=True) on df3['hierarchical2'], which
# does not update df3 when pandas Copy-on-Write is active.
df3['hierarchical2'] = df3['hierarchical2'].fillna(value=-1)

In [None]:
df3.head(3)

In [None]:
df3['heats']=df3['kmeans2']+1

In [None]:
df3['heats'].value_counts()

In [None]:
# make the whole 2 clusters only: fold heat value 1 into 0
# (assigned back; a chained replace(inplace=True) on df3['heats'] does not
# update df3 when pandas Copy-on-Write is active)
df3['heats'] = df3['heats'].replace({1:0})
df3['heats'].value_counts()

In [None]:
df3['heats2']=df3['hierarchical2']+1

In [None]:
df3['heats2'].value_counts()

In [None]:
# make the whole 2 clusters only: fold heat value 1 into 0
# (assigned back; a chained replace(inplace=True) on df3['heats2'] does not
# update df3 when pandas Copy-on-Write is active)
df3['heats2'] = df3['heats2'].replace({1:0})
df3['heats2'].value_counts()

In [None]:
imgutils.show_large_heatmap(df3, 'heats', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(16,12))
imgutils.show_large_heatmap(df3, 'heats', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(16,12), heatdependent_opacity=True, no_borders=True)

In [None]:
imgutils.show_large_heatmap(df3, 'heats2', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(16,12))

**Alternative: run one with 4 clusters and small patch-size**

In [None]:
df_direct4 = run_fullpipeline_kmeans(datafolder, n_tiles_y, n_tiles_x, 12, 12, feature_funcs, 4, return_df = True)

In [None]:
#swap_clusters(df_direct4, 'kmeans', 1,2)

In [None]:
imgutils.show_large_heatmap(df_direct4, 'kmeans', imgfiles, n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=(12,12), heatdependent_opacity=True, no_borders=True)

# Yet another set


In [None]:
datafolder_lm = '../data/Asbestos_Aug30/LM_TileSet_Subset_1K'
imgfiles_lm = import_data(datafolder_lm)

In [None]:
run_fullpipeline_kmeans(datafolder_lm, n_tiles_y, n_tiles_x, 10, 10, feature_funcs, 3, fig_size=(12,10))

In [None]:
run_fullpipeline_kmeans(datafolder_lm, n_tiles_y, n_tiles_x, 10, 10, feature_funcs, 4,fig_size=(12,10))

In [None]:
run_fullpipeline_kmeans(datafolder_lm, n_tiles_y, n_tiles_x, 20, 20, feature_funcs, 4, fig_size=(12,10))

## 2 step approach on this set

**Parametrize**

In [None]:
n_clusters_step1 = 3
n_clusters_step2_kmeans = 2
n_clusters_step2_hierarchical = 2

n_patches_x = 20
n_patches_y = 20

n_tiles_x3 = 2
n_tiles_y3 = 2

datafolder3= '../data/Asbestos_Aug30/LM_TileSet_Subset_1K'

In [None]:
# reset
df = df.drop(columns=['kmeans'])
df2 = None
df3 = None

**1. Filter black tiles**

In [None]:
imgfiles3 = import_data(datafolder3)

In [None]:
df, feature_names = extract_features(imgfiles3, feature_funcs, n_patches_y, n_patches_x)
# report the size of the set that was just processed (imgfiles3, not the
# imgfiles list left over from the previous data set)
print(len(imgfiles3))

In [None]:
df['dummy']=0
imgutils.show_large_heatmap(df, 'dummy', imgfiles3, n_rows=n_tiles_y3, n_cols=n_tiles_x3, fig_size=(12,12))
imgutils.show_large_heatmap(df, 'dummy', imgfiles3, n_rows=n_tiles_y3, n_cols=n_tiles_x3, fig_size=(12,12), no_borders=True)

In [None]:
_ = run_kmeans_pipeline(df, feature_names, n_clusters_step1, standardize=True, use_pca=True )

In [None]:
#swap_clusters(df,'kmeans', 2, 1)

In [None]:
# df was extracted from imgfiles3, so the heat map must be drawn over those
# images (imgfiles still refers to the previous data set)
imgutils.show_large_heatmap(df, 'kmeans', imgfiles3, n_rows=n_tiles_y3, n_cols=n_tiles_x3, fig_size=(12,12))

In [None]:
df['kmeans'].value_counts()

In [None]:
# cat_select = 1  
# we know for this it's the biggest set
# value_counts() lists the most frequent label first
i_max_count = df['kmeans'].value_counts()
cat_select = i_max_count.index[0]
print(cat_select)
# Explicit copy: df2 gets label columns assigned in the next cell, and
# assigning into a filtered slice of df triggers SettingWithCopyWarning.
df2 = df[df['kmeans']==cat_select].copy()
df2.head(3)

In [None]:
score_kmeans = run_kmeans_pipeline(df2, feature_names, n_clusters_step2_kmeans, standardize=True, use_pca=True )
score_hierarch = run_hierarchical_pipeline(df2, feature_names, n_clusters_step2_hierarchical, standardize=False, use_pca=False )

In [None]:
df2=df2.rename(columns = {'kmeans':'kmeans2', 'hierarchical':'hierarchical2'})
df3 = df.merge(df2, 'left')
# Rows of df that were not part of df2 have no step-2 label after the left
# merge; mark them with -1. Assigned back rather than fillna(inplace=True) on
# the column, which does not update df3 when pandas Copy-on-Write is active.
df3['kmeans2'] = df3['kmeans2'].fillna(value=-1)
df3['hierarchical2'] = df3['hierarchical2'].fillna(value=-1)
df3['heats']=df3['kmeans2']+1
df3['heats2']=df3['hierarchical2']+1

In [None]:
df3['heats'].value_counts()

In [None]:
# make the whole 2 clusters only: fold heat value 1 into 0
# (assigned back; a chained replace(inplace=True) on the column does not
# update df3 when pandas Copy-on-Write is active)
df3['heats'] = df3['heats'].replace({1:0})
df3['heats'].value_counts()

# make the whole 2 clusters only
df3['heats2'] = df3['heats2'].replace({1:0})
df3['heats2'].value_counts()

In [None]:
imgutils.show_large_heatmap(df3, 'heats', imgfiles3, n_rows=n_tiles_y3, n_cols=n_tiles_x3, fig_size=(16,12))
imgutils.show_large_heatmap(df3, 'heats2', imgfiles3, n_rows=n_tiles_y3, n_cols=n_tiles_x3, fig_size=(16,12))

### after filtering out the black ones, hierarchical clustering worked better than kmeans 

**This set needs smaller patches**