# Full Pipeline (on Tileset7) - Aug 2017
Created:  21 Aug 2018 <br>
Last update: 24 Aug 2018


### Goal: Combine the relevant steps from data import to unsupervised learning 

Many functions have gradually been developed in the prior notebooks (and added to 'imgutils'). In this notebook, the steps will be combined without all the intermediate analysis.


<hr>
## 1. Imports

In [None]:
# Suppress all warning messages
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

# import
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

import imgutils

In [None]:
# Re-run this cell if you altered imgutils: reload() re-executes the already
# imported module, so the edits become available without restarting the kernel.
import importlib
importlib.reload(imgutils)

<hr>
## 2. Data Definitions & Feature Specification

In [None]:
# Image location and tile layout (the layout is mostly used for visualization):
datafolder = '../data/Crystals_Apr_12/Tileset7'
n_tiles_x, n_tiles_y = 3, 2

# Statistics extracted from every slice.
# (An earlier selection was: img_mean, img_std, img_median, img_mode,
#  img_kurtosis, img_skewness.)
feature_funcs = [
    imgutils.img_std,
    imgutils.img_relstd,
    imgutils.img_mean,
    imgutils.img_skewness,
    imgutils.img_kurtosis,
    imgutils.img_mode,
]
feature_names = imgutils.stat_names(feature_funcs)

# Slicing grid: number of slices per image in x and y direction (square grid).
n_rows = n_cols = 4

<hr>
## 3. Import Data & Extract Features

In [None]:
# Image import: ask imgutils.scanimgdir for the '.tif' entries of the data folder
# and keep the values of its 'filename' column as a plain list.
print("Scanning for images in '{}'...".format(datafolder))
df_imgfiles = imgutils.scanimgdir(datafolder, '.tif')
imgfiles = list(df_imgfiles['filename'])
print("# of images: {} \n".format(len(imgfiles)))

# Feature extraction: imgutils.slicestats receives the image list, the grid size
# and the statistic functions; len(df) is reported below as the number of slices.
print("Feature extraction...")
print("- Slicing up images in {} x {} patches. ".format(n_rows, n_cols))
print("- Extract statistics from each slice: {} ".format(', '.join(feature_names)))
# end='\r' moves the cursor back to the start of the line, so the next print
# overwrites this progress message.
print("...working...", end='\r')
df = imgutils.slicestats(imgfiles, n_rows, n_cols, feature_funcs)
print("# slices extracted: ", len(df))



<hr>
## 4. Machine Learning Pipeline

### Hyper parameters

In [None]:
# data hyper-parameters
# n_clusters is the default for the `n_clust` argument of the pipeline functions below.
n_clusters = 3
# NOTE(review): n_important_features is not referenced anywhere else in the visible notebook.
n_important_features = len(feature_names)

# algorithm hyper-parameters:
# kmeans_n_init is passed to KMeans as n_init; pca_n_components is passed to PCA as n_components.
kmeans_n_init = 10
pca_n_components = None   # i.e. all

In [None]:
def run_ml_pipeline(X, ml_name, ml_algorithm, standardize=True, use_pca=True):
    """
    Cluster X with one algorithm, optionally preceded by scaling and PCA.

    Parameters:
        X: feature data handed to the pipeline's fit_predict.
        ml_name: step name of the clustering algorithm (also part of the scaler's step name).
        ml_algorithm: clustering estimator used as the last pipeline step.
        standardize: if True, a StandardScaler is the first step.
        use_pca: if True, a PCA step (n_components = notebook-level pca_n_components)
            runs after the scaling and before the clustering.

    Returns:
        (score, y): the silhouette score and the cluster labels.
        Note that the score is computed on X as passed in, not on the
        scaled / PCA-transformed data the clustering step actually sees.
    """
    # Collect the steps in execution order: scaling -> pca -> clustering
    steps = []
    if standardize:
        steps.append(('scaling_{0}'.format(ml_name), StandardScaler()))
    if use_pca:
        steps.append(('pca', PCA(n_components=pca_n_components)))
    steps.append((ml_name, ml_algorithm))

    # fit_predict delegates to the last step to obtain the labels
    labels = Pipeline(steps).fit_predict(X)

    return silhouette_score(X, labels), labels

def run_ml_pipelines(df_data, feature_cols, n_clust = n_clusters, standardize=True, use_pca=True):
    """
    Run k-means and complete-linkage hierarchical clustering with identical settings.

    Parameters:
        df_data: data frame holding the feature columns; it is modified in place:
            the labels are stored in the columns 'kmeans' and 'hierarchical'.
        feature_cols: names of the columns used as clustering input.
        n_clust: number of clusters for both algorithms.
        standardize: forwarded to run_ml_pipeline (StandardScaler step on/off).
        use_pca: forwarded to run_ml_pipeline (PCA step on/off).

    Returns:
        None; the silhouette scores of both algorithms are printed.
    """
    X = df_data.loc[:, feature_cols]

    # Setup ML clustering algorithms.
    # The k-means algorithm variant and the euclidean distance are left at sklearn's
    # defaults: the explicit `algorithm='auto'` and `affinity='euclidean'` arguments
    # only restated those defaults and are rejected by newer sklearn releases.
    kmeans = KMeans(n_clusters=n_clust, n_init=kmeans_n_init, init='k-means++')
    agglomerative = AgglomerativeClustering(n_clusters=n_clust, linkage='complete')

    # run the pipelines
    print("Executing clustering pipelines...")
    score_kmeans, y_kmeans = run_ml_pipeline(X, 'kmeans', kmeans, standardize=standardize, use_pca=use_pca)
    score_hier, y_hier = run_ml_pipeline(X, 'hierarchical', agglomerative, standardize=standardize, use_pca=use_pca)
    print("Done\n")

    # collect data
    df_data['kmeans'] = y_kmeans
    df_data['hierarchical'] = y_hier

    # report results:
    print("\nClustering Scores:")
    print("K-means: ", score_kmeans)
    print("Hierarchical: ", score_hier)
       

In [None]:
# Baseline run: both algorithms with the scaling and the PCA step switched on
run_ml_pipelines(df_data=df, feature_cols=feature_names, standardize=True, use_pca=True)

### Why do these scores deviate from the previous notebook? Is there something going on with the sklearn pipeline or StandardScaler?


<hr>
## 4b. Scoring issue, investigated in a separate notebook

(see realxtals1-pipeline_scoring_issues)

Nothing was wrong with the pipeline or with the original step-by-step implementation. The deviation was caused by a different basis for the score calculation: the pipeline implementation computes the 'internal clustering score' on the original data, while the step-by-step implementation computes it on the normalized data.

As such, turning normalization off (in the pipeline) gives higher scores. This does not mean the outcome is better; that needs visual inspection.

## 5. Visualize results

In [None]:
# Show the 'kmeans' and 'hierarchical' label columns as large heatmaps on the tile layout
s = (8, 6)
imgutils.show_large_heatmap(
    df, 'kmeans', imgfiles,
    n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s,
)
imgutils.show_large_heatmap(
    df, 'hierarchical', imgfiles,
    n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s,
)

Run it again without normalization and PCA, and compare the results:

In [None]:
# Same clustering, but on the raw features (no scaling, no PCA); the label
# columns in df are overwritten and visualized again.
run_ml_pipelines(df_data=df, feature_cols=feature_names, standardize=False, use_pca=False)
imgutils.show_large_heatmap(
    df, 'kmeans', imgfiles,
    n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s,
)
imgutils.show_large_heatmap(
    df, 'hierarchical', imgfiles,
    n_rows=n_tiles_y, n_cols=n_tiles_x, fig_size=s,
)

### On this dataset, hierarchical works slightly better without normalization (!?!)

Hence, the method that runs both pipelines with the same settings is less useful, as the two algorithms need different parameterization. So let's define a separate method per algorithm below, plus one that runs the full pipeline including slicing.

## 6. Combine import and pipeline:

In [None]:
def import_data(imagefolder):
    """
    Return the list of '.tif' image filenames that imgutils.scanimgdir reports for imagefolder.

    Parameters:
        imagefolder: folder to scan for images.

    Returns:
        list with the values of the 'filename' column of the scan result.
    """
    # Scan the folder that was passed in; previously the notebook-level
    # `datafolder` was used here, so the argument was silently ignored.
    df_imgfiles = imgutils.scanimgdir(imagefolder, '.tif')
    return list(df_imgfiles['filename'])

def extract_features(imgfiles, feature_funcs, n_grid_rows, n_grid_cols):
    """
    Slice the images on an n_grid_rows x n_grid_cols grid and compute the statistics per slice.

    Parameters:
        imgfiles: image filenames passed on to imgutils.slicestats.
        feature_funcs: statistic functions applied to the slices.
        n_grid_rows, n_grid_cols: grid size used for slicing each image.

    Returns:
        (df, feature_names): the result of imgutils.slicestats and the
        feature names that imgutils.stat_names derives from feature_funcs.
    """
    slice_stats = imgutils.slicestats(imgfiles, n_grid_rows, n_grid_cols, feature_funcs)
    return slice_stats, imgutils.stat_names(feature_funcs)
    

In [None]:
def run_kmeans_pipeline(df_data, feature_cols, n_clust = n_clusters, standardize=True, use_pca=True):
    """
    Cluster the selected feature columns with k-means.

    Parameters:
        df_data: data frame holding the feature columns; modified in place:
            the cluster labels are stored in the column 'kmeans'.
        feature_cols: names of the columns used as clustering input.
        n_clust: number of clusters.
        standardize: forwarded to run_ml_pipeline (StandardScaler step on/off).
        use_pca: forwarded to run_ml_pipeline (PCA step on/off).

    Returns:
        the silhouette score reported by run_ml_pipeline.
    """
    ml_name = "kmeans"
    # The algorithm variant is left at sklearn's default: the explicit
    # `algorithm='auto'` only restated that default and is rejected by newer
    # sklearn releases.
    ml_algorithm = KMeans(n_clusters=n_clust, n_init=kmeans_n_init, init='k-means++')

    X = df_data.loc[:, feature_cols]
    score, y = run_ml_pipeline(X, ml_name, ml_algorithm, standardize=standardize, use_pca=use_pca)
    df_data[ml_name] = y

    return score

def run_hierarchical_pipeline(df_data, feature_cols, n_clust = n_clusters, standardize=True, use_pca=True):
    """
    Cluster the selected feature columns with complete-linkage agglomerative clustering.

    Parameters:
        df_data: data frame holding the feature columns; modified in place:
            the cluster labels are stored in the column 'hierarchical'.
        feature_cols: names of the columns used as clustering input.
        n_clust: number of clusters.
        standardize: forwarded to run_ml_pipeline (StandardScaler step on/off).
        use_pca: forwarded to run_ml_pipeline (PCA step on/off).

    Returns:
        the silhouette score reported by run_ml_pipeline.
    """
    ml_name = "hierarchical"
    # The euclidean distance is sklearn's default; the explicit `affinity='euclidean'`
    # argument was renamed to `metric` and is rejected by newer sklearn releases,
    # so it is omitted to stay compatible with old and new versions.
    ml_algorithm = AgglomerativeClustering(n_clusters=n_clust, linkage='complete')

    X = df_data.loc[:, feature_cols]
    score, y = run_ml_pipeline(X, ml_name, ml_algorithm, standardize=standardize, use_pca=use_pca)
    df_data[ml_name] = y

    return score

In [None]:
def run_fullpipeline(imagefolder, n_image_rows, n_image_cols,
                     n_grid_rows, n_grid_cols, feature_funcs, n_clusters):
    """
    Run all steps in one go: image import, feature extraction, both clusterings, heatmaps.

    Parameters:
        imagefolder: folder handed to import_data.
        n_image_rows, n_image_cols: layout passed to the heatmaps as n_rows / n_cols.
        n_grid_rows, n_grid_cols: slicing grid per image.
        feature_funcs: statistic functions computed per slice.
        n_clusters: number of clusters for both algorithms.

    Returns:
        None; the scores are printed and the heatmaps are shown.
    """
    print("Working...\r")
    image_paths = import_data(imagefolder)
    slices, names = extract_features(image_paths, feature_funcs, n_grid_rows, n_grid_cols)

    # k-means runs with scaling + PCA, hierarchical on the features as extracted
    kmeans_score = run_kmeans_pipeline(slices, names, n_clust=n_clusters, standardize=True, use_pca=True)
    hier_score = run_hierarchical_pipeline(slices, names, n_clust=n_clusters, standardize=False, use_pca=False)

    print('Results:')
    print('Score k-means:', kmeans_score)
    print('Score hierarchical:', hier_score)

    print('Visualizing...')
    for label_col in ('kmeans', 'hierarchical'):
        imgutils.show_large_heatmap(slices, label_col, image_paths,
                                    n_rows=n_image_rows, n_cols=n_image_cols, fig_size=(8, 6))
    

## 7. Try it out with different combinations of slices

### 4x4 - 3 clusters

In [None]:

# 4x4 slices per image, 3 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=4, n_grid_cols=4, feature_funcs=feature_funcs, n_clusters=3)

### 6x6, 3 clusters

In [None]:
# 6x6 slices per image, 3 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=6, n_grid_cols=6, feature_funcs=feature_funcs, n_clusters=3)

### 8x8, 3 clusters

In [None]:
# 8x8 slices per image, 3 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=8, n_grid_cols=8, feature_funcs=feature_funcs, n_clusters=3)

<hr>

## 8. Try it out with different number of clusters

### 2 clusters (4x4, 6x6, 8x8)

In [None]:
# 4x4 slices per image, 2 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=4, n_grid_cols=4, feature_funcs=feature_funcs, n_clusters=2)

In [None]:
# 6x6 slices per image, 2 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=6, n_grid_cols=6, feature_funcs=feature_funcs, n_clusters=2)

In [None]:
# 8x8 slices per image, 2 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=8, n_grid_cols=8, feature_funcs=feature_funcs, n_clusters=2)

### 4 clusters (4x4, 6x6, 8x8)

In [None]:
# 4x4 slices per image, 4 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=4, n_grid_cols=4, feature_funcs=feature_funcs, n_clusters=4)

In [None]:
# 6x6 slices per image, 4 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=6, n_grid_cols=6, feature_funcs=feature_funcs, n_clusters=4)

In [None]:
# 8x8 slices per image, 4 clusters
run_fullpipeline(imagefolder=datafolder, n_image_rows=n_tiles_y, n_image_cols=n_tiles_x,
                 n_grid_rows=8, n_grid_cols=8, feature_funcs=feature_funcs, n_clusters=4)

### Need grid search versions...