# Tree-based Methods Parameter Creation
Create dataframes of color differences from input images

In [1]:
import numpy as np
import pandas as pd
import os
import cv2
from time import time
import matplotlib.pyplot as plt

import scipy
from scipy.special import factorial

import skimage
from skimage.color import rgb2lab, deltaE_cie76

import collections 
from collections import Counter

import sklearn 
from sklearn.cluster import KMeans 
from sklearn.metrics import pairwise_distances_argmin
from sklearn.datasets import load_sample_image
from sklearn.utils import shuffle

# Run in google colab because images are in shared drive
#from google.colab import drive
#drive.mount('/content/drive')

In [8]:
# Basic helper functions

def column(matrix, i):
    """Return the entry at index ``i`` of every row of ``matrix`` as a list."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

def get_image(image_path):
    """Read the image at ``image_path`` and return it in RGB channel order.

    The file is loaded with ``cv2.imread`` and converted with
    ``cv2.COLOR_BGR2RGB`` before being returned.

    Raises:
        OSError: if OpenCV could not read the file. ``cv2.imread`` reports a
            missing or undecodable file by returning ``None`` rather than
            raising, which would otherwise surface as an opaque error inside
            ``cv2.cvtColor``.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise OSError("could not read image: {!r}".format(image_path))
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [9]:
def recreate_image(codebook, labels, w, h, kmeans):
    """Rebuild the quantized image by looking up each label in ``codebook``.

    ``labels`` is reshaped to ``(w, h)`` and used to index ``codebook``, so
    every position receives the codebook row of its label. ``kmeans`` is
    accepted but not used.
    """
    label_grid = labels.reshape((w, h))
    return codebook[label_grid]


def diff_find(colors):
    """Return the Euclidean distance between every unordered pair of rows.

    The values are read from the strict upper triangle of the pairwise
    distance matrix, so ``n`` rows yield ``n * (n - 1) / 2`` distances.
    """
    n_rows = colors.shape[0]

    # Insert an axis before the last one so the subtraction broadcasts every
    # row against every other row.
    deltas = colors[..., np.newaxis, :] - colors
    distances = np.sqrt(np.sum(np.square(deltas), axis=-1))

    upper_rows, upper_cols = np.triu_indices(n_rows, k=1)
    return distances[upper_rows, upper_cols]


def count_pixels(image, show_graph, num_colors):
    """Quantize ``image`` to ``num_colors`` colors and measure their spread.

    The image is quantized with ``color_quantization``, rebuilt with
    ``recreate_image`` and converted back to integer RGB values.

    Args:
        image: image array accepted by ``color_quantization``.
        show_graph: when truthy, plot the quantized image, a strip of its
            distinct colors and a bar chart of the per-color pixel counts.
        num_colors: number of colors to quantize to.

    Returns:
        A tuple ``(counts, colors, diff_arr)``:
        ``counts`` is the pixel count of each distinct quantized color;
        ``colors`` is the distinct colors, padded with copies of the first
        color up to ``num_colors`` rows when fewer distinct colors exist (so
        it can have more rows than ``counts``);
        ``diff_arr`` is ``diff_find(colors)``.
    """
    w, h, _, kmeans, labels = color_quantization(image, num_colors)
    image = recreate_image(kmeans.cluster_centers_, labels, w, h, kmeans)

    # Back to an integer RGB array. astype(int) truncates rather than rounds.
    imarray = (np.array(image) * 255).astype(int)

    unique_colors, counts = np.unique(
        imarray.reshape(-1, 3), axis=0, return_counts=True)

    # Pad with copies of the first color so diff_find always receives
    # num_colors rows and every image yields the same number of differences.
    colors = unique_colors
    missing = num_colors - unique_colors.shape[0]
    if missing > 0:
        padding = np.repeat(unique_colors[:1], missing, axis=0)
        colors = np.concatenate((unique_colors, padding), axis=0)

    diff_arr = diff_find(colors)

    if show_graph:
        # One hex string per distinct (unpadded) color so the bars line up
        # one-to-one with ``counts``.
        hex_colors = ['#%02x%02x%02x' % tuple(rgb.tolist())
                      for rgb in unique_colors]

        _, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 17))
        ax1.imshow(image)
        ax2.imshow(unique_colors[np.newaxis, :, :], aspect='auto')
        ax3.bar(hex_colors, counts, color=hex_colors)

    return counts, colors, diff_arr

In [7]:
## COLOR QUANTIZATION ##
## BETTER METHOD FOR DENOISING THE DATA ##

# Authors: Robert Layton <robertlayton@gmail.com>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#
# License: BSD 3 clause

def color_quantization(image, num_colors):
    """Quantize ``image`` to ``num_colors`` colors with k-means.

    K-means is fitted on a fixed-seed shuffled sample of at most 1000 pixels
    and then used to assign every pixel of the image to a cluster.

    Args:
        image: array-like with three dimensions, the last of size 3. Values
            are divided by 255, so 0-255 channel values are presumably
            expected.
        num_colors: number of clusters (colors) to fit.

    Returns:
        ``(w, h, d, kmeans, labels)`` where ``w, h, d`` is the image's shape
        (first axis, second axis, channels), ``kmeans`` is the fitted
        ``KMeans`` model whose centers live in the scaled (divided by 255)
        color space, and ``labels`` holds one cluster index per pixel in
        flattened order.

    Raises:
        ValueError: if ``image`` is not a three-dimensional array with
            exactly three channels.
    """
    # Convert to floats and divide by 255 for plt.imshow
    image = np.array(image, dtype=np.float64) / 255

    if image.ndim != 3 or image.shape[2] != 3:
        raise ValueError(
            "expected an image of shape (w, h, 3), got {}".format(image.shape))
    w, h, d = image.shape
    image_array = image.reshape(w * h, d)

    # Fit on a small fixed-seed sub-sample to keep the fit cheap, then label
    # every pixel of the full image.
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=num_colors, random_state=0).fit(image_array_sample)
    labels = kmeans.predict(image_array)

    return w, h, d, kmeans, labels

In [6]:
def color_diff_param(df_in, num_colors_in):
    """Add per-image histograms of pairwise color differences to a DataFrame.

    For every path in ``df_in['paths']`` the image is loaded with
    ``get_image`` and passed to ``count_pixels``; the returned differences
    are then counted per range.

    Ranges are half-open, ``[low, high)``, except the last one, which also
    includes its upper bound. This way a difference lying exactly on a shared
    edge (5, 10, 20, ...) is counted once instead of in both neighbours.

    Args:
        df_in: DataFrame with a ``'paths'`` column of image paths. It is not
            modified; a copy is returned.
        num_colors_in: number of colors each image is quantized to.

    Returns:
        A copy of ``df_in`` with one added column per range. Each column is
        named ``str([low high])``; the first ``low`` and the last ``high``
        are the minimum and maximum difference over all images, so those two
        names depend on the data.

    Raises:
        ValueError: if ``df_in`` has no rows (the data-dependent outer range
            bounds cannot be computed).
    """
    df = df_in.copy()
    num_colors = num_colors_in

    n_images = df.shape[0]
    if n_images == 0:
        raise ValueError("color_diff_param needs at least one image path")

    ## Find the color differences
    n_pairs = int(num_colors * (num_colors - 1) // 2)
    total_diff = np.zeros((n_images, n_pairs))
    for index, path in enumerate(df['paths']):
        _, _, diff = count_pixels(get_image(path), False, num_colors)
        if index % 100 == 0:
            print("still working ....", index)
        total_diff[index] = diff

    ## One (low, high) row per range; outer bounds come from the data ##
    inner_edges = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                   110, 120, 130, 140, 180]
    lows = [np.min(total_diff)] + inner_edges
    highs = inner_edges + [np.max(total_diff)]
    ranges = np.array(list(zip(lows, highs)))

    ## Add Ranges to DataFrame
    last = len(ranges) - 1
    for index, (low, high) in enumerate(ranges):
        above_low = total_diff >= low
        if index == last:
            in_range = above_low & (total_diff <= high)
        else:
            in_range = above_low & (total_diff < high)
        # One count per image (row of total_diff).
        df[str(ranges[index])] = np.count_nonzero(in_range, axis=1)

    return df

In [None]:
# Load the augmented dataset, then re-root every stored image path at the
# shared-drive image folder, keeping only each path's final component.
data = pd.read_pickle(
    os.path.join('/content/drive/Shared drives/2d/data',
                 'pad_augment_data_final.pkl'))
data['paths'] = data['paths'].apply(
    lambda stored_path:
    '/content/drive/Shared drives/2d/data/pad_augment_data_final/'
    + stored_path.rsplit('/', 1)[-1])
data

In [None]:
# Color counts to build features for; the loop below runs once per entry.
colors_array = np.array([256])
# Only rows 7000-8999 of the dataset are processed in this run.
data_mini = data[7000:9000]
## Find difference between every color in each color array associated with each picture ##
for x in colors_array:
    df_out = color_diff_param(data_mini, x)
    #df_out.to_pickle('/content/drive/Shareddrives/2d/Tree_features/aug_run_'+str(x)+'.pkl')
    print('Finished: ' + str(x))