In [None]:
import os

# Limit OpenMP to one thread. This has to happen BEFORE numpy / scikit-image /
# scikit-learn are imported: OpenMP-backed libraries typically read this
# variable once, when they are loaded, so setting it after the imports below
# would have no effect on the already-initialised runtime.
os.environ["OMP_NUM_THREADS"] = '1'

import random

import matplotlib.pyplot as plt
import numpy as np
from skimage import io, color, exposure
from sklearn.cluster import KMeans

# Channel indices into the last axis of an image array. The images in this
# notebook are loaded with skimage.io.imread, which stores colour images in
# R, G, B order (not OpenCV's B, G, R), so red is plane 0 and blue is plane 2.
RED = 0
GRN = 1
BLU = 2
# Any channel value other than RED / GRN / BLU selects the grayscale histogram.
GRY = 3
# Default folder that is searched for the image dataset.
PUBLIC_DATASET_FOLDER = 'dataset2'

In [None]:
def get_image_filenames(folder=PUBLIC_DATASET_FOLDER, short=False):
    '''
      Parameter:
        folder specifies a folder name containing the image dataset, a string.
        short specifies whether the returned filenames are full-path (by
            default) or short filenames, a boolean.
      Returns:
        img_filenames specifies a list of image filenames, a list. Names are
            returned with the same letter case they have on disk, in the
            order os.walk yields them.
      Does:
        Collect all the JPEG image filenames (extensions .jpg / .jpeg,
            matched case-insensitively) under the specified folder,
            including its sub-folders.
    '''

    img_filenames = []
    for root, _, files in os.walk(folder):
        for name in files:
            # Lower-case only for the extension test. The stored name must
            # keep its original case, otherwise the returned path does not
            # exist on a case-sensitive filesystem (e.g. 'IMG_01.JPG').
            if name.lower().endswith(('.jpg', '.jpeg')):
                img_filenames.append(name if short else os.path.join(root, name))
    return img_filenames

In [None]:
def get_image_histograms(img_filenames, channel=RED, normalize=True):
    '''
      Parameter:
        img_filenames specifies a list of image filenames, a list.
        channel specifies which color channel or gray scale is applied to form
            the histogram, an integer. RED, GRN and BLU select that plane of
            the image; any other value (e.g. GRY) selects gray scale.
        normalize specifies whether the frequency of the returned histogram is
            normalized to [0, 1] (by default) or not, a boolean.
      Returns:
        hists stores histograms in a dictionary, in which each key and value
            pair corresponds to a pair of an image filename and its histogram,
            a dict.
      Does:
        Read every image with skimage.io.imread and collect its histogram
            according to the specific color channel or gray scale. Color
            channel histograms are computed with source_range='dtype'; the
            gray scale histogram is computed on the output of
            skimage.color.rgb2gray with the default histogram settings.
    '''
    # Dictionary to store histograms
    hists = {}
    for img_filename in img_filenames:
        # Read image
        img = io.imread(img_filename)

        # Calculate histogram
        if channel in (RED, GRN, BLU):
            # A 2-D array is a single-channel image: it has no third axis to
            # index, so its only plane stands in for the requested channel.
            plane = img if img.ndim == 2 else img[:, :, channel]
            hist = exposure.histogram(plane, normalize=normalize, source_range='dtype')[0]
        else:
            # Convert to grayscale
            img_gray = color.rgb2gray(img)
            hist = exposure.histogram(img_gray, normalize=normalize)[0]

        # Save histogram
        hists[img_filename] = hist
    return hists

In [None]:
def get_sim_matrix(img_filenames, features, normalize=True, round=True):
    '''
      Parameter:
        img_filenames specifies a list of image filenames, a list. Its order
            defines the row / column order of the returned matrix, and every
            entry must be a key of features.
        features specifies a dictionary of image features, where each key
            and value pair corresponds to a pair of an image filename and
            its feature (a numpy array), a dict. It may contain more keys
            than img_filenames; the extra ones are ignored.
        normalize specifies whether the similarity matrix is min-max
            normalized to [0, 1] (by default) or not, a boolean.
        round specifies whether the entries in the similarity matrix are
            rounded to 4 decimals (by default) or not, a boolean.
      Returns:
        sim_matrix stores a similarity matrix, in which each entry at position
            (i, j) is the Euclidean distance between the feature of image i
            and the feature of image j, a numpy array of shape
            (len(img_filenames), len(img_filenames)).
      Does:
        Compute the pairwise Euclidean distance between image features. A
            larger entry means the two images are LESS similar. The matrix is
            symmetric with a zero diagonal, so only the upper triangle is
            computed and then mirrored. A pair whose features differ in size
            cannot be compared and its entry is left at 0.
    '''
    # Size the matrix from the filename list: it is the list, not the
    # dictionary, that is indexed below and that defines the ordering.
    num_images = len(img_filenames)
    sim_matrix = np.zeros((num_images, num_images))

    for i in range(num_images):
        feature_i = features[img_filenames[i]]
        for j in range(i + 1, num_images):
            feature_j = features[img_filenames[j]]

            # Measure histogram distance based on Euclidean distance
            if feature_j.size == feature_i.size:
                dist = np.linalg.norm(feature_i - feature_j)
                sim_matrix[i, j] = dist
                sim_matrix[j, i] = dist

    # np.min / np.max raise on an empty array, so skip normalization then.
    if normalize and sim_matrix.size:
        lowest = np.min(sim_matrix)
        spread = np.max(sim_matrix) - lowest
        # When every entry is equal (single image, or all features identical)
        # the spread is 0 and dividing would fill the matrix with NaN.
        if spread > 0:
            sim_matrix = (sim_matrix - lowest) / spread
    if round:
        sim_matrix = np.round(sim_matrix, 4)

    return sim_matrix

In [None]:
def vis_sim_matrix(sim_matrix, keys_short, title='Image Similarity Matrix', filename=None):
    '''
      Parameter:
        sim_matrix specifies a similarity matrix, a numpy array.
        keys_short specifies a list of image filenames in short format, a
            list. It labels both axes, in the same order as the matrix rows.
        title specifies the title of the plot of the similarity matrix, a
            string. By default, title='Image Similarity Matrix'.
        filename specifies the filename to store the plot as an image, a
            string. By default, filename=None indicates no need to store the
            plot as an image.
      Returns:
        None.
      Does:
        Render the similarity matrix as a pseudocolor image with imshow
            (default colormap) on a 50 x 50 inch figure. Every cell (i, j) is
            annotated with its matrix value in white text, both axes are
            labelled with the short filenames, and the horizontal labels are
            rotated by 45 degrees to keep the plot compact. The figure is
            saved to filename when one is given, and then shown.
    '''
    num_keys = len(keys_short)

    # Draw the matrix itself
    fig, ax = plt.subplots(figsize=(50, 50))
    ax.imshow(sim_matrix)

    # One tick per image on each axis, labelled with the short filename
    ax.set_xticks(np.arange(num_keys), labels=keys_short)
    ax.set_yticks(np.arange(num_keys), labels=keys_short)

    # Tilt the horizontal labels so neighbouring names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write each value into its cell; imshow puts the column on the x axis
    for row, col in np.ndindex(num_keys, num_keys):
        ax.text(col, row, sim_matrix[row, col], ha="center", va="center", color="w")

    ax.set_title(title)
    fig.tight_layout()
    if filename is not None:
        plt.savefig(filename)
    plt.show()

In [None]:
def concatenate_histograms(hists_red, hists_grn, hists_blu):
    '''
      Parameter:
        hists_red, hists_grn, hists_blu specify dictionaries mapping an image
            filename to its histogram for the red, green, and blue channels
            respectively, dict.
      Returns:
        A dictionary with the same keys (and key order) as hists_red, in which
            each value is that image's red, green and blue histograms joined
            end to end in that order, a dict.
      Does:
        Concatenate the per-channel histograms of each image into a single
            feature vector. The keys are taken from hists_red; each of them
            must also be present in hists_grn and hists_blu.
    '''
    return {
        name: np.concatenate((red_hist, hists_grn[name], hists_blu[name]))
        for name, red_hist in hists_red.items()
    }


In [None]:
def sort_filenames_by_cluster(filenames, predictions):
    '''
      Parameter:
        filenames specifies a list of image filenames, a list.
        predictions specifies the cluster label of each filename, given in
            the same order as filenames, a list.
      Returns:
        The image filenames ordered by ascending cluster label, a list.
            Filenames sharing a label keep their original relative order.
      Does:
        Pair every filename with its cluster label, sort the pairs by label
            only, and return the filenames of the sorted pairs.
    '''
    # sorted() is stable, so ties on the label preserve the input order
    labelled = sorted(zip(filenames, predictions), key=lambda pair: pair[1])
    return [name for name, _ in labelled]

In [None]:
# Walk the dataset once and sort the result, so the starting order does not
# depend on the order in which the filesystem happens to list directories.
filenames = sorted(get_image_filenames(PUBLIC_DATASET_FOLDER))

# Derive the short names from the full paths instead of walking the folder a
# second time: the two lists are then aligned entry-for-entry by construction.
filenames_short = [os.path.basename(path) for path in filenames]

In [None]:
# Display the short filenames that were collected, as a quick sanity check.
filenames_short

In [None]:
# Pair each full path with its short name so the two stay aligned while the
# order is randomized.
paired_filenames = list(zip(filenames, filenames_short))

# Shuffle with a dedicated, seeded generator: the same permutation is applied
# on every run, and the global `random` state is left untouched.
random.Random(42).shuffle(paired_filenames)

In [None]:
# Unzip the pairs back into two lists
# Split the shuffled (full path, short name) pairs back into two parallel
# tuples; zip(*...) transposes the list of pairs.
filenames, filenames_short = tuple(zip(*paired_filenames))

In [None]:
# Display the full-path filenames in their shuffled order.
filenames

In [None]:
# Convert the tuples back to lists
# zip() produced tuples; turn both sequences back into lists.
filenames, filenames_short = list(filenames), list(filenames_short)

In [None]:
# One histogram dictionary per color channel, computed in red, green, blue
# order over the shuffled filenames.
hists_red, hists_grn, hists_blu = (
    get_image_histograms(filenames, channel=selected)
    for selected in (RED, GRN, BLU)
)

In [None]:
# Print the shape of every red-channel histogram to check they all agree.
for red_hist in hists_red.values():
    print(red_hist.shape)

In [None]:
# Join the three channel histograms of every image into one feature vector.
concatenated_hists = concatenate_histograms(hists_red, hists_grn, hists_blu)

# Build the feature matrix row by row in `filenames` order. The cluster labels
# computed from this matrix are later paired with `filenames` by position, so
# row i must belong to filenames[i]; indexing by name guarantees that even if
# the dictionaries were (re)built in a different order.
stacked_hists = np.vstack([concatenated_hists[name] for name in filenames])
stacked_hists.shape

In [None]:
# Pairwise distance matrix for the shuffled (unclustered) order, plotted and
# saved as the "before clustering" reference image.
sim_matrix = get_sim_matrix(img_filenames=filenames, features=concatenated_hists)
vis_sim_matrix(sim_matrix, keys_short=filenames_short, filename='sim_matrix_orig.png')

In [None]:
# Perform KMeans clustering
kmeans = KMeans(n_clusters=15, random_state=42).fit(stacked_hists)

# Get the prediction results
predictions = kmeans.predict(stacked_hists)
print(predictions)

In [None]:
# Extract the sorted filenames
sorted_filenames = sort_filenames_by_cluster(filenames, predictions)
sorted_filenames_short = sort_filenames_by_cluster(filenames_short, predictions)

# Print sorted filenames
for filename in sorted_filenames:
    print(filename)

In [None]:
# A histogram depends only on the image, not on the order in which images are
# visited. Re-key the histograms already computed into cluster order instead
# of reading and decoding every image from disk three more times.
hists_red = {name: hists_red[name] for name in sorted_filenames}
hists_grn = {name: hists_grn[name] for name in sorted_filenames}
hists_blu = {name: hists_blu[name] for name in sorted_filenames}

concatenated_hists = concatenate_histograms(hists_red, hists_grn, hists_blu)

In [None]:
# Pairwise distance matrix with the images arranged by cluster, plotted and
# saved as the "after clustering" image.
sim_matrix = get_sim_matrix(img_filenames=sorted_filenames, features=concatenated_hists)
vis_sim_matrix(sim_matrix, keys_short=sorted_filenames_short, filename='sim_matrix_kmeans.png')