In [95]:

# Load the autoreload extension, reloading it if it is already loaded.
%reload_ext autoreload
# Mode 2: reload all modules before executing each cell, so edits to local
# packages are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline, below the cell that produced them.
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd

from utils.dataset import get_train_test_data, get_data_masks

# Read the featuremaps clusters data.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted run of the pipeline.
featuremaps_raw = pd.read_pickle('../logs/feature_combinations_clusters')

# Collapse the two timing columns into a single total `time` column.
featuremaps_timed = featuremaps_raw.assign(
    time=featuremaps_raw['map_time'] + featuremaps_raw['features_extraction']
).drop(columns=['map_time', 'features_extraction'])

# Filter for the desired approach.
# Optional extra condition (disabled): featuremaps_timed['map_size'] == '10x10'
# The explicit .copy() makes the filtered frame independent of the source frame,
# so the column assignment below does not raise SettingWithCopyWarning.
featuremaps_df = featuremaps_timed[featuremaps_timed['mode'] == 'original'].copy()

# Rewrite the approach label as '<approach>(<map_size>)_<mode>'.
# Vectorised concatenation (instead of a row-wise apply) also works when the
# filter above leaves no rows.
featuremaps_df['approach'] = (
    featuremaps_df['approach'].astype(str)
    + '(' + featuremaps_df['map_size'].astype(str) + ')_'
    + featuremaps_df['mode'].astype(str)
)

# Read the heatmaps data.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted run of the pipeline.
heatmaps_raw = pd.read_pickle('../logs/heatmaps_data')

heatmaps_df = (
    heatmaps_raw
    # Drop the rows with a null silhouette score
    .dropna(subset=['silhouette'])
    .reset_index(drop=True)
    # Keep the columns of interest
    .loc[:, ['clustering_mode', 'explainer', 'clustering_technique', 'clusters', 'time_clustering', 'time_contributions']]
    # Use the same column name as the featuremaps frame
    .rename(columns={'explainer': 'approach'})
    .assign(
        # Merge technique and mode into one label: '<technique>(<mode>)'.
        # Vectorised concatenation also works on an empty frame, where a
        # row-wise apply would return a DataFrame instead of a Series.
        clustering_mode=lambda df: (
            df['clustering_technique'].astype(str) + '(' + df['clustering_mode'].astype(str) + ')'
        ),
        # Total time = clustering time + contributions time
        time=lambda df: df['time_clustering'] + df['time_contributions'],
    )
    .drop(columns=['clustering_technique', 'time_clustering', 'time_contributions'])
)

# Stack the featuremaps rows on top of the heatmaps rows under a fresh 0..n-1 index
complete_df = pd.concat([featuremaps_df, heatmaps_df], ignore_index=True)

# Every entry of the `clusters` column is a collection of clusters
cluster_groups = complete_df['clusters']
# How many clusters each configuration produced
complete_df['num_clusters'] = cluster_groups.map(len)
# How many elements each of those clusters contains
complete_df['clusters_sizes'] = cluster_groups.map(
    lambda groups: [len(group) for group in groups]
)

# Load the dataset and the saved model predictions
(train_data, train_labels), (test_data, test_labels) = get_train_test_data(rgb=True)
predictions = np.loadtxt('../in/predictions.csv')
# NOTE(review): presumably `mask_miss` flags the misclassified test samples and
# `mask_label` selects the samples of class 5 - confirm against utils.dataset.
mask_miss, mask_label = get_data_masks(test_labels, predictions, label=5)
# Restrict the misclassification mask to the entries selected by `mask_label`
mask_miss_label = mask_miss[mask_label]
# Positions (within the selected entries) whose mask value equals True;
# np.argwhere returns them as an (n, 1) array.
misclassified_idxs = np.argwhere(mask_miss_label == True)
# Set of the same positions, for constant-time membership tests: `x in ndarray`
# compares x against the whole array on every call.
misclassified_set = set(misclassified_idxs.ravel().tolist())

# Find the fraction of misclassified data in each cluster
complete_df['frac_misses'] = complete_df['clusters'].apply(
    lambda clusters: [
        sum(entry in misclassified_set for entry in cluster) / len(cluster)
        for cluster in clusters
    ]
)
# Find the fraction of clusters containing both correct and incorrect classifications
# (i.e. clusters whose fraction of misses is strictly between 0 and 1)
complete_df['frac_mixed'] = complete_df['frac_misses'].apply(
    lambda misses: sum(0 < frac < 1 for frac in misses) / len(misses)
)

2022-05-03 17:36:54.182722: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [150]:

from utils.general import get_balanced_samples
from utils.cluster.postprocessing import get_misses_count, get_labels_purity

# Get the clusters of the first row recorded for the selected approach.
# A boolean mask always yields a Series of `clusters` values, whether one or
# several rows match; `.set_index(...).loc[label]` returns a single row (Series)
# when only one row matches, which breaks the subsequent `.iloc[0]['clusters']`.
selected_clusters = complete_df.loc[complete_df['approach'] == 'GradCAM', 'clusters'].iloc[0]
# Store one cluster per slot of a 1-D object array. Filling the slots one by one
# keeps the array 1-D even when all clusters happen to have the same length
# (np.array would build a 2-D array in that case).
clusters = np.empty(len(selected_clusters), dtype=object)
for position, cluster in enumerate(selected_clusters):
    clusters[position] = cluster
# Find the count of misclassified entries in each cluster
counts_misses = np.vectorize(lambda cluster: get_misses_count(cluster, predictions=predictions))(clusters)
# Find the purity and impurity of each cluster
purities = np.vectorize(lambda cluster: get_labels_purity(cluster, predictions=predictions))(clusters)
# Weight the purity and impurity based on the count of misclassified elements in log scale.
# The weight is 0 where the count is 0 and log(count) elsewhere. It is computed on
# a float array: np.vectorize without `otypes` takes the output dtype from the
# first element, so a leading zero count would truncate every later log to an int.
counts_misses_float = np.asarray(counts_misses, dtype=float)
counts_misses_log = np.zeros_like(counts_misses_float)
np.log(counts_misses_float, out=counts_misses_log, where=counts_misses_float != 0)

# Draw the samples of clusters, balanced by purity and weighted by the log-counts
pure_sample, impure_sample = get_balanced_samples(clusters, sample_size=5, balanced_by=purities, weights=counts_misses_log)

pure_sample, impure_sample

(array([list([513, 516, 133, 138, 525, 526, 528, 657, 658, 148, 532, 790, 23, 538, 669, 160, 681, 433, 51, 53, 54, 438, 568, 61, 445, 701, 834, 207, 209, 86, 598, 729, 608, 482, 231, 487, 620, 114, 115, 761, 120, 377, 378, 764]),
        list([769, 130, 262, 519, 137, 394, 395, 396, 523, 19, 147, 24, 156, 540, 286, 541, 542, 162, 164, 37, 548, 167, 168, 41, 297, 550, 551, 46, 175, 305, 562, 691, 188, 191, 66, 68, 198, 455, 78, 206, 81, 212, 217, 103, 362, 364, 237, 118, 247, 505, 125]),
        list([772, 263, 520, 266, 524, 13, 270, 527, 531, 404, 534, 415, 40, 169, 302, 690, 311, 312, 567, 314, 697, 67, 200, 210, 600, 480, 739, 100, 488, 621, 110, 238, 239, 368, 495, 622, 372, 373, 374, 119, 633, 634, 251, 763, 127])],
       dtype=object),
 array([list([384, 132, 264, 648, 522, 271, 401, 402, 403, 530, 22, 407, 535, 539, 413, 417, 807, 43, 439, 184, 186, 319, 577, 196, 709, 73, 844, 335, 213, 344, 89, 218, 728, 223, 864, 610, 358, 870, 233, 873, 236, 880, 116, 117, 885, 383]),
     