# 1. Data Exploration and feature extraction

## Loading Image Dataset

From the documentation we see that:
- image pixel values RGB in the range [0,1], following the common image input conventions. 
- image size fixed to 224 x 224 pixels

We use this information to create an ImageDataGenerator that loads the images from the provided folder structure. It also processes and returns the categorical labels for each image.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Input geometry and batching shared by all three splits.
image_size = (224, 224)
batch_size = 32

# Multiply every pixel by 1/255 (maps 8-bit values to [0, 1]; adjust if the
# source images use a different range).
datagen = ImageDataGenerator(rescale=1./255)

# Every split is read with the same options; shuffling is disabled so that
# repeated passes over a split return the images in the same order.
flow_options = dict(target_size=image_size, batch_size=batch_size, shuffle=False)

print('Training set:')
trainset = datagen.flow_from_directory('./train', **flow_options)

print('Validation set:')
validset = datagen.flow_from_directory('./valid', **flow_options)

print('Testing set:')
testset = datagen.flow_from_directory('./test', **flow_options)


In [None]:
# Category names, in the order of the keys of the validation generator's
# class_indices mapping.
class_labels = [*validset.class_indices]
print(f'The classificaion labels are: \n {class_labels}')

## Plotting of few images

Next we print 5 images from the training set for each category. For this we iterate over the image generator.

In [None]:
import matplotlib.pyplot as plt

def plot_samples(dataset, category):
    """Show up to five images of `category` from `dataset` in a single row.

    Parameters
    ----------
    dataset : batch iterator as returned by ``flow_from_directory``. It must
        provide ``reset()``, ``len()`` (batches per pass), ``next()`` yielding
        ``(images, one_hot_labels)`` and a ``class_indices`` mapping.
    category : name of the category to display; also used as figure title.

    If the dataset holds fewer than five images of the category, the
    remaining subplots are left blank.
    """
    n_samples = 5

    # Column of the one-hot label vector that marks `category`; taken from
    # the dataset itself so it always matches the labels this dataset yields.
    category_idx = dataset.class_indices[category]

    # Restart the iterator so the search begins at the first batch.
    dataset.reset()

    fig, axes = plt.subplots(1, n_samples, figsize=(15, 3))
    fig.suptitle(category)
    for ax in axes:
        ax.axis('off')

    shown = 0
    # Visit each batch at most once. The iterator restarts from the beginning
    # when exhausted, so an unbounded loop would never finish for a category
    # with fewer than `n_samples` images.
    for _ in range(len(dataset)):
        images, labels = next(dataset)

        for image in images[labels[:, category_idx] == 1]:
            axes[shown].imshow(image)
            shown += 1
            if shown == n_samples:
                return

# One row of sample images per category, taken from the training split.
for category in class_labels:
    plot_samples(dataset=trainset, category=category)


__Observation__

- We observe that, even within the same category, the images show a wide variety of objects. There are big differences in perspective, lighting, setting...

## Category distributions

In [None]:
import seaborn as sns
import numpy as np

# Integer class index of every image, per split.
class_dist = {'test': testset.classes, 'valid': validset.classes, 'train': trainset.classes}

# Derive the number of bins from the label list instead of hard-coding it.
n_classes = len(class_labels)

fig, axes = plt.subplots(1, 3, figsize=(16, 3))

# (key in class_dist, subplot title), in left-to-right display order.
panels = [('train', 'Training set'), ('valid', 'Validation set'), ('test', 'Testing set')]

for ax, (split, title) in zip(axes, panels):
    # One unit-wide bin per class index; 'probability' normalises each split
    # so the three panels are comparable regardless of split size.
    sns.histplot(class_dist[split], binwidth=1, binrange=(0, n_classes), stat='probability', ax=ax)
    ax.set_title(title)
    # Centre each tick under its bar (bin k spans [k, k+1)).
    ax.set_xticks(np.arange(n_classes) + 0.5)
    ax.set_xticklabels(class_labels, rotation=45, ha='right')
    # Same y-range on every panel for direct visual comparison.
    ax.set_ylim(0, 0.25)

__Observation__

- All three image datasets share identical class distribution.

## Color histogram of each category

Next we'll plot the accumulated color histogram for all images in the training dataset for each category.

In [None]:
# This function will compute the accumulated histograms for a given category

def compute_histograms(dataset, category):
    """Return normalised R, G and B intensity histograms for one category.

    All images of `category` found in one full pass over `dataset` are
    pooled, and a 256-bin histogram is accumulated per colour channel.

    Parameters
    ----------
    dataset : batch iterator as returned by ``flow_from_directory``. It must
        provide ``reset()``, ``len()`` (batches per pass), ``next()`` yielding
        ``(images, one_hot_labels)`` with channel-last images scaled to
        [0, 1], and a ``class_indices`` mapping.
    category : name of the category to accumulate.

    Returns
    -------
    (hist_r, hist_g, hist_b) : three arrays of length 256. Each sums to 1,
        or is all zeros if the dataset holds no image of the category.
    """
    # Column of the one-hot label vector that marks `category`; taken from
    # the dataset itself so it always matches the labels this dataset yields.
    category_idx = dataset.class_indices[category]

    # Row 0 = red, row 1 = green, row 2 = blue.
    channel_totals = np.zeros((3, 256))

    # Restart the iterator so the pass covers every image exactly once.
    dataset.reset()

    # Iterate over exactly one pass. Detecting the end by "batch shorter than
    # batch_size" never triggers when the dataset size is an exact multiple
    # of the batch size, and the iterator would then wrap around forever.
    for _ in range(len(dataset)):
        images, labels = next(dataset)

        selected = np.asarray(images)[np.asarray(labels)[:, category_idx] == 1]
        if selected.shape[0] == 0:
            continue

        for channel in range(3):
            # Scaling by 256 with range (0, 256) gives one bin per 8-bit
            # level; a value of exactly 1.0 lands on the upper edge, which
            # np.histogram counts in the last bin.
            counts, _edges = np.histogram(
                selected[..., channel].ravel() * 256, bins=256, range=(0, 256)
            )
            channel_totals[channel] += counts

    # Turn counts into relative frequencies; leave an empty histogram at
    # zero instead of producing NaN from 0/0.
    for channel_total in channel_totals:
        pixel_count = channel_total.sum()
        if pixel_count > 0:
            channel_total /= pixel_count

    return channel_totals[0], channel_totals[1], channel_totals[2]

In [None]:
# One panel per category (2 x 3 grid) showing the pooled colour histograms.
fig, axes = plt.subplots(2, 3, figsize=(18, 7))
axes = axes.ravel()
fig.suptitle('Accumulated Color Histogram for Training Dataset')

# (line colour, legend label) for the three channels, in the order in which
# compute_histograms returns them.
channel_styles = (('red', 'Red'), ('green', 'Green'), ('blue', 'Blue'))

for k, category in enumerate(class_labels):
    panel = axes[k]
    channel_hists = compute_histograms(trainset, category)

    for hist, (line_color, legend_label) in zip(channel_hists, channel_styles):
        panel.plot(hist, color=line_color, label=legend_label)

    # Title drawn inside the axes so it does not collide with the row above.
    panel.set_title(category, y=0.9)
    panel.set_xlabel('Pixel Intensity')
    panel.set_ylabel('Frequency')
    # Common y-range so panels can be compared directly.
    panel.set_ylim(0, 0.014)
    panel.legend()

__Observation__:

- Some structures/shapes in the histograms hint at differences between some groups of categories.
- However, it would be almost impossible to differentiate the 6 categories with just the color information.
- Additionally, here we observe the accumulated mean distribution of all images from a given category, but we don't have a view on the signal variance within each category. 

## Extraction of High-Level Features

We'll process our datasets through the MobileNet V2 feature-vector model pre-trained on ImageNet. This model extracts high-level features and can serve as a generic backbone for image classification. We start by importing the model.

In [None]:
import tensorflow_hub as hub

# TF Hub handle of the MobileNet V2 (ImageNet, 224x224) feature-vector model.
model_url = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/5"

# Load the model once; the resulting object is called on image batches later.
feature_extractor = hub.load(handle=model_url)

Next, we apply the model to our 3 datasets

In [None]:
# This function applies the model to all images in a dataset 
def extract_features(dataset):
    """Run the feature extractor over one full pass of `dataset`.

    Parameters
    ----------
    dataset : batch iterator as returned by ``flow_from_directory``. It must
        provide ``reset()``, ``len()`` (batches per pass) and ``next()``
        yielding ``(images, one_hot_labels)``.

    Returns
    -------
    (features, labels) : two tensors with one row per image, in iteration
        order. For an empty dataset, tensors of shape (0, 1280) and (0, 6)
        are returned.
    """
    feature_batches = []
    label_batches = []

    # Restart the iterator so extraction begins at the first image.
    dataset.reset()

    # Iterate over exactly one pass. Detecting the end by "batch shorter than
    # batch_size" never triggers when the dataset size is an exact multiple
    # of the batch size, and the iterator would then wrap around forever.
    for _ in range(len(dataset)):
        images_batch, labels_batch = next(dataset)
        feature_batches.append(feature_extractor(images_batch))
        label_batches.append(labels_batch)

    if not feature_batches:
        # Keep the empty-result shapes the notebook has always used.
        return tf.zeros((0, 1280)), tf.zeros((0, 6))

    # Concatenate once at the end; concatenating inside the loop re-copies
    # everything accumulated so far on every batch.
    features = tf.concat(feature_batches, axis=0)
    labels = tf.concat(label_batches, axis=0)

    return features, labels

# Extract features and labels for each of the three splits.
trainset_features, trainset_labels = extract_features(dataset=trainset)
validset_features, validset_labels = extract_features(dataset=validset)
testset_features, testset_labels = extract_features(dataset=testset)

# Report the resulting feature-matrix shapes, one line per split.
print(
    f'Size of training set features: {trainset_features.shape}',
    f'Size of validation set features: {validset_features.shape}',
    f'Size of testing set features: {testset_features.shape}',
    sep='\n',
)

And finally we save computed features to numpy .npz file.

In [None]:
# Collect everything to persist under the array names used inside the archive.
arrays_to_save = {
    'trainset_features': trainset_features.numpy(),
    'validset_features': validset_features.numpy(),
    'testset_features': testset_features.numpy(),
    'trainset_labels': trainset_labels.numpy(),
    'validset_labels': validset_labels.numpy(),
    'testset_labels': testset_labels.numpy(),
    'class_labels': class_labels,
}

# Write all arrays into a single .npz file.
np.savez('dataset_features.npz', **arrays_to_save)

## Heatmap of feature intensity for each category

We next plot the feature value for all images in each category

In [None]:
from util import decode_class

# 2 x 3 grid, flattened so each category index addresses its own panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# One heatmap per category: rows are training samples, columns are features.
for category_idx, category_name in enumerate(class_labels):
    panel = axes[category_idx]

    # Keep only the training rows whose decoded label is this category.
    in_category = decode_class(trainset_labels) == category_idx
    category_features = trainset_features[in_category]

    sns.heatmap(category_features, cbar=True, ax=panel)
    panel.set_title(f'{category_name}')

# Axis labels only on the outer panels: bottom row for x, left column for y.
for panel in (axes[3], axes[4], axes[5]):
    panel.set_xlabel('Feature Index')

for panel in (axes[0], axes[3]):
    panel.set_ylabel('Sample Index')

We next repeat the exercise but this time, we average over all images in each category.

In [None]:
# util.decode_class() is our helper that decodes one-hot encoded class vectors.

# Mean feature vector per category, keyed by category name.
accumulated_features = {}

for category_idx, category_name in enumerate(class_labels):
    # Rows of the training features that belong to this category ...
    in_category = decode_class(trainset_labels) == category_idx
    category_features = trainset_features[in_category]
    # ... averaged over the sample axis.
    accumulated_features[category_name] = np.mean(category_features, axis=0)

# Stack the per-category means in label order: one heatmap row per category.
heatmap_rows = [accumulated_features[category] for category in class_labels]

plt.figure(figsize=(14, 4))
sns.heatmap(heatmap_rows, cbar=True, yticklabels=class_labels)
plt.title('Accumulated Feature Intensity for Each Category')
plt.xlabel('Feature Index')
plt.ylabel('Category')


__Observation__:

- In the bottom plot, the intensities across the x-axis (features) indicate that some features contribute more strongly to the classification of certain categories (brighter bands in certain regions). This can be seen as a signature for each category which will be used by the models of next section to create a classifier for our objects/classes.

- In the top plots, the variability along the y-axis (samples) indicates how different images in the same category can activate slightly different high-level features.

- It's interesting to see that for the category 'other', which by definition can contain different types of objects, there seems to be an almost repeating vertical structure among the different samples.

## Top features of each category

Next we'll obtain the top valued high-level features for each category

In [None]:
# Maps category name -> list of (feature index, mean value) for its five
# strongest features, ordered from weakest to strongest of the five.
top_features = {}

for category_label, category_name in enumerate(class_labels):

    # Training rows whose decoded label is this category.
    category_mask = decode_class(trainset_labels) == category_label

    # Per-feature average over those rows.
    feature_means = np.mean(trainset_features[category_mask], axis=0)

    # argsort is ascending, so the last five positions hold the largest means.
    top_5_indices = np.argsort(feature_means)[-5:]

    top_features[category_name] = list(zip(top_5_indices, feature_means[top_5_indices]))

In [None]:
# Print each category's five strongest features, one per line.
for category in top_features:
    print(f"Top 5 features for '{category}':")

    # Entries are stored weakest-first; walk them backwards to list the
    # strongest feature at the top.
    for feature_idx, mean_value in top_features[category][::-1]:
        print(f" {feature_idx:5d}: Mean Value = {mean_value:.3f}")

    print()

In [None]:
# Compact view: one line per category with just its top feature indices.
for category in top_features:
    feature_tuple = top_features[category]
    print(f"{category:}: {[feature_idx for feature_idx, _ in feature_tuple]}")

In [None]:
import pandas as pd

# Pool the top feature indices of all categories into one flat list.
all_feature_indices = []
for ranked_features in top_features.values():
    all_feature_indices.extend(feature_idx for feature_idx, _ in ranked_features)

print("Repeated features:")
# Count how often each index occurs; the most frequent ones come first.
pd.DataFrame(all_feature_indices).value_counts().head(10)


__Observation:__

- The top 5 high-level features for each category are almost unique
- Only 3 features (183, 1022 and 580) are repeated among two categories
- Feature 183  shows in 'bike' and 'car'
- Feature 1022 shows in 'truck' and 'van'
- Feature 580  shows in 'truck' and 'other'