# Feature Extraction: HOG

## Setup

In [None]:
#import libraries
import webdataset as wds
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

from skimage.feature import hog
from skimage import data, exposure
from sklearn.manifold import TSNE
import cv2 as cv

In [48]:
#paths: everything is resolved relative to the notebook's working directory
root = '.'
#raw webdataset shards are read from here
data_folder = root + '/data'
#extracted feature arrays are written here
feature_data_folder = root + '/feature_data'

## Load Data

In [None]:
def load_data(path_to_data):
    """
    load 2D images and their labels from a webdataset shard

    args:
    path_to_data (str): location of the shard, handed to wds.WebDataset

    returns: 
    images (array): array of images as np arrays
    labels (array): array of labels (the 'label' entry of each sample's json)
    """
    #create lists to store images and labels
    images = []
    labels = []
    #load webdataset; named `dataset` so the skimage `data` module imported
    #at file level is not shadowed inside this function
    dataset = (wds.WebDataset(f'{path_to_data}', shardshuffle=True)
               .decode("pil").to_tuple("jpg", "json"))
    try:
        #each sample is a (decoded jpg, parsed json) pair
        for image, metadata in dataset:
            images.append(np.array(image))
            labels.append(metadata['label'])
    finally:
        #close the dataset even when reading or decoding a sample fails
        dataset.close()

    return np.array(images), np.array(labels)

In [None]:
#shard locations for the three splits
train_shard = f'file:{data_folder}/train-000000.tar'
val_shard = f'file:{data_folder}/val-000000.tar'
test_shard = f'file:{data_folder}/test-000000.tar'

#load images and labels of each split into memory
train_images, train_labels = load_data(train_shard)
val_images, val_labels = load_data(val_shard)
test_images, test_labels = load_data(test_shard)

In [None]:
#sanity check: print the first training label and display the matching image
print(train_labels[0])
plt.imshow(train_images[0])

In [None]:
#report the array shapes of the loaded training images and labels
print("Shape of images:", train_images.shape)
print("Shape of labels:", train_labels.shape)

## Quick EDA

In [None]:
#count how many training samples carry each label
unique, counts = np.unique(train_labels, return_counts=True)
class_dict = dict(zip(unique, counts))

#bar chart: one bar per class, ticks labelled with the class values
bar_positions = range(len(class_dict))
bar_heights = list(class_dict.values())
tick_labels = list(class_dict.keys())
plt.bar(bar_positions, bar_heights, align='center')
plt.xticks(bar_positions, tick_labels, rotation='vertical')
plt.title('Mars Images Class Distribution')
plt.xlabel('Classes')
plt.ylabel('Counts')
plt.show()

In [None]:
#show example of 1 of each class
#class names, looked up below as label_names[class value - 1]
label_names = ['crater', 'dark dune', 'slope streak', 'bright dune', 'impact ejecta', 'swiss cheese', 'spider']
unique_values = np.unique(train_labels)
#map every class value to the index of its first occurrence in the training labels
result = {
    class_value: np.int_(np.where(train_labels == class_value)[0][:1])[0]
    for class_value in unique_values
}
print(result)

#plotting first image for each class, one subplot per class
fig, axs = plt.subplots(nrows=1, ncols=len(result), figsize=(20,3))
for ax, (key, value) in zip(axs, result.items()):
    image = train_images[value]
    label = label_names[key-1]
    ax.imshow(image)
    ax.set_title(label)
    ax.axis('off')
fig.suptitle('One Image For Each Class')
plt.show()

## Feature Extraction: HOG

In [None]:
#start with 1 example for each class for visualization
fig, axs = plt.subplots(nrows=1, ncols=len(result), figsize=(20,3))
for ax, (key, value) in zip(axs, result.items()):
    label = label_names[key-1]
    #testing gaussian blur (5x5 kernel), then min-max scaling to [0, 1] as float32
    blurred = cv.GaussianBlur(train_images[value], (5,5), 0)
    image = cv.normalize(blurred, None, 0, 1, cv.NORM_MINMAX, cv.CV_32F)
    #visualize=True also returns the HOG rendering that is plotted below
    fd, hog_image = hog(
        image,
        orientations=10,
        pixels_per_cell=(12,12),
        cells_per_block=(1,1),
        visualize=True,
        channel_axis=-1,
    )
    ax.imshow(hog_image)
    ax.set_title(label)
    ax.axis('off')
fig.suptitle('One HOG Image For Each Class')
plt.show()
    

In [None]:
#apply to all
def extract_hog(img_dataset):
    """ 
    Apply HOG feature extraction to every image of a dataset

    args: img_dataset (training / validation / testing dataset); each image is
          passed to hog with channel_axis=-1, i.e. channels on the last axis

    returns: img_dataset_hog (array with one flattened HOG image per input image)
    """
    img_dataset_hog = []
    for img in img_dataset:
        #must be the loop variable `img`: passing a notebook-level `image` here
        #would make every row identical, computed from one leftover image
        _, hog_image = hog(img, orientations=8, pixels_per_cell=(16, 16),
                           cells_per_block=(1, 1), visualize=True, channel_axis=-1)

        #NOTE(review): the flattened HOG visualization is stored, not the
        #descriptor vector that hog returns first - confirm this is intended
        img_dataset_hog.append(hog_image.ravel())

    return np.array(img_dataset_hog)

In [None]:
#run the HOG extraction over the training images
train_images_hog = extract_hog(train_images)

In [None]:
#inspect the shape of the training HOG array
train_images_hog.shape

In [None]:
#run the same HOG extraction over the validation and test images
val_images_hog = extract_hog(val_images)
test_images_hog = extract_hog(test_images)

In [49]:
#save files
import os

#np.save does not create missing directories, so make sure the target exists
os.makedirs(feature_data_folder, exist_ok=True)
#np.save appends the .npy extension to each of these names
np.save(f'{feature_data_folder}/train_images_hog', train_images_hog)
np.save(f'{feature_data_folder}/val_images_hog', val_images_hog)
np.save(f'{feature_data_folder}/test_images_hog', test_images_hog)

## PCA

In [None]:
#fit a PCA built with default arguments on the training HOG array
pca = PCA()
pca.fit(train_images_hog)

In [None]:
#running total of the variance ratio explained by the fitted components
explained_variance = np.cumsum(pca.explained_variance_ratio_)
#x axis: 1-based number of components included in each running total
component_counts = range(1, len(explained_variance) + 1)

plt.figure(figsize=(10, 5))
plt.plot(component_counts, explained_variance, marker='o', linestyle='--')
plt.title('Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

## t-SNE

In [None]:
#tsne = TSNE(n_components = 2, random_state = 0)
#X = tsne.fit_transform(train_images_hog)
#y = [label_names[label_num] for label_num in train_labels]