# Step 2. Data Preparation

---

This step includes both data cleaning and data processing. The resulting data will be saved into the `../data/processed/` folder, using the same class-separated folder structure as the raw dataset. The steps are as follows:

1. Remove Transparency and Grayscale: Convert images in any other mode to `RGB`; grayscale (`L`) images are excluded from the processed dataset.
2. Standardize Sizes and Aspect Ratios: **512x512** pixels, 1:1 aspect ratio.
3. Remove Identical Images (Data Duplication). Method: Comparing `imagehash.average_hash()` of each image.
4. Normalize Pixel Values: Bring all pixel values to a common scale of [0, 1].
5. Quality Control: Manual inspection of a subset of images to ensure overall dataset quality.

In [None]:
# Imports and environmental setups

import os
import numpy as np
import matplotlib.pyplot as plt
import imagehash
from PIL import Image


# Custom utils functions
from utils import dataset
from utils.visualization import data_countplot
from utils.visualization import data_histplot
from utils.visualization import data_plot_16samples


# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook
plt.style.use('ggplot')
# Dataset root folders. The trailing slash matters: later cells build file
# paths by plain string formatting (e.g. f'{path_raw_dataset}{img_path}').
path_raw_dataset = '../data/raw/'
path_processed_dataset = '../data/processed/'

## Load Raw Dataset

Using our custom `dataset.load()` function.

In [None]:
# Index the raw dataset folder with the project's loader; the result is used
# below as a table with one row per image (columns such as 'ImgPath', 'Class').
df_raw = dataset.load('../data/raw')
# Print a summary of the loaded table
df_raw.info()

In [None]:
# One figure with a 2x3 grid of panels, each summarising one column of df_raw
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# --- Top row ---
data_countplot(
    df_raw, 'FileType',
    ax=ax[0, 0], title='File Types', annotate=True, palette='Set2',
)
data_histplot(
    df_raw, 'Ratio',
    ax=ax[0, 1], title='Image Ratio Density', bins=20, kde=True, stat='density',
)
data_countplot(
    df_raw, 'Mode',
    ax=ax[0, 2], title='Image Modes', annotate=True, palette='Accent',
)

# --- Bottom row ---
data_countplot(
    df_raw, 'Transparency',
    ax=ax[1, 0], title='Transparency', annotate=True, palette='YlGn',
)
data_countplot(
    df_raw, 'Animated',
    ax=ax[1, 1], title='Animated', annotate=True, palette='YlGn',
)
data_countplot(
    df_raw, 'Class',
    ax=ax[1, 2], horizontal=True, title='Flower Classes',
    xticklabels_rotation=45, annotate=True, palette='Set1',
)

# Figure-level title, then let matplotlib space the panels
fig.suptitle('Raw Dataset Statistics', fontsize=20, fontweight='bold', y=1.0)
fig.tight_layout()

In [None]:
# Visual preview of images from the raw dataset (presumably 16 samples, going
# by the helper's name; see utils.visualization for the exact behaviour)
data_plot_16samples(df_raw)

## 2.1. Data Cleaning and Processing

### 2.1.1. Duplicated Images Cleaning

We detect all duplicated images and store them in the list `skipped_duplicated_images` (keeping the first copy of each), so that they are skipped during cleaning and processing and are never saved into our processed dataset in `../data/processed/`:

In [None]:
# Group the raw image paths by their average hash. Files that end up under the
# same hash are treated as copies of one another.
# NOTE(review): average_hash is a perceptual hash, so visually near-identical
# images may presumably share a hash too, not only byte-identical files.
image_hashes = {}
for img_path in df_raw['ImgPath']:
    # Use the shared raw-dataset root instead of repeating the literal path
    with Image.open(f'{path_raw_dataset}{img_path}') as im:
        image_hash = imagehash.average_hash(im, hash_size=8)
    image_hashes.setdefault(image_hash, []).append(img_path)

# Keep only the hashes that are shared by more than one file
duplicated_image_hashes = {
    hash_val: paths
    for hash_val, paths in image_hashes.items()
    if len(paths) > 1
}

# The first path of every group is kept; all later ones are redundant copies
skipped_duplicated_images = [
    path
    for paths in duplicated_image_hashes.values()
    for path in paths[1:]
]

print(f'Number of skipped duplicated images: {len(skipped_duplicated_images)}')

### 2.1.2. Further Cleaning and Processing

- Remove Transparency and Grayscale
- Standardize Sizes and Aspect Ratios
- Normalize Pixel Values

In [None]:
# Side length in pixels passed as both width and height when resizing images
COMMON_DIM = 512  # Target common dimension (width and height for all processed images)

In [None]:
def remove_transparency(image: "Image.Image") -> "Image.Image":
    """Return ``image`` converted to RGB when its mode is alpha or palette based.

    Images whose mode is one of RGBA, RGBa, LA, La, PA or P are converted to
    RGBA first (unless they already are RGBA) and then to RGB. Images in any
    other mode are returned unchanged.

    Args:
        image: The image to process; only its ``mode`` attribute and
            ``convert`` method are used.

    Returns:
        The RGB conversion for the modes listed above, otherwise the very
        same object that was passed in.
    """
    # Inspect the argument itself, not whatever global happens to be named `im`
    if image.mode in ('RGBA', 'RGBa', 'LA', 'La', 'PA', 'P'):
        if image.mode != 'RGBA':
            image = image.convert('RGBA')
        image = image.convert('RGB')
    return image

In [None]:
def resize_crop(image: "Image.Image", width: int, height: int) -> "Image.Image":
    """Resize ``image`` to cover ``width`` x ``height`` and crop it to that size.

    The image is scaled uniformly (aspect ratio preserved) so that it is at
    least as large as the target box on both axes; the overflow on the longer
    axis is then cropped away symmetrically around the centre.

    Args:
        image: The source image; its ``width``, ``height``, ``resize`` and
            ``crop`` members are used.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        The result of the final ``crop`` call, covering exactly
        ``width`` x ``height`` pixels.
    """
    # Scale by the LARGER of the two ratios so the resized image covers the
    # whole target box. Scaling by the smaller one makes the image fit inside
    # the box instead, which leaves the crop box hanging outside the image.
    scale = max(width / image.width, height / image.height)

    # Clamp so that rounding can never leave a dimension short of the target
    target_width = max(width, round(image.width * scale))
    target_height = max(height, round(image.height * scale))
    image = image.resize((target_width, target_height))

    # Centered crop
    left = (target_width - width) // 2
    top = (target_height - height) // 2
    return image.crop((left, top, left + width, top + height))

In [None]:
def normalize_pixels(image: "Image.Image") -> "Image.Image":
    """Scale pixel values to [0, 1] and return them as an 8-bit image again.

    The pixel data is divided by 255 and then multiplied back by 255 so that
    the result can be handed to ``Image.fromarray`` as ``uint8``.

    Args:
        image: The source image.
            NOTE(review): assumes 8-bit channels (values 0-255); confirm
            against the modes that reach this function.

    Returns:
        A new image built from the re-scaled ``uint8`` array.
    """
    image_array = np.array(image)
    normalized_image_array = image_array / 255.0  # Normalize pixel values to the range [0, 1]
    # Round before casting: astype() truncates, so a product that lands a hair
    # below the intended integer would otherwise lose one intensity level.
    restored_array = np.rint(normalized_image_array * 255).astype(np.uint8)
    return Image.fromarray(restored_array)

In [None]:
# Build the lookup once so the duplicate test inside the loop is O(1) per row
duplicates_to_skip = set(skipped_duplicated_images)

for _, row in df_raw.iterrows():
    # The class folder is created up front, even if this image ends up skipped
    os.makedirs(f'{path_processed_dataset}{row["Class"]}', exist_ok=True)

    img_path = row['ImgPath']
    if img_path in duplicates_to_skip:
        continue

    # splitext drops only the final extension, so other dots in the path are
    # kept and a name without any extension still gets a proper '.jpg' name
    new_img_path = os.path.splitext(img_path)[0] + '.jpg'
    with Image.open(f'{path_raw_dataset}{img_path}') as im:
        if im.mode == 'L':
            continue  # Ignoring grayscale images

        im = remove_transparency(im)
        if im.mode != 'RGB':
            # Step 1 of this notebook: every remaining mode is converted to RGB
            im = im.convert('RGB')
        im = resize_crop(im, COMMON_DIM, COMMON_DIM)

        im.save(f'{path_processed_dataset}{new_img_path}')

## 2.2. Quality Control

In [None]:
# Re-index the processed dataset folder with the same loader used for the raw
# data, so the cleaning results can be inspected below
df_processed = dataset.load('../data/processed')
# Print a summary of the loaded table
df_processed.info()

In [None]:
# One figure with a 2x3 grid of panels, each summarising one column of df_processed
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# --- Top row ---
data_countplot(
    df_processed, 'FileType',
    ax=ax[0, 0], title='File Types', annotate=True, palette='Set2',
)
data_countplot(
    df_processed, 'Ratio',
    ax=ax[0, 1], title='Image Ratio', annotate=True,
)
data_countplot(
    df_processed, 'Mode',
    ax=ax[0, 2], title='Image Modes', annotate=True, palette='Accent',
)

# --- Bottom row ---
data_countplot(
    df_processed, 'Transparency',
    ax=ax[1, 0], title='Transparency', annotate=True, palette='YlGn',
)
data_countplot(
    df_processed, 'Animated',
    ax=ax[1, 1], title='Animated', annotate=True, palette='YlGn',
)
data_countplot(
    df_processed, 'Class',
    ax=ax[1, 2], horizontal=True, title='Flower Classes',
    xticklabels_rotation=45, annotate=True, palette='Set1',
)

# Figure-level title, then let matplotlib space the panels
fig.suptitle('Processed Dataset Statistics', fontsize=20, fontweight='bold', y=1.0)
fig.tight_layout()

# Persist the figure to disk
fig.savefig('images/DataPrep-ProcessedDataset-Stats.png')

In [None]:
# Visual preview of images from the processed dataset; the `to_file` argument
# presumably also writes the figure to the given path (see utils.visualization)
data_plot_16samples(df_processed, to_file='images/DataPrep-ProcessedDataset-16Samples.png')