In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

## authorship information
__author__ = "Alex Tavkhelidze"<br>
__credits__ = ["Bernd Brinkmann", "Luigi Menale", "Arif Haidari", "Romain Lesieur"]<br>
__email__ = <a href="mailto:&#106;&#101;&#111;&#115;&#064;&#109;&#097;&#105;&#108;&#046;&#099;&#111;&#109;">jeos [at] mail [dot] com</a><br>
__status__ = "Development"<br>
__project__ = "Plant Recognition"<br>
__scope__ = "DataScientest's Bootcamp in Data Science"

#### Materials used in addition to my own contribution:
the prototype code file 'step0_analysis.ipynb', provided by teammate Luigi Menale<br>
the hachoir developer documentation: https://hachoir.readthedocs.io/en/latest/developer.html

In [None]:
"""Creates a dataset that stores the selected metadata of each image, one row per image:
   extracts the selected metadata from every image
   populates a dictionary with the extracted metadata
   exports the populated dictionary as a .csv file
"""

In [1]:
from sys import argv, stderr, exit
import os
from typing import Tuple
from collections import defaultdict

from hachoir.parser import createParser
from hachoir.metadata import extractMetadata
import imageio.v3 as iio
import pandas as pd
import numpy as np

In [2]:
# descriptors of interest: only metadata lines whose label appears in this list
# are kept when an image's metadata is scanned
col_oi = [
    "Image width",
    "Image height",
    "Bits/pixel",
    "Pixel format",
    "MIME type",
]

In [3]:
# helper function
def image_metadata(filepath: str) -> Tuple[int, int, int, str, str]:
    """Extracts the selected metadata fields of a single image file.

    The file is parsed with hachoir and the plaintext export of its metadata
    is scanned for the descriptors listed in the module-level ``col_oi``.
    When a descriptor occurs more than once, its first occurrence is used.

    Args:
        filepath (str): the file path of the target image

    Returns:
        Tuple[int, int, int, str, str]:
            width in pixels as int,
            height in pixels as int,
            bits/pixel as int,
            pixel format as string (RGB, YCbCr etc.),
            MIME type as string (png, jpeg etc.).

    Raises:
        SystemExit: if the file cannot be parsed or no metadata can be extracted
        ValueError: if a required descriptor is missing from the metadata
            or a numeric descriptor cannot be converted to int
    """
    # No sys.argv validation here: the file to inspect arrives through
    # `filepath`, and the interpreter's command-line arguments (the kernel
    # launcher's, inside a notebook) say nothing about it.
    required = ("Image width", "Image height", "Bits/pixel", "Pixel format", "MIME type")

    parser = createParser(filepath)

    if not parser:
        print("Unable to parse file: %s" % filepath, file=stderr)
        exit(1)

    with parser:
        try:
            metadata = extractMetadata(parser)
        except Exception as err:
            print("Metadata extraction error (%s): %s" % (filepath, err), file=stderr)
            metadata = None
    if not metadata:
        print("Unable to extract metadata: %s" % filepath, file=stderr)
        exit(1)

    # exportPlaintext() yields a title line followed by '- <label>: <value>'
    # lines; it may yield nothing at all, hence the `or []` guard
    fields = {}
    for line in (metadata.exportPlaintext() or [])[1:]:
        # partition() splits on the first ': ' only, so values that themselves
        # contain ': ' are kept intact
        label, sep, value = line.removeprefix('- ').partition(": ")
        if sep and label in col_oi and label not in fields:
            fields[label] = value.removesuffix(' pixels').removeprefix('image/')

    missing = [label for label in required if label not in fields]
    if missing:
        raise ValueError(
            "Metadata of %r lacks the descriptor(s): %s" % (filepath, ", ".join(missing))
        )

    width_px = int(fields["Image width"])
    height_px = int(fields["Image height"])
    bits_p_px = int(fields["Bits/pixel"])
    px_format = fields["Pixel format"]
    mime = fields["MIME type"]

    return width_px, height_px, bits_p_px, px_format, mime

In [4]:
# helper function
def populate_dataset(dataset: dict, directory: str):
    """Populates the passed dictionary object, one entry per image file.

    Every sub-folder of ``directory`` is treated as one class folder; every
    regular file inside it is treated as an image. Folders and files are
    visited in sorted order so that the resulting rows are reproducible.

    Per image the following keys receive one value each:
        folder_name, file_name, width_px, height_px, bits_p_px, px_format,
        mime, channels, and for each channel index 0..2 the keys
        chn_<i>_px_{std,min,q1,med,avg,q3,max,sum}.
    Images with fewer than three channels (e.g. single-channel 2-D arrays)
    get NaN for the statistics of the channels they do not have, so that all
    columns keep the same length.

    Args:
        dataset (dict): the dictionary that will be populated; both a plain
            dict and a defaultdict(list) are accepted
        directory (str): the directory embracing all the data

    Returns:
        None
    """
    stat_names = ("std", "min", "q1", "med", "avg", "q3", "max", "sum")
    quartile_points = (0, 25, 50, 75, 100)

    def _append(key, value):
        # setdefault keeps the function usable with a plain dict as well
        dataset.setdefault(key, []).append(value)

    # loops through all the class folders in the directory
    for folder_name in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            # nested directories are not images; skip them
            if not os.path.isfile(file_path):
                continue
            width_px, height_px, bits_p_px, px_format, mime = image_metadata(file_path)
            # a 2-D (single-channel) array is promoted to shape (H, W, 1)
            image = np.atleast_3d(iio.imread(file_path))
            n_channels = image.shape[2]

            # retrieves the following stack of image metadata
            _append("folder_name", folder_name)
            _append("file_name", file_name)
            _append("width_px", width_px)
            _append("height_px", height_px)
            _append("bits_p_px", bits_p_px)
            _append("px_format", px_format)
            _append("mime", mime)
            # number of channels (3rd dimension of the image array)
            _append("channels", n_channels)

            # aggregates pixel-based major statistical features per channel
            for chn in range(3):
                if chn < n_channels:
                    channel = image[:, :, chn]
                    # one percentile call instead of five separate passes
                    px_min, px_q1, px_med, px_q3, px_max = np.percentile(channel, quartile_points)
                    stats = (
                        np.round(channel.std(), 1),
                        px_min,
                        px_q1,
                        px_med,
                        np.round(channel.mean(), 1),
                        px_q3,
                        px_max,
                        channel.sum(),
                    )
                else:
                    # channel absent in this image: keep the columns aligned
                    stats = (np.nan,) * len(stat_names)
                for stat_name, stat_value in zip(stat_names, stats):
                    _append(f"chn_{chn}_px_{stat_name}", stat_value)

In [21]:
# TODO: replace the path components below with those of your own full directory path
# TODO: on Unix systems, remove the first component ("C:")

# the last component ('dir6') is assumed to be the main directory that holds
# one folder per class, each containing the respective image files
_dataset_dir_parts = (
    "C:",
    os.sep,
    "Users",
    "username",
    "dir1",
    "dir2",
    "dir3",
    "dir4",
    "dir5",
    "dir6",
)
dataset_dir = os.path.join(*_dataset_dir_parts)

In [None]:
# column name -> list of per-image values; defaultdict(list) lets the
# populating helper append to columns that do not exist yet
data = defaultdict(list)
populate_dataset(data, dataset_dir)

# one row per image, one column per collected key
df = pd.DataFrame(data)

# writes the table next to the class folders, without the index column
csv_path = os.path.join(dataset_dir, "metadata_raw.csv")
df.to_csv(csv_path, index=False)

***