* Querol, Lorenzo
* Permito, Joshua
* Pineda, Ralph
* Abello, Hans Matthew

# Import Libraries

In [36]:
import os
import glob

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2
from tqdm.notebook import tqdm

# Function Definitions

In [37]:
"""
Helper function to load images from a directory. Images are also converted to RGB.

Inputs:
- path: path to directory containing images
"""
def load_images(path):
    """
    Load every entry of a directory as an RGB uint8 image.

    Inputs:
    - path: path to directory containing images

    Returns:
    - list of uint8 RGB image arrays, one per directory entry, in the order
      reported by os.listdir

    Raises:
    - OSError: if cv2.imread cannot read one of the entries
    """
    images = []

    for filename in tqdm(os.listdir(path)):
        image_path = os.path.join(path, filename)
        image = cv2.imread(image_path)

        # cv2.imread reports failure by returning None instead of raising, so
        # without this check the error would only surface inside cvtColor and
        # would not say which file is at fault.
        if image is None:
            raise OSError(f'Could not read image: {image_path}')

        # OpenCV loads images as BGR; convert to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images.append(image.astype(np.uint8))

    return images

"""
Helper function to organize images and labels

Inputs: 
- dir: directory of images
"""
def get_images_and_labels(dir):
    """
    Load the six class folders under `dir` and label each image by folder.

    The folders are visited in a fixed order and every image receives the
    position (0-5) of the folder it was loaded from.

    Inputs:
    - dir: directory of images

    Returns:
    - images: list of all loaded images, grouped by class in folder order
    - labels: numpy array with one integer class index per image
    """
    class_folders = ('freshapples', 'freshbanana', 'freshoranges',
                     'rottenapples', 'rottenbanana', 'rottenoranges')

    images = []
    label_list = []

    for class_index, folder in enumerate(class_folders):
        class_images = load_images(f'{dir}/{folder}')
        images.extend(class_images)
        # One label per image of this class
        label_list.extend([class_index] * len(class_images))

    return images, np.array(label_list)

"""
Helper function to plot images

Inputs: 
- images: list of images to plot
"""
def plot_images(images):
    """
    Show up to the first 25 images on a 5x5 grid without ticks or grid lines.

    Inputs:
    - images: list of images to plot
    """
    grid_size = 5
    plt.figure(figsize=(9, 9))

    # Subplot positions are 1-based, hence start=1
    for position, picture in enumerate(images[:grid_size * grid_size], start=1):
        plt.subplot(grid_size, grid_size, position)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(picture)

    plt.show()

# Load Images

In [38]:
# Dataset locations
main_dir = './subsampled_fruits_dataset'
train_dir = f'{main_dir}/train'
test_dir = f'{main_dir}/test'

# Class folder names; the position in this list is the integer class label
classnames = ['freshapples', 'freshbanana', 'freshoranges',
              'rottenapples', 'rottenbanana', 'rottenoranges']

idx2class = {i: classname for i, classname in enumerate(classnames)}


def list_image_paths(root):
    """
    List the file paths of every class folder under `root`.

    Paths are produced class by class in `classnames` order, and within a
    class in os.listdir order. This is the order in which
    get_images_and_labels/load_images read the images, so paths[i] refers to
    the file behind images[i].

    Inputs:
    - root: directory containing one sub-folder per class

    Returns:
    - list of file paths
    """
    paths = []

    for classname in classnames:
        class_dir = f'{root}/{classname}'

        # A missing class folder simply contributes no paths
        if not os.path.isdir(class_dir):
            continue

        paths.extend(os.path.join(class_dir, filename)
                     for filename in os.listdir(class_dir))

    return paths


train_paths = list_image_paths(train_dir)
test_paths = list_image_paths(test_dir)

In [39]:
# Load the images of both splits together with their integer class labels
train_images, train_labels = get_images_and_labels(train_dir)
test_images, test_labels = get_images_and_labels(test_dir)

  0%|          | 0/423 [00:00<?, ?it/s]

  0%|          | 0/395 [00:00<?, ?it/s]

  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

  0%|          | 0/556 [00:00<?, ?it/s]

  0%|          | 0/398 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

# Driver Code

In [40]:
from feature_extraction import extract_features

def image_to_features(images, paths, labels):
    """
    Run feature extraction on every image and collect the results in a dataframe.

    Inputs:
    - images: list of images, passed one at a time to extract_features
    - paths: sequence indexed like images; stored in the 'image' column
    - labels: sequence of class indices indexed like images; mapped to class
      names through idx2class

    Returns:
    - dataframe with one row per image holding 'image', 'class' and the
      extracted feature columns
    """
    columns = ['image', 'class', 'area', 'perimeter', 'circularity', 'convexity', 'red_mean', 'green_mean', 'blue_mean', 'red_std', 'green_std', 'blue_std', 'red_skew', 'green_skew',
               'blue_skew', 'red_kurt', 'green_kurt', 'blue_kurt', 'h_mean', 's_mean', 'v_mean', 'h_std', 's_std', 'v_std', 'h_skew', 's_skew', 'v_skew', 'h_kurt', 's_kurt', 'v_kurt']

    rows = []

    for i, image in enumerate(tqdm(images)):
        # Apply segmentation and extract features. The segmented image is not
        # part of the result, so it is discarded rather than kept in memory.
        _, features = extract_features(image)

        # NOTE(review): features is assumed to be a mapping of feature name to
        # value - confirm against feature_extraction.extract_features
        feature_vector = {'image': paths[i], 'class': labels[i]}
        feature_vector.update(features)
        rows.append(feature_vector)

    # Build the dataframe once; concatenating inside the loop copies the whole
    # frame on every iteration.
    features_df = pd.DataFrame(rows)

    # Keep the declared column order and append any additional feature keys
    extra_columns = [column for column in features_df.columns if column not in columns]
    features_df = features_df.reindex(columns=columns + extra_columns)

    features_df['class'] = features_df['class'].map(idx2class)

    return features_df

In [41]:
# Build one feature dataframe per split
train_features_df = image_to_features(train_images, train_paths, train_labels)
test_features_df = image_to_features(test_images, test_paths, test_labels)

  0%|          | 0/2723 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

**Sanity Check!**

In [42]:
# Report sample, feature and class counts for each split
for heading, split_df in (("Train features:", train_features_df),
                          ("\nTest features:", test_features_df)):
    print(heading)
    print(f"Number of samples: {len(split_df)}")

    # Everything except the identifier and target columns counts as a feature
    feature_columns = split_df.drop(columns=['image', 'class']).columns
    print(f"Number of features: {len(feature_columns)}")

    class_values = split_df['class'].unique()
    print(f"Number of unique classes: {len(class_values)}")

Train features:
Number of samples: 2723
Number of features: 28
Number of unique classes: 6

Test features:
Number of samples: 672
Number of features: 28
Number of unique classes: 6


# Convert to CSV

In [44]:
# Write both feature tables to CSV, leaving out the dataframe index
train_features_df.to_csv('train_fruits_dataset.csv', index=False)
test_features_df.to_csv('test_fruits_dataset.csv', index=False)