## Prepare Images dataset to CSV files for preprcessing

In [2]:
IMG_SIZE = (64, 64)

In [3]:
import os
from tqdm import tqdm
import cv2
import numpy as np

LABELS = {
    "Apple": 0,
    "Cherry": 1,
    "Tomato": 2
}

def process_dataset(root_path):
    data = []
    labels = []

    # Loops over main classes (Apple, Cherry, Tomato)
    for fruit in os.listdir(root_path):
        fruit_path = os.path.join(root_path, fruit)

        if not os.path.isdir(fruit_path):
            continue
        
        # convert fruit name to label
        label = LABELS.get(fruit, None)
        if label is None:
            print("Skipping unknown folder:", fruit)
            continue

        # loops over varieties in each class
        for variety in os.listdir(fruit_path):
            variety_path = os.path.join(fruit_path, variety)

            if not os.path.isdir(variety_path):
                continue

            print(f"Processing {fruit}/{variety}")

            # loops over images in each variety
            for img_name in tqdm(os.listdir(variety_path)):
                img_path = os.path.join(variety_path, img_name)

                img = cv2.imread(img_path)
                if img is None:
                    continue

                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                img = cv2.resize(img, IMG_SIZE)

                data.append(img)
                labels.append(label)

    return np.array(data), np.array(labels)


In [4]:
train_path = "..\\dataset\\fruits-360-3-body-problem\\train"

print("Processing TRAIN dataset...")
X_train, y_train = process_dataset(train_path)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

Processing TRAIN dataset...
Processing Apple/Apple 11


100%|██████████| 572/572 [00:04<00:00, 118.24it/s]


Processing Apple/Apple 12


100%|██████████| 620/620 [00:05<00:00, 122.94it/s]


Processing Apple/Apple 13


100%|██████████| 934/934 [00:07<00:00, 120.43it/s]


Processing Apple/Apple 14


100%|██████████| 620/620 [00:05<00:00, 117.04it/s]


Processing Apple/Apple 17


100%|██████████| 811/811 [00:06<00:00, 121.76it/s]


Processing Apple/Apple 18


100%|██████████| 644/644 [00:05<00:00, 125.55it/s]


Processing Apple/Apple 5


100%|██████████| 586/586 [00:04<00:00, 123.75it/s]


Processing Apple/Apple 7


100%|██████████| 923/923 [00:07<00:00, 123.24it/s]


Processing Apple/Apple 8


100%|██████████| 915/915 [00:07<00:00, 124.54it/s]


Processing Apple/Apple 9


100%|██████████| 925/925 [00:07<00:00, 123.84it/s]


Processing Apple/Apple Braeburn 1


100%|██████████| 656/656 [00:05<00:00, 125.90it/s]


Processing Apple/Apple Crimson Snow 1


100%|██████████| 592/592 [00:04<00:00, 122.55it/s]


Processing Apple/Apple Golden 2


100%|██████████| 656/656 [00:05<00:00, 127.94it/s]


Processing Apple/Apple Golden 3


100%|██████████| 642/642 [00:05<00:00, 123.33it/s]


Processing Apple/Apple hit 1


100%|██████████| 936/936 [00:07<00:00, 121.96it/s]


Processing Apple/Apple Pink Lady 1


100%|██████████| 608/608 [00:05<00:00, 106.13it/s]


Processing Apple/Apple Red 2


100%|██████████| 656/656 [00:06<00:00, 103.21it/s]


Processing Apple/Apple Red 3


100%|██████████| 573/573 [00:04<00:00, 116.61it/s]


Processing Apple/Apple Red Delicious 1


100%|██████████| 656/656 [00:05<00:00, 110.78it/s]


Processing Apple/Apple Red Yellow 2


100%|██████████| 891/891 [00:07<00:00, 118.74it/s]


Processing Apple/Apple Rotten 1


100%|██████████| 650/650 [00:05<00:00, 117.25it/s]


Processing Apple/Apple worm 1


100%|██████████| 927/927 [00:07<00:00, 118.24it/s]


Processing Cherry/Cherry 1


100%|██████████| 656/656 [00:05<00:00, 119.03it/s]


Processing Cherry/Cherry 2


100%|██████████| 984/984 [00:08<00:00, 120.19it/s]


Processing Cherry/Cherry 4


100%|██████████| 930/930 [00:07<00:00, 117.18it/s]


Processing Cherry/Cherry 5


100%|██████████| 921/921 [00:07<00:00, 119.37it/s]


Processing Cherry/Cherry Rainier 2


100%|██████████| 927/927 [00:08<00:00, 114.69it/s]


Processing Cherry/Cherry Rainier 3


100%|██████████| 914/914 [00:07<00:00, 117.42it/s]


Processing Cherry/Cherry Sour 1


100%|██████████| 606/606 [00:05<00:00, 119.18it/s]


Processing Cherry/Cherry Wax Black 1


100%|██████████| 656/656 [00:05<00:00, 118.98it/s]


Processing Cherry/Cherry Wax not ripen 2


100%|██████████| 304/304 [00:02<00:00, 111.57it/s]


Processing Cherry/Cherry Wax Red 1


100%|██████████| 656/656 [00:05<00:00, 118.64it/s]


Processing Cherry/Cherry Wax Red 2


100%|██████████| 602/602 [00:05<00:00, 117.69it/s]


Processing Cherry/Cherry Wax Yellow 1


100%|██████████| 656/656 [00:05<00:00, 119.82it/s]


Skipping unknown folder: Tomatoe
Shape of X_train: (24805, 64, 64)
Shape of y_train: (24805,)


In [5]:
test_path = "..\\dataset\\fruits-360-3-body-problem\\test"

print("Processing TEST dataset...")
X_test, y_test = process_dataset(test_path)

print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Processing TEST dataset...
Processing Apple/Apple 10


100%|██████████| 930/930 [00:08<00:00, 107.78it/s]


Processing Apple/Apple 19


100%|██████████| 970/970 [00:11<00:00, 81.98it/s]


Processing Apple/Apple 6


100%|██████████| 630/630 [00:07<00:00, 84.79it/s]


Processing Apple/Apple Golden 1


100%|██████████| 640/640 [00:06<00:00, 93.87it/s] 


Processing Apple/Apple Granny Smith 1


100%|██████████| 656/656 [00:06<00:00, 97.44it/s] 


Processing Apple/Apple Red 1


100%|██████████| 656/656 [00:07<00:00, 92.88it/s] 


Processing Apple/Apple Red Yellow 1


100%|██████████| 656/656 [00:07<00:00, 88.40it/s] 


Processing Cherry/Cherry 3


100%|██████████| 930/930 [00:10<00:00, 90.83it/s] 


Processing Cherry/Cherry Rainier 1


100%|██████████| 984/984 [00:09<00:00, 104.35it/s]


Processing Cherry/Cherry Wax not ripen 1


100%|██████████| 900/900 [00:08<00:00, 101.65it/s]


Processing Cherry/Cherry Wax Red 3


100%|██████████| 604/604 [00:05<00:00, 111.09it/s]

Skipping unknown folder: Tomatoe
Shape of X_test: (8556, 64, 64)
Shape of y_test: (8556,)





In [6]:
import pandas as pd

#Saving processed data to CSV files
X_train_flat = X_train.reshape(X_train.shape[0], -1)

train_df = pd.DataFrame(X_train_flat)
train_df["label"] = y_train

X_test_flat = X_test.reshape(X_test.shape[0], -1)
test_df = pd.DataFrame(X_test_flat)
test_df["label"] = y_test

train_df.to_csv("processed_data/train.csv", index=False)
test_df.to_csv("processed_data/test.csv", index=False)

## Preprocess Phase

#### Handle Nulls and duplicates
There is no missing values in our dataset

In [14]:
print("Checking for null values in train and test datasets:")
print(train_df.isnull().sum())
print()
print(test_df.isnull().sum())

Checking for null values in train and test datasets:
0        0
1        0
2        0
3        0
4        0
        ..
4092     0
4093     0
4094     0
4095     0
label    0
Length: 4097, dtype: int64

0        0
1        0
2        0
3        0
4        0
        ..
4092     0
4093     0
4094     0
4095     0
label    0
Length: 4097, dtype: int64


## Feature Exctraction & Scaling

#### We use PCA and Standerd scalar

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [16]:
# Make sure all column names are strings
train_df.columns = train_df.columns.astype(str)
test_df.columns = test_df.columns.astype(str)

numeric_cols = train_df.select_dtypes(include=['int64', 'float64', 'uint8']).columns

# pipeline for scaling and PCA
num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=600, random_state=42))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numeric_cols)
])

X_train_processed = preprocessor.fit_transform(train_df)
X_test_processed = preprocessor.transform(test_df)

print("Processed train shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)

Processed train shape: (24805, 600)
Processed test shape: (8556, 600)


## Save Processed Train / Test 

In [17]:
train_df_processed = pd.DataFrame(X_train_processed)
test_df_processed = pd.DataFrame(X_test_processed)

train_df_processed.to_csv("processed_data/train.csv", index=False)
test_df_processed.to_csv("processed_data/test.csv", index=False)

print("Saved processed training and test sets to CSV!")

Saved processed training and test sets to CSV!
