# Image Storage Processing 

In [1]:
# import libraries for data processing
import numpy as np
import pandas as pd

# import libraries for file manipulation + system management
import os
import requests
import shutil
from urllib.parse import urlparse

# import dependency from image processing toolkit
# from image_processing_tools import distribute_images
from image_processing_tools import train_test_images, move_images
from image_processing_tools import remove_images
from image_processing_tools import det_folder_size
from image_processing_tools import move_img_by_label

### ISP-Approach

+ Loading Images
+ Download images into the source image folder (if needed)
+ Distributing stored images from source path to new destination path
+ Image Reorganisation based on labels

## Loading Images 

In [2]:
# Read the image metadata table; the first CSV column is used as the index
image_ds_file = "image_dataset.csv"
image_df = pd.read_csv(filepath_or_buffer=image_ds_file, index_col=0)

# Preview the first five rows
image_df.head(n=5)

Unnamed: 0,Image_file,Labels,Model,Company
0,image_000.jpg,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,SM-A156B Galaxy,Samsung
1,image_001.jpg,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,SM-A556B Galaxy,Samsung
2,image_002.jpg,Motorola Moto G54 256GB Blue 5G Android Smartp...,Moto G54,Motorola
3,image_003.jpg,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,14 Ultra,Xiaomi
4,image_004.jpg,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Galaxy S20,Samsung


In [3]:
# Number of images per manufacturer, most frequent first
company_column = image_df["Company"]
company_column.value_counts()

Company
Samsung      345
Xiaomi       167
Apple        105
Nokia        103
Motorola      30
OnePlus       26
Redmi         13
Poco          10
Microsoft      1
Name: count, dtype: int64

## Distribution of stored images


+ Creating source and destination folders
+ Split image data into training and testing image sets
+ Move images to training and testing folders 

In [4]:
# Data splitting: image counts intended for the train, test and validation folders
train_size, test_size, validation_size = 800, 160, 40

In [5]:
# Root folder that holds every dataset directory of the project
path = "D:\\Machine_Learning\\Portfolio_Project_Machine_Learning\\Mobile_Phone_Recognition\\datasets"

# Create the destination folder BEFORE listing the root, so the listing
# (and everything derived from it) is the same on a first run as on later runs
os.makedirs(os.path.join(path, "destination"), exist_ok=True)

# List the dataset folders, skipping "dataset_structure" when it is present.
# Filtering avoids the ValueError that list.remove raises for a missing entry,
# and sorting is needed because os.listdir returns entries in arbitrary order.
dataset_paths = sorted(entry for entry in os.listdir(path) if entry != "dataset_structure")
print(f"Dataset Paths: {dataset_paths}")

# create constants: source folder and destination folder (train, test and validation folder)
# The folders are addressed by name instead of by their position in the listing,
# so an extra or missing sibling folder can no longer swap source and destination.
destination_path = os.path.join(path, "destination")
source_path = os.path.join(path, "source_images")

# Identify the location of source and destination folders
for folder_name, path_idx in zip(["Destination Folder", "Source Folder"], ["destination", "source_images"]):
  ds_path = os.path.join(path, path_idx)
  print(f"{folder_name} Folder: {ds_path}")

Dataset Paths: ['destination', 'source_images', 'source_images_v1', 'source_images_v2']
Destination Folder Folder: D:\Machine_Learning\Portfolio_Project_Machine_Learning\Mobile_Phone_Recognition\datasets\destination
Source Folder Folder: D:\Machine_Learning\Portfolio_Project_Machine_Learning\Mobile_Phone_Recognition\datasets\source_images


In [6]:
# Sub-folder names for the two splits, resolved against the destination folder
tt_path = ['test_images', 'train_images']
test_path, train_path = (os.path.join(destination_path, sub_folder) for sub_folder in tt_path)

# Candidate test fractions; only the first one is used for the split below
test_size = [0.20, 0.25, 0.3]

# Split the image data into (image, label) training and testing sets
img_label_train, img_label_test = train_test_images(
    source_dir=source_path,
    destination_dir=destination_path,
    img_dataframe=image_df,
    test_size=test_size[0],
)

In [7]:
# Preview the first 15 (image file, brand) pairs of the training split
for i, (image, label) in zip(range(15), img_label_train):
  print(f"Image {i}: {image} --> Brand: {label}")

# Training split: first element of each pair is the image file, second is the label
img_train = np.array([pair[0] for pair in img_label_train])
label_train = np.array([pair[1] for pair in img_label_train])

# Testing split: same layout as the training split
img_test = np.array([pair[0] for pair in img_label_test])
label_test = np.array([pair[1] for pair in img_label_test])

# Report how many images ended up in each split
print(f"Training images: {img_train.shape}", f"Testing images: {img_test.shape}", sep="\n")

Image 0: image_264.jpg --> Brand: Samsung
Image 1: image_615.jpg --> Brand: Xiaomi
Image 2: image_329.jpg --> Brand: OnePlus
Image 3: image_342.jpg --> Brand: Nokia
Image 4: image_394.jpg --> Brand: Samsung
Image 5: image_079.jpg --> Brand: Xiaomi
Image 6: image_148.jpg --> Brand: Samsung
Image 7: image_350.jpg --> Brand: Nokia
Image 8: image_382.jpg --> Brand: Samsung
Image 9: image_440.jpg --> Brand: Nokia
Image 10: image_309.jpg --> Brand: Xiaomi
Image 11: image_707.jpg --> Brand: Apple
Image 12: image_133.jpg --> Brand: Motorola
Image 13: image_381.jpg --> Brand: Samsung
Image 14: image_213.jpg --> Brand: Nokia
Training images: (640,)
Testing images: (160,)


In [8]:
# Build one {"Image": file, "Label": brand} record per image for each split
# (these are plain lists of dicts, not DataFrames)
img_train_set = [dict(zip(("Image", "Label"), pair)) for pair in zip(img_train, label_train)]
img_test_set = [dict(zip(("Image", "Label"), pair)) for pair in zip(img_test, label_test)]

**Moving image data to separate training and testing folders**

In [9]:
# # Training images: move mobile phone images by brand label
# move_images(image_list=img_train_set,
#             source_folder=source_path,
#             destination_folder=train_path)

# # Testing images: move mobile phone images by brand label
# move_images(image_list=img_test_set, 
#             source_folder=source_path,
#             destination_folder=test_path)

**Moving images to train and test folders based on their labels**

In [10]:
# Folder that receives the training images
train_path = os.path.join(destination_path, "train")

# Two-column table (image file, brand label) for the training split
img_train_df = pd.DataFrame({"Image": img_train, "Label": label_train})

# Same table layout for the testing split
img_test_df = pd.DataFrame({"Image": img_test, "Label": label_test})

In [15]:
# Preview the first five rows of the training table
img_train_df.head(n=5)

Unnamed: 0,Image,Label
0,image_264.jpg,Samsung
1,image_615.jpg,Xiaomi
2,image_329.jpg,OnePlus
3,image_342.jpg,Nokia
4,image_394.jpg,Samsung


In [None]:
# Both calls read from the same source folder and write under the same destination root
shared_dirs = {"source_dir": source_path, "target_dir": destination_path}

# Training split: move images into the "train" target by label
move_img_by_label(target_df=img_train_df,
                  label_df=img_train_df["Label"],
                  target="train",
                  **shared_dirs)

# Testing split: move images into the "test" target by label
move_img_by_label(target_df=img_test_df,
                  label_df=img_test_df["Label"],
                  target="test",
                  **shared_dirs)

Data split complete!


Removing current folder (if necessary)

In [None]:
# remove images from current folder 
# remove_images(current_folder=destination_path)

Redistributing images with updated train and test sizes:
+ Remove the remaining existing images that were distributed with the old train and test sizes
+ Update the train and test sizes
+ Redistribute the images with the updated train and test sizes

## Descriptive analysis of images based on their labels

+ finding number of images per training, testing and validation folders
+ finding number of images per class label folders

In [19]:
# Root folder that contains the distributed (split) images
distributed_path = "D:\\Machine_Learning\\Portfolio_Project_Machine_Learning\\Mobile_Phone_Recognition\\datasets\\destination"
distributed_path_list = os.listdir(distributed_path)

# One sub-folder per split, all located directly under the root
train_path, test_path, validation_pth = (
    os.path.join(distributed_path, split_name)
    for split_name in ("train", "test", "validation")
)

# Show where the training and testing images are expected
print(f"Training folder: {train_path}")
print(f"Testing folder: {test_path}")
# print(f"Validation folder: {validation_pth}")

Training folder: D:\Machine_Learning\Portfolio_Project_Machine_Learning\Mobile_Phone_Recognition\datasets\destination\train
Testing folder: D:\Machine_Learning\Portfolio_Project_Machine_Learning\Mobile_Phone_Recognition\datasets\destination\test


In [35]:
# Folder sizes for the train and test splits; the results are used as mappings
# (keys -> "Image_Folder" column, values -> size column)
image_train_counter = det_folder_size(sel_path=train_path)
image_test_counter = det_folder_size(sel_path=test_path)
# image_val_counter = det_folder_size(sel_path=validation_pth)

# Training table: one row per image folder with its size
train_counter_df = pd.DataFrame().assign(
    Image_Folder=image_train_counter.keys(),
    Train_size=image_train_counter.values(),
)

# Testing table: same layout, built from the test split counts
test_counter_df = pd.DataFrame().assign(
    Image_Folder=image_test_counter.keys(),
    Test_size=image_test_counter.values(),
)

In [39]:
# Display the per-folder image counts of the training split
train_counter_df

Unnamed: 0,Image_Folder,Train_size
0,Apple,87
1,Microsoft,1
2,Motorola,27
3,Nokia,85
4,OnePlus,19
5,Poco,9
6,Redmi,8
7,Samsung,268
8,Xiaomi,136


In [40]:
# Total number of images in each split, summed over all image folders
train_size = train_counter_df["Train_size"].sum()
test_size = test_counter_df["Test_size"].sum()

# Report both totals, one per line
print(f"Training Size: {train_size}", f"Testing Size: {test_size}", sep="\n")

Training Size: 640
Testing Size: 160


In [41]:
# One-row table holding the total image count of each split
image_total_set = pd.DataFrame({
    "Train_sum": pd.Series(train_counter_df["Train_size"].sum()),
    "Test_sum": pd.Series(test_counter_df["Test_size"].sum()),
})
# image_total_set["Valdation_sum"] = pd.Series(train_counter_df["Validation_size"].sum())

# Show the totals vertically (one row per split) without modifying image_total_set
image_total_set.T

Unnamed: 0,0
Train_sum,640
Test_sum,160
