## Image Extraction

In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os 

Image Extraction: Procedure

+ Loading the mobile phone image dataset
+ Image Analysis: identify the distribution and the graphical representation of an image
+ Downloading the images
+ Storing the images in separate folders: train, test and validation

### Loading mobile image dataset

1. Select a folder

In [6]:
# Implement function for selecting a folder
def select_folder(
    sel_folder: str,
    proj_path: str = "C:\\Development\\Projects\\MachineLearning\\Mobile-Image_Classifier-System",
) -> str:
    """Return the path of a folder located inside the project's ``data`` folder.

    Parameters
    ----------
    sel_folder : str
        Name of a folder that sits directly inside ``<proj_path>/data``.
    proj_path : str, optional
        Root folder of the project. Defaults to the location this notebook
        was originally written for; pass another root to run it elsewhere.

    Returns
    -------
    str
        The joined path ``<proj_path>/data/<sel_folder>``.

    Raises
    ------
    FileNotFoundError
        If the ``data`` folder does not exist (raised by ``os.listdir``) or
        if ``sel_folder`` is not a directory directly inside it.
    """
    # Initialise the data folder name and build the source folder path
    source_folder_name = "data"
    source_folder = os.path.join(proj_path, source_folder_name)

    # Entries returned by os.listdir already exist, so no per-entry
    # existence check or folder creation is needed here.
    selected_folder_path = os.path.join(source_folder, sel_folder)

    # Select the required folder: it must be a direct child of the source
    # folder and an actual directory (a file with the same name is rejected)
    if sel_folder in os.listdir(source_folder) and os.path.isdir(selected_folder_path):
        return selected_folder_path

    # Fail loudly instead of implicitly returning None to the caller
    raise FileNotFoundError(
        f"Folder '{sel_folder}' was not found in '{source_folder}'"
    )


2. Select a csv-file from a folder

In [7]:
# Select a CSV-file from a folder
def select_csv_file(filename: str, folder: str) -> str:
    """Return the path of ``<filename>.csv`` inside the given data folder.

    Parameters
    ----------
    filename : str
        Name of the CSV file without the ``.csv`` extension.
    folder : str
        Name of the folder to search; it is resolved with ``select_folder``.

    Returns
    -------
    str
        Path of the CSV file inside the resolved folder.

    Raises
    ------
    FileNotFoundError
        If the folder cannot be resolved or the CSV file is not in it.
    """
    # Select a folder
    sel_folder = select_folder(folder)

    # Guard against an unresolved folder: os.listdir(None) would silently
    # list the current working directory instead of failing
    if sel_folder is None:
        raise FileNotFoundError(f"Folder '{folder}' was not found")

    # Select a csv-file (if it exists)
    csv_filename = f"{filename}.csv"
    filename_path = os.path.join(sel_folder, csv_filename)

    if csv_filename in os.listdir(sel_folder) and os.path.isfile(filename_path):
        return filename_path

    # Fail here with a clear message rather than handing None to the caller
    raise FileNotFoundError(
        f"File '{csv_filename}' was not found in '{sel_folder}'"
    )

In [None]:
# Locate datasets/mobiles.csv and read it; the first CSV column becomes the row index
csv_file = select_csv_file("mobiles", "datasets")
data = pd.read_csv(csv_file, index_col=0)

# TODO: rename the columns (inspect data.columns first)

# Preview the first record
data.head(n=1)

Unnamed: 0,Names,Image_Links,Stars,Rating&Reviews,Price_Details,Memory,Camara_Info,Display,Battery,Processor,Warranty
0,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB)",https://rukminim1.flixcart.com/image/312/312/x...,4.4,"1,20,759 Ratings & 7,003 Reviews","₹9,699\n₹14,99935% off",4 GB RAM | 64 GB ROM | Expandable Upto 1 TB,50MP + 5MP + 2MP | 8MP Front Camera,16.76 cm (6.6 inch) Full HD+ Display,6000 mAh Lithium Ion Battery,Exynos 850 Processor,1 Year Warranty Provided By the Manufacturer f...


## Image Analysis 

+ Identify the image distribution: number of images per mobile phone brand class
+ Detect irrelevant images and null values

In [9]:
# Keep only the image link and the phone name of every record
image_columns = ["Image_Links", "Names"]
image_ds = data[image_columns]

# Summarise the entry count, non-null counts and dtypes of the two columns
image_ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1148 entries, 0 to 1147
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image_Links  1143 non-null   object
 1   Names        943 non-null    object
dtypes: object(2)
memory usage: 26.9+ KB


In [10]:
# Number of null values for each image feature
miss_vals_df = (
    image_ds.isnull()
    .sum()
    .rename_axis("Image_Feature")
    .reset_index(name="Nullvalues")
)

# Add a column of real (non-null) values, counted from the data itself
# rather than by subtracting hard-coded null counts, so the summary stays
# correct when the dataset changes
real_img_links = int(image_ds["Image_Links"].count())
real_names = int(image_ds["Names"].count())

miss_vals_df["Real_values"] = pd.Series([real_img_links, real_names])
miss_vals_df

Unnamed: 0,Image_Feature,Nullvalues,Real_values
0,Image_Links,5,1143
1,Names,205,943


**Visualise the distribution of null values (TODO: check later)**

In [11]:
# # Create a column chart (vertical) 
# # Initialise constants: real values, null values and image features 
# image_links = np.array([5, 1143])
# names = np.array([205, 943])

# x_img_labels = np.array(["Nullvalues", "Real_values"])

# # Create 
# nd = np.arange(len())  # the x locations for the groups
# width = 0.35  # the width of the bars

# fig, ax = plt.subplots()
# rects1 = ax.bar(ind - width/2, men_means, width, yerr=men_std,
#                 label='Men')
# rects2 = ax.bar(ind + width/2, women_means, width, yerr=women_std,
#                 label='Women')

# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Scores')
# ax.set_title('Scores by group and gender')
# ax.set_xticks(ind)
# ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))
# ax.legend()


# def autolabel(rects, xpos='center'):
#     """
#     Attach a text label above each bar in *rects*, displaying its height.

#     *xpos* indicates which side to place the text w.r.t. the center of
#     the bar. It can be one of the following {'center', 'right', 'left'}.
#     """

#     ha = {'center': 'center', 'right': 'left', 'left': 'right'}
#     offset = {'center': 0, 'right': 1, 'left': -1}

#     for rect in rects:
#         height = rect.get_height()
#         ax.annotate('{}'.format(height),
#                     xy=(rect.get_x() + rect.get_width() / 2, height),
#                     xytext=(offset[xpos]*3, 3),  # use 3 points offset
#                     textcoords="offset points",  # in both directions
#                     ha=ha[xpos], va='bottom')


# autolabel(rects1, "left")
# autolabel(rects2, "right")

# fig.tight_layout()

# plt.show()


### Insight of the Image Analysis

The image dataset contains 1148 entries in total (index 0 to 1147), consisting of both real values and missing (null) values. The descriptive analysis of the dataset gives the following overview of the image data distribution:

+ Image links: contains 1143 real values and 5 null values
+ Names: contains 205 null values and 943 real values

## Save image dataset

In [12]:
# Write the extracted image links and names to a CSV file (relative path)
image_ds.to_csv(path_or_buf="image_ds1.csv")