## Image Transformation

In [32]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
from PIL import Image
import random

from image_processing_tools import construct_feature, data_loading

#### Approach

+ Data Loading: Images, Labels
+ Data Transformation
+ Data Preprocessing 
+ Data Storage 

In [2]:
pd.set_option("display.max_rows", 100)

## Data Loading

In [3]:
# define constants --> loading content for data transformation
text_file = "mobile_labels.txt"
image_path = "D:\\Machine_Learning\\Portfolio_Project_Machine_Learning\\Mobile_Phone_Recognition\\mobile_phone_images\\mobile_images"

# load images and labels 
mobile_images = os.listdir(image_path)
mobile_labels = data_loading(directory=text_file, content_type="Mobile Labels")

In [4]:
print(f"Mobile Labels (Size): {mobile_labels.shape[0]}")
print(f"{mobile_labels[:5]} \n")
print(f"Mobile Images (Size): {len(mobile_images)}")
print(f"{mobile_images[:5]}")

Mobile Labels (Size): 800
['Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RAM 128GB Blue Black EU\n'
 'Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 128GB Awesome Lemon EU\n'
 'Motorola Moto G54 256GB Blue 5G Android Smartphone 6.5 Inches 12GB RAM 16MP\n'
 'Xiaomi 14 Ultra 5G 16GB/512GB White (White) Dual SIM\n'
 'Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lavender) Dual SIM G781B\n'] 

Mobile Images (Size): 800
['image_000.jpg', 'image_001.jpg', 'image_002.jpg', 'image_003.jpg', 'image_004.jpg']


## Data Transformation

Approach:
+ convert labels and images into frames
+ extract content from labels (check requirement.txt)
+ create feature with brand names seperately

In [5]:
# convert labels and image list into frames
frame1 = construct_feature("Labels", mobile_labels)
frame2 = construct_feature("Image_file", mobile_images)

# concatenate both frames
mobile_df = pd.concat([frame1, frame2], axis=1)

In [6]:
mobile_df.head()

Unnamed: 0,Labels,Image_file
0,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,image_000.jpg
1,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,image_001.jpg
2,Motorola Moto G54 256GB Blue 5G Android Smartp...,image_002.jpg
3,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,image_003.jpg
4,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,image_004.jpg


In [7]:
mobile_df["Labels"].unique()

array(['Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RAM 128GB Blue Black EU\n',
       'Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 128GB Awesome Lemon EU\n',
       'Motorola Moto G54 256GB Blue 5G Android Smartphone 6.5 Inches 12GB RAM 16MP\n',
       'Xiaomi 14 Ultra 5G 16GB/512GB White (White) Dual SIM\n',
       'Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lavender) Dual SIM G781B\n',
       'Samsung Galaxy S22+ 5G 8GB/128GB Black (Phantom Black) Dual SIM SM-S906\n',
       'Samsung Galaxy A55 Smartphone 128GB 5G Midnight Blue\n',
       'Samsung Smartphone Galaxy S20, 5G, 12GB + 128GB, Grey\n',
       'Samsung Galaxy A03 Smartphone 64GB, 4GB RAM, Dual SIM, Blue\n',
       'SAMSUNG S20 ULTRA 128GB Phone - Cosmic gray\n',
       'SAMSUNG - Smartphone Galaxy A14 5G Silver 6.6 IN 4GB 64GB Android 13\n',
       'Samsung Galaxy Note Pack 20 5G - 6.7" Display (AMOLED FHD+, 8GB RAM + 256GB Storage, Quad Rear Camera, 4300mAh Fast Charge 25W) Mystic Green [ES Version]\n',
       'Smartphone

## Data Preprocessing

After constructing a dataframe with labels and images, we focus on extracting the name of the mobile brand from the label and combine them in multiple frames. The target = "brand name"

In [8]:
# extract label data
labels = mobile_df["Labels"].apply(lambda text: text.capitalize())

# extract brands from the label data 
mobile_df["Company"] = mobile_df["Labels"].str.split(" ", expand=True).loc[:, 0]


In [9]:
# extract the model from the label data 
phone_model = mobile_df["Labels"].str.split(" ", expand=True).loc[:,1:2]
mobile_df["Model"] = phone_model[[1,2]].agg(lambda x: " ".join(x), axis=1)

In [10]:
# reorganise the columns
reorganised_columns = ["Image_file", "Labels", "Model", "Company"]
mobile_df = mobile_df[reorganised_columns]
mobile_df.head()

Unnamed: 0,Image_file,Labels,Model,Company
0,image_000.jpg,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,SM-A156B Galaxy,Samsung
1,image_001.jpg,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,SM-A556B Galaxy,Samsung
2,image_002.jpg,Motorola Moto G54 256GB Blue 5G Android Smartp...,Moto G54,Motorola
3,image_003.jpg,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,14 Ultra,Xiaomi
4,image_004.jpg,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Galaxy S20,Samsung


In [11]:
mobile_df.isnull().sum()

Image_file    0
Labels        0
Model         0
Company       0
dtype: int64

In [12]:
mobile_df.iloc[27]

Image_file                                        image_027.jpg
Labels        Samsung Galaxy A52 5G Smartphone 128GB, 6GB RA...
Model                                                Galaxy A52
Company                                                 Samsung
Name: 27, dtype: object

In [13]:
# check for inconsistencies in the data 
mobile_df["Company"].unique()

array(['Samsung', 'Motorola', 'Xiaomi', 'SAMSUNG', 'Smartphone', 'XIAOMI',
       'Nokia', 'Redmi', 'Apple', 'NOKIA', 'POCO', 'Galaxy', 'Sam', 'ATI',
       'Moto', 'Telecom', 'XIA', 'APPLE', 'iPhone', 'Poco', 'OnePlus',
       'Oneplus', 'Free', 'apple', 'Unlocked', 'SMARTPHONE', 'M4',
       'GenÃ©rico'], dtype=object)

## Data Preprocessing: Company Data

Inconsistent company names
+ names with half words (data standardisation errors): Moto, ATI, Telecom, XIA, M4, Sam, SAMSUNG, XIAOMI, NOKIA, POCO
+ inconsistent names: Free, Unlocked
+ generic entries: 'GenÃ©rico'

In [14]:
# capitalise the company names
mobile_df["Company"] = mobile_df["Company"].apply(lambda text: text.capitalize() if text == text.upper() else text)

Company Brands:
+ Samsung: Sam, SAMSUNG, Galaxy
+ Apple: iPhone, APPLE, apple
+ OnePlus: Oneplus, oneplus
+ Poco: poco, POCO
+ Nokia: NOKIA, 
+ Xiaomi: XIAOMI, XIA
+ Motorola: Moto, MOTOROLA

other words: Smartphone, SMARTPHONE

In [15]:
# generic word
generics = ['GenÃ©rico']

# replace irrelevant words by company names: Samsung
mobile_df["Company"] = mobile_df["Company"].str.replace("Galaxy", "Samsung")
mobile_df["Company"] = mobile_df["Company"].apply(lambda text: text + 'sung' if text == 'Sam' else text)
mobile_df["Company"] = mobile_df["Company"].str.replace("Telecom", "Samsung")
mobile_df["Company"] = mobile_df["Company"].str.replace(generics[0], "Samsung")

In [16]:
# replace irrelevant words by company names: Apple
apple = ["apple", "iPhone"]
mobile_df["Company"] = mobile_df["Company"].str.replace(apple[0], "Apple")
mobile_df["Company"] = mobile_df["Company"].str.replace(apple[1], "Apple")

In [17]:
# replace irrelevant words by company names: Xiaomi
xiaomi = ["Smartphone","Ati",'Xia']
mobile_df["Company"] = mobile_df["Company"].str.replace(xiaomi[0], "Xiaomi")
mobile_df["Company"] = mobile_df["Company"].str.replace(xiaomi[1], "Xiaomi")
mobile_df["Company"] = mobile_df["Company"].apply(lambda text: text + "omi" if text == xiaomi[2] else text)

In [18]:
# replace irrelevant words by company names: Nokia, Microsoft, Poco, Motorola
mobile_df["Company"] = mobile_df["Company"].apply(lambda text: text + 'rola' if text == 'Moto' else text)
mobile_df["Company"] = mobile_df["Company"].str.replace("Free", "Microsoft")
mobile_df["Company"] = mobile_df["Company"].str.replace("Unlocked", "Nokia")
mobile_df["Company"] = mobile_df["Company"].str.replace("M4", "Poco")

In [19]:
# replace irrelevent words by company names: OnePlus
mobile_df["Company"] = mobile_df["Company"].str.replace("Oneplus", "OnePlus")

In [20]:
mobile_df["Company"].value_counts().tail()
mobile_df["Company"].value_counts()

Company
Samsung      345
Xiaomi       167
Apple        105
Nokia        103
Motorola      30
OnePlus       26
Redmi         13
Poco          10
Microsoft      1
Name: count, dtype: int64

In [21]:
mobile_df["Company"].unique().shape

(9,)

+ Company names: Samsung, Xiaomi, Apple, Nokia, Motorola, OnePlus, Redmi, Poco, Microsoft
+ Size: 9 names

In [22]:
company_unknown = mobile_df[mobile_df["Company"] == 'Oneplus']
company_unknown

Unnamed: 0,Image_file,Labels,Model,Company


In [23]:
mobile_df[["Image_file","Labels","Company"]]

Unnamed: 0,Image_file,Labels,Company
0,image_000.jpg,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,Samsung
1,image_001.jpg,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,Samsung
2,image_002.jpg,Motorola Moto G54 256GB Blue 5G Android Smartp...,Motorola
3,image_003.jpg,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,Xiaomi
4,image_004.jpg,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Samsung
...,...,...,...
795,image_795.jpg,Nokia 6303 Classic Steel 6303 (without Simlock...,Nokia
796,image_796.jpg,"GenÃ©rico Galaxy S23 Ultra, 512GB Android Smar...",Samsung
797,image_797.jpg,Nokia 7230 Mobile Phone grey\n,Nokia
798,image_798.jpg,"Xiaomi Redmi 10 (2022) - Smartphone 64GB, 4GB ...",Xiaomi


## Data Preprocessing: Model Data

Preprocessing model data from company
+ Samsung
+ Xiaomi 
+ Apple
+ Nokia
+ Motorola
+ OnePlus
+ Redmi 
+ Poco 
+ Microsoft

In [24]:
mobile_df.head()

Unnamed: 0,Image_file,Labels,Model,Company
0,image_000.jpg,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,SM-A156B Galaxy,Samsung
1,image_001.jpg,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,SM-A556B Galaxy,Samsung
2,image_002.jpg,Motorola Moto G54 256GB Blue 5G Android Smartp...,Moto G54,Motorola
3,image_003.jpg,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,14 Ultra,Xiaomi
4,image_004.jpg,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Galaxy S20,Samsung


In [25]:
# identify the model data 
# mobile_df["Model"].value_counts()
mobile_df["Company"].value_counts()

Company
Samsung      345
Xiaomi       167
Apple        105
Nokia        103
Motorola      30
OnePlus       26
Redmi         13
Poco          10
Microsoft      1
Name: count, dtype: int64

In [26]:
# create constants for preprocessing model
company_names = mobile_df["Company"].unique().tolist()
company_names = sorted(company_names)

# create company name constants using for preprocessing
apple = mobile_df["Company"] == company_names[0]
microsoft = mobile_df["Company"] == company_names[1]
motorola = mobile_df["Company"] == company_names[2]
nokia = mobile_df["Company"] == company_names[3]
oneplus = mobile_df["Company"] == company_names[4]
poco = mobile_df["Company"] == company_names[5]
redmi = mobile_df["Company"] == company_names[6]
samsung = mobile_df["Company"] == company_names[7]
xiaomi = mobile_df["Company"] == company_names[8]

Preprocessing Model Data: Samsung

Brand: Samsung
Model: Galaxy, A(Number), 

Version of Samsung Galaxy
+	Samsung Galaxy S (Super Smart) series
+ Samsung Galaxy Z (Zen) series
+ Samsung Galaxy A (Alpha) series
+ Samsung Galaxy M (Millennial) series
+ Samsung Galaxy F (Fun/Flipkart) series
+ Samsung Galaxy XCover series

In [27]:
samsung_df = mobile_df[samsung].drop(columns=["Image_file"], axis=1)
samsung_df[samsung_df["Labels"].str.contains("Galaxy S")]

Unnamed: 0,Labels,Model,Company
4,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Galaxy S20,Samsung
5,Samsung Galaxy S22+ 5G 8GB/128GB Black (Phanto...,Galaxy S22+,Samsung
7,"Samsung Smartphone Galaxy S20, 5G, 12GB + 128G...",Smartphone Galaxy,Samsung
17,Samsung Galaxy S20 FE 5G 128GB/6GB G781 Dual S...,Galaxy S20,Samsung
19,"SAMSUNG Galaxy S20 FE 5G - Smartphone 128GB, 6...",Galaxy S20,Samsung
...,...,...,...
785,Samsung Galaxy S9 Smartphone (5.8 inches (14.8...,Galaxy S9,Samsung
789,Samsung Galaxy S7 Unlocked Smartphone 4G (Scre...,Galaxy S7,Samsung
791,Samsung Galaxy S9 64 GB (Single SIM) - Black -...,Galaxy S9,Samsung
793,Samsung Galaxy S10+ Smartphone (16.2 cm (6.4 I...,Galaxy S10+,Samsung


Remark: preprocessing the model that would require more time to accomplish. Therefore, we will focus on training the model with the brand labels from the company data

## Data Storage 

In [28]:
# save this dataset as csv file
filename = "image_dataset.csv"
mobile_df.to_csv(filename)

In [29]:
df = pd.read_csv(filename, index_col=0)
df.head()

Unnamed: 0,Image_file,Labels,Model,Company
0,image_000.jpg,Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RA...,SM-A156B Galaxy,Samsung
1,image_001.jpg,Samsung SM-A556B Galaxy A55 5G Dual SIM 8GB 12...,SM-A556B Galaxy,Samsung
2,image_002.jpg,Motorola Moto G54 256GB Blue 5G Android Smartp...,Moto G54,Motorola
3,image_003.jpg,Xiaomi 14 Ultra 5G 16GB/512GB White (White) Du...,14 Ultra,Xiaomi
4,image_004.jpg,Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lav...,Galaxy S20,Samsung


In [31]:
for image, label in mobile_df[["Image_file", "Company"]].values:
  print(f"{image} -- {label}")

image_000.jpg -- Samsung
image_001.jpg -- Samsung
image_002.jpg -- Motorola
image_003.jpg -- Xiaomi
image_004.jpg -- Samsung
image_005.jpg -- Samsung
image_006.jpg -- Samsung
image_007.jpg -- Samsung
image_008.jpg -- Samsung
image_009.jpg -- Samsung
image_010.jpg -- Samsung
image_011.jpg -- Samsung
image_012.jpg -- Xiaomi
image_013.jpg -- Xiaomi
image_014.jpg -- Samsung
image_015.jpg -- Samsung
image_016.jpg -- Samsung
image_017.jpg -- Samsung
image_018.jpg -- Nokia
image_019.jpg -- Samsung
image_020.jpg -- Samsung
image_021.jpg -- Xiaomi
image_022.jpg -- Samsung
image_023.jpg -- Samsung
image_024.jpg -- Samsung
image_025.jpg -- Samsung
image_026.jpg -- Xiaomi
image_027.jpg -- Samsung
image_028.jpg -- Motorola
image_029.jpg -- Samsung
image_030.jpg -- Samsung
image_031.jpg -- Motorola
image_032.jpg -- Motorola
image_033.jpg -- Samsung
image_034.jpg -- Xiaomi
image_035.jpg -- Samsung
image_036.jpg -- Nokia
image_037.jpg -- Samsung
image_038.jpg -- Xiaomi
image_039.jpg -- Samsung
image_0

In [None]:
# Load the dataset
image_df = pd.read_csv("image_dataset.csv")
image_labels = image_df[["Image_file", "Company"]].values  # Get image-label pairs

# Shuffle the dataset for randomness
random.shuffle(image_labels)

# Split ratios
train_ratio = 0.8
test_ratio = 0.2  # Optionally include validation from test later
total_images = len(image_labels)

train_size = int(total_images * train_ratio)
test_size = int(total_images * test_ratio)
# test_size = total_images - train_size

# Split the dataset into train and test
train_data = image_labels[:train_size]
test_data = image_labels[train_size:]

In [37]:
train_data.shape
test_data

array([['image_382.jpg', 'Samsung'],
       ['image_400.jpg', 'Nokia'],
       ['image_197.jpg', 'Samsung'],
       ['image_430.jpg', 'Xiaomi'],
       ['image_093.jpg', 'Xiaomi'],
       ['image_608.jpg', 'Apple'],
       ['image_457.jpg', 'Xiaomi'],
       ['image_488.jpg', 'Samsung'],
       ['image_566.jpg', 'Apple'],
       ['image_170.jpg', 'Samsung'],
       ['image_213.jpg', 'Nokia'],
       ['image_138.jpg', 'Samsung'],
       ['image_615.jpg', 'Xiaomi'],
       ['image_499.jpg', 'Motorola'],
       ['image_142.jpg', 'Samsung'],
       ['image_530.jpg', 'Samsung'],
       ['image_010.jpg', 'Samsung'],
       ['image_271.jpg', 'Apple'],
       ['image_219.jpg', 'Samsung'],
       ['image_383.jpg', 'Apple'],
       ['image_273.jpg', 'Apple'],
       ['image_158.jpg', 'Poco'],
       ['image_331.jpg', 'Nokia'],
       ['image_254.jpg', 'Samsung'],
       ['image_644.jpg', 'Xiaomi'],
       ['image_159.jpg', 'Samsung'],
       ['image_341.jpg', 'Nokia'],
       ['image_521.jpg', '