# Certificate Project - 1 : [Fashion Images Classification](https://www.kaggle.com/microdegree/project-dataset-fashion-images)

# Replace all '????' with correct function name/values.

# Import libraries:

In [1]:
import os
import glob
import numpy as np

from PIL import Image,ImageOps
import matplotlib.pyplot as plt
%matplotlib inline

# List out directories

In [2]:
base_dir = '../input/project-dataset-fashion-images/Fashion_Images_Trimmed/fashion_dataset_2k'
directory = os.listdir(base_dir)
directory

['test', 'train']

# Prepare training & testing directory paths

In [3]:
train_dir = base_dir + '/train/'
test_dir = base_dir + '/test/'

# Prepare CLASSES array

In [4]:
# Hint: 
#   You are free to use api available in 'os' library, which returns array of directories inside given path.
#   Or, you can manually prepare the array, as CLASSES = ['folder name1', 'folder name 2' ...]
CLASSES = os.listdir(train_dir)
CLASSES

['Apparel', 'Footwear', 'Personal Care', 'Accessories']

# Total no. of images per class in training dataset

In [5]:
# TRAINING DIR: For each class, lets see how many images are there..
for imgType in CLASSES:
    imgTypePath = train_dir + "/" + imgType + "/"
    print("CLASS: " + imgType + ", Total images: " + str(len(os.listdir(imgTypePath)))) 

CLASS: Apparel, Total images: 2000
CLASS: Footwear, Total images: 2000
CLASS: Personal Care, Total images: 2000
CLASS: Accessories, Total images: 2000


# Total no. of images per class in testing dataset

In [6]:
# TESTING DIR: For each class, lets see how many images are there..
for imgType in CLASSES:
    imgTypePath = test_dir + "/" + imgType + "/"
    print("CLASS: " + imgType + ", Total images: " + str(len(os.listdir(imgTypePath))))

CLASS: Apparel, Total images: 400
CLASS: Footwear, Total images: 400
CLASS: Personal Care, Total images: 400
CLASS: Accessories, Total images: 400


## Pipeline helper functions

In [7]:
def convert_to_grayscale(img):
    #Convert to grayscale
    return ImageOps.grayscale(img) #Return gray scale image object using ImageOps.

def reshape_img(img, target_size=(150,150)):
    #Reshape any image to a fixed shape
    return img.resize(target_size, Image.ANTIALIAS) #Hint: Use resize() API on image object and pass target size param.

def display_numpy_img(np_img, img_name="Transformed image"):
    plt.figure(figsize = (6,6))
    plt.imshow(np_img, cmap='gray')
    plt.title(img_name)
    
def transform_image(img_file_path):
    img_obj = Image.open(img_file_path)
    #print(img_obj.format)
    #print(np.array(img_obj).shape)
    #Perform transformations in series
    img_obj = convert_to_grayscale(img_obj)
    img_obj = reshape_img(img_obj, (150,150))
    np_arr_img = np.array(img_obj)
    return np_arr_img

def load_dir_to_numpy(dir_path, maxImgs=1500):
    file_list = glob.glob(dir_path+'/*')
    imgs = []
    #Load image by image
    imgCount=0
    for fname in file_list:
        if imgCount>=maxImgs:
            break
        img_np = transform_image(fname)
        imgs.append(img_np)
        imgCount = imgCount + 1
    np_imgs = np.array(imgs)
    return np_imgs

def prepare_image_data(dir_path, MAX_IMGS):
    imgs_arr_X = []
    data_arr_y = []
    classIdx = 0;
    for imgType in CLASSES:
        IMG_DIR = dir_path + "/" + imgType + "/"
        #  print("IMG_DIR: " + IMG_DIR)
        imgs_arr = load_dir_to_numpy(IMG_DIR, MAX_IMGS)
        #print(imgType + ": " + str(imgs_arr.shape))
        imgs_arr_X.extend(imgs_arr)
        data_y = np.full((imgs_arr.shape[0],1), classIdx)
        data_arr_y.extend(data_y)
        classIdx += 1
    np_img_arr_X = np.array(imgs_arr_X)
    np_data_arr_y = np.array(data_arr_y)
    return np_img_arr_X,np_data_arr_y

# Prepare training dataset

In [8]:
train_np_x,train_np_y = prepare_image_data(train_dir, 500)
print('train_np_x.shape:', train_np_x.shape)
print('train_np_y.shape:', train_np_y.shape)

train_np_x.shape: (2000, 150, 150)
train_np_y.shape: (2000, 1)


# Flatten out the 2D image data into 1D vector

In [9]:
# Flatten out the 2D image data into 1D vector
train_size = train_np_x.shape[0]
train_np_x = train_np_x.reshape((train_size, -1))
print('After reshaping, train_np_x.shape:', train_np_x.shape)

After reshaping, train_np_x.shape: (2000, 22500)


# Import & Prepare the model object

In [10]:
# Import a model
from sklearn.linear_model import SGDClassifier
model = SGDClassifier()

#Hint: Any algorithm say sklearn.linear_model.SGDClassifier OR sklearn.tree.DecisionTreeClassifier() etc..

# Train the model

In [11]:
model.fit(train_np_x,train_np_y.reshape(-1))

SGDClassifier()

# Prepare testing data

In [12]:
test_np_x,test_np_y = prepare_image_data(test_dir, 200)

test_size = test_np_x.shape[0]
test_np_x = test_np_x.reshape((test_size, -1))
print('Test shape:', test_np_x.shape)

Test shape: (800, 22500)


# Predict using testing data

In [13]:
# Get predicted values for 'test_np_x' using trained 'model' 
predicted_y = model.predict(test_np_x)

# What's the trained model accuracy on test data?

In [14]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
actual_y = test_np_y.reshape(-1)
print('Accuracy Score:', accuracy_score(actual_y, predicted_y))

Accuracy Score: 0.90875


# Model training performance report

In [15]:
print(classification_report(actual_y, predicted_y))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89       200
           1       0.99      0.94      0.96       200
           2       0.91      0.95      0.93       200
           3       0.92      0.79      0.85       200

    accuracy                           0.91       800
   macro avg       0.91      0.91      0.91       800
weighted avg       0.91      0.91      0.91       800



# Confusion matrix

In [16]:
print(confusion_matrix(actual_y, predicted_y))

[[192   0   4   4]
 [  2 187   4   7]
 [  7   0 191   2]
 [ 30   1  12 157]]
