## Data Preparation
To begin this exploratory analysis, first import the required libraries (including `matplotlib`) and define helper functions for loading and plotting the data.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import os
import shutil
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input, decode_predictions
# from keras.applications.resnet import ResNet152
# from keras.applications.resnet import preprocess_input, decode_predictions
# from keras.applications.nasnet import NASNetLarge
# from keras.applications.nasnet import preprocess_input, decode_predictions
# from keras.applications.inception_resnet_v2 import InceptionResNetV2
# from keras.applications.inception_resnet_v2 import preprocess_input, decode_predictions
# from keras.applications.xception import Xception
# from keras.applications.xception import preprocess_input, decode_predictions
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from collections import Counter
from keras.layers import Conv2D, GlobalAveragePooling2D, MaxPooling2D, Flatten, Dense, Dropout, InputLayer
from keras.models import Sequential
from keras import optimizers
from sklearn.preprocessing import LabelEncoder 
from keras.utils import to_categorical
import math

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    For every column whose dtype is one of the listed int/float dtypes, the
    column's min and max are compared against the representable range of each
    smaller dtype and the column is cast to the first one that fits.

    Only the value *range* is checked. float16/float32 keep fewer significant
    digits than float64, so downcast float values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the memory usage after optimisation and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme equals the dtype limit
            # (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail every comparison and fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard against a zero-sized frame to avoid dividing by zero.
        saved = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Decreased by {:.1f}%'.format(saved))

    return df

In [None]:
# Root folder of the dataset; other cells build the CSV and image paths from it
# by plain string concatenation, so the trailing slash matters.
DATASET_PATH = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/fashion-dataset/"
# List the folder contents as a quick check that the dataset is mounted.
print(os.listdir(DATASET_PATH))

In [None]:
# Load the product metadata; malformed CSV rows are skipped with a warning.
# Tip: pass nrows=20000 to read_csv for a quicker experimental run.
styles_csv = DATASET_PATH + "styles.csv"
try:
    # pandas >= 1.3 spelling of "skip bad lines but warn about them".
    df = pd.read_csv(styles_csv, on_bad_lines="warn")
except TypeError:
    # Older pandas does not know on_bad_lines and only accepts error_bad_lines.
    df = pd.read_csv(styles_csv, error_bad_lines=False)

# Each image file name is the product id followed by ".jpg".
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
df = reduce_mem_usage(df)
df.head(10)

In [None]:
import cv2
def plot_figures(figures, nrows = 1, ncols=1,figsize=(5, 5)):
    """Plot a dictionary of figures on a grid of subplots.

    Parameters
    ----------
    figures : <title, figure> dictionary
        Each figure is an image array passed through
        ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`` before display.
    nrows : number of rows of subplots wanted in the figure
    ncols : number of columns of subplots wanted in the display
    figsize : size of the whole figure, forwarded to ``plt.subplots``

    Raises
    ------
    IndexError
        If ``figures`` holds more entries than ``nrows * ncols``.
    """
    # squeeze=False makes subplots always return a 2-D array of Axes, so the
    # 1x1 case (a single Axes object otherwise) can be flattened like the rest.
    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize, squeeze=False)
    axes = axeslist.ravel()
    for ind, title in enumerate(figures):
        axes[ind].imshow(cv2.cvtColor(figures[title], cv2.COLOR_BGR2RGB))
        axes[ind].set_title(title)
        axes[ind].set_axis_off()
    # Hide the leftover grid cells so they do not show empty axis frames.
    for unused in axes[len(figures):]:
        unused.set_axis_off()
    plt.tight_layout() # optional
    
def img_path(img):
    """Return the path of image file ``img`` under the dataset's ``images/`` folder."""
    return "".join((DATASET_PATH, "images/", img))

def load_image(img, resized_fac = 1):
    """Read an image of the dataset with OpenCV and optionally rescale it.

    Parameters
    ----------
    img : image file name, resolved through ``img_path``.
    resized_fac : scale factor applied to both width and height.

    Returns
    -------
    The resized image array as returned by ``cv2.resize``.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file (``cv2.imread`` returned ``None``).
    """
    path = img_path(img)
    pixels = cv2.imread(path)
    if pixels is None:
        # imread signals a missing/unreadable file with None instead of raising.
        raise FileNotFoundError("Could not read image: " + path)
    # Array shape is (rows, cols, ...) = (height, width, ...); cv2.resize
    # expects the target size as (width, height).
    height, width = pixels.shape[:2]
    new_size = (int(width * resized_fac), int(height * resized_fac))
    return cv2.resize(pixels, new_size, interpolation = cv2.INTER_AREA)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Build a {title: image} dictionary from six randomly sampled products,
# titling each image "im<index label>".
figures = {
    'im{}'.format(row.Index): load_image(row.image)
    for row in df.sample(6).itertuples()
}
# Show the sampled images on a grid of 2 rows and 3 columns.
plot_figures(figures, 2, 3)

The dataset is made up of different items that can be found in a marketplace. The idea is to use image embeddings to measure similarity and find similar items using only the image.

## Use a Pre-Trained Model for Recommendation

In [None]:
import tensorflow as tf
import keras
from keras import Model
from keras.preprocessing import image
from keras.layers import GlobalMaxPooling2D
# Display the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
# Detect and initialise the TPU: resolve the cluster, connect to it, then
# initialise the TPU system.
# NOTE(review): presumably this raises when no TPU is attached to the session — TODO confirm.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Instantiate a distribution strategy for the TPU.
# NOTE(review): `tpu_strategy` is not referenced again in the visible part of this
# notebook; the model below is created outside any `tpu_strategy.scope()` block.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Input shape, hard-coded instead of read from a sample image.
# NOTE(review): Keras documents `input_shape` as (height, width, channels), so the
# value named `img_width` is used as the first (height) axis below — confirm the
# intended orientation against the dataset's image dimensions.
img_width, img_height, _ = 450, 600, 3 #load_image(df.iloc[0].image).shape

# Pre-trained InceptionV3 backbone with ImageNet weights and without the
# classification head; frozen so its weights are not trainable.
base_model = InceptionV3(weights='imagenet', 
                      include_top=False, 
                      input_shape = (img_width, img_height, 3))
base_model.trainable = False

# Embedding model: the backbone followed by global max pooling over the
# spatial axes.
model = keras.Sequential([
    base_model,
    GlobalMaxPooling2D()
])

model.summary()

In [None]:
# Running count of get_embedding() calls, used only for progress reporting.
it=0
def get_embedding(model, img_name, log_every=1000):
    """Return the flattened embedding of one dataset image.

    The image is loaded with ``target_size=(img_width, img_height)``, turned
    into an array, given a leading batch axis, passed through
    ``preprocess_input`` and fed to ``model.predict``.

    Parameters
    ----------
    model : model whose ``predict`` output is used as the embedding.
    img_name : image file name, resolved through ``img_path``.
    log_every : print the global call counter ``it`` every ``log_every`` calls;
        ``0`` or ``None`` disables progress output. Printing on every call
        floods the cell output when embedding the whole dataset.

    Returns
    -------
    1-D array: the prediction for the single image, reshaped with ``reshape(-1)``.
    """
    global it
    it=it+1
    # Load the image at the size the model was built with.
    img = image.load_img(img_path(img_name), target_size=(img_width, img_height))
    # Image to array
    x   = image.img_to_array(img)
    # Add the leading batch axis expected by the model
    x   = np.expand_dims(x, axis=0)
    # Apply the preprocessing that matches the pre-trained network
    x   = preprocess_input(x)
    if log_every and it % log_every == 0:
        print(it)
    # verbose=0 keeps Keras from printing a progress bar for every single image.
    return model.predict(x, verbose=0).reshape(-1)


Get the embedding of a single item

In [None]:
# Sanity check: embed the first product image and inspect the vector's shape.
emb = get_embedding(model, df['image'].iloc[0])
emb.shape

In [None]:
# Show the item that `emb` was computed from (row 0), so the picture and the
# printed embedding describe the same product.
img_array = load_image(df.iloc[0].image)
plt.imshow(cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB))
print(img_array.shape)
print(emb)

In [None]:
# (rows, columns) of the metadata frame.
df.shape

Get the embeddings for all items in the dataset

In [None]:
%%time
#import swifter

# Parallel apply
df_sample      = df#.sample(10)
map_embeddings = df_sample['image'].apply(lambda img: get_embedding(model, img))
df_embs        = map_embeddings.apply(pd.Series)

print(df_embs.shape)
df_embs.head()
# reduce_mem_usage(df_embs)

## Compute Similarity Between Items

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
from sklearn.metrics.pairwise import pairwise_distances


# Compute the distance matrix
# reduce_mem_usage(df_embs)
# cosine_sim = 1-pairwise_distances(df_embs, metric='cosine')
# cosine_sim[:4, :4]
# reduce_mem_usage(cosine_sim )

#### Recommend Similar Items

In [None]:
# Lookup table: index label of df_embs -> positional row number (0 .. len-1).
indices = pd.Series(range(len(df_embs)), index=df_embs.index)

In [None]:
from scipy.spatial.distance import cosine
# Recommend visually similar products from the cosine similarity of their image embeddings.
def get_recommender(idx, df, options, top_n = 5):
    """Plot and return the items most similar to item ``idx``.

    Similarity is the cosine similarity between rows of the module-level
    ``df_embs`` embedding table; the module-level ``indices`` series maps an
    index label to its row position in that table.

    Parameters
    ----------
    idx : index label of the reference item (looked up in ``indices`` and ``df``).
    df : metadata frame providing the ``image`` column and the filter columns.
    options : dict mapping a column name of ``df`` to the value a recommended
        item must have in that column; an empty dict disables filtering.
    top_n : number of recommendations to collect before stopping.

    Returns
    -------
    (idx_rec, idx_sim) : the index labels of the recommended items, best match
        first, and the list of their similarity scores. Both are empty when
        no item passes the filters.
    """
    sim_idx = indices[idx]
    labels = df_embs.index

    # Index labels allowed by the filters (a set, for O(1) membership tests).
    if len(options) > 0:
        allowed = None
        for column in options:
            matches = set(df.index[df[column] == options[column]].tolist())
            allowed = matches if allowed is None else allowed & matches
    else:
        allowed = set(indices.index)

    # Cosine similarity of the reference row against every row, vectorised.
    emb_matrix = df_embs.values
    query = emb_matrix[sim_idx]
    norms = np.sqrt(np.einsum('ij,ij->i', emb_matrix, emb_matrix))
    denom = norms * norms[sim_idx]
    with np.errstate(divide='ignore', invalid='ignore'):
        cosine_sim = emb_matrix.dot(query) / denom
    # All-zero embeddings have no defined cosine similarity; rank them last.
    cosine_sim = np.where(denom > 0, cosine_sim, -np.inf)

    # Best first; mergesort is stable, so ties keep their original row order.
    order = np.argsort(-cosine_sim, kind='mergesort')

    rec_positions = []
    idx_sim = []
    for pos in order:
        # Skip the reference item itself explicitly instead of assuming it is
        # always the first hit (an identical image could tie with it).
        if pos == sim_idx:
            continue
        # Filters are expressed as index labels, so compare labels with labels.
        if labels[pos] in allowed:
            rec_positions.append(int(pos))
            idx_sim.append(float(cosine_sim[pos]))
            if len(rec_positions) >= top_n:
                break

    idx_rec = indices.iloc[rec_positions].index
    # `idx` is an index label (see the `indices[idx]` lookup), so use .loc here too.
    plt.imshow(cv2.cvtColor(load_image(df.loc[idx].image), cv2.COLOR_BGR2RGB))

    if len(idx_rec) <= 0:
        print('No matching item found for', options)
        return idx_rec, idx_sim

    # Lay the recommendations out on a grid at most three columns wide.
    plot_row = math.ceil(len(idx_sim)/3)
    plot_col = min(3, len(idx_sim))
    # {title: image} dictionary of the recommended items.
    figures = {'im'+str(i): load_image(row.image) for i, row in df.loc[idx_rec].iterrows()}
    if len(figures):
        plot_figures(figures, plot_row, plot_col)

    return idx_rec, idx_sim

# Example: six unfiltered recommendations for the item with index 2992.
get_recommender(2992, df, {}, top_n = 6)

In [None]:
# Boolean mask of the rows whose baseColour is 'Green'.
# NOTE(review): not referenced again in the visible part of this notebook.
base_color_filter = df['baseColour']=='Green'

In [None]:
# Number of rows in the embedding table.
len(df_embs)

In [None]:
# Inspect the `gender` value of the row at position 19572.
df.iloc[19572].gender 

In [None]:
# Index of the reference item to get recommendations for.
idx_ref = 7831

# Unfiltered recommendations for the reference item.
get_recommender(idx_ref, df, { }, top_n = 6)
# Alternative calls that restrict the results to the reference item's own
# gender or article type:
# get_recommender(idx_ref, df, { "gender":df.iloc[idx_ref].gender }, top_n = 6)
# get_recommender(idx_ref, df, { "articleType":df.iloc[idx_ref].articleType }, top_n = 6)

In [None]:
# Show the row(s) whose index label equals 22.
df[df.index == 22]

In [None]:
# Combine both conditions into a single boolean mask. Chaining two [] filters
# applies a mask built on the full frame to the already-filtered frame, which
# pandas has to reindex (with a warning).
df[(df.subCategory == 'Shoe') & (df.baseColour == 'Black')]

**Search for items similar to the reference to recommend. Apparently it's working!**

In [None]:
idx_ref = 8921

# Recommendations: get_recommender() itself draws the reference item and its
# top matches, so no unpacking or extra plotting code is needed here.
get_recommender(idx_ref, df, {}, top_n = 6)

In [None]:
idx_ref = 987

# Recommendations: get_recommender() itself draws the reference item and its
# top matches, so no unpacking or extra plotting code is needed here.
get_recommender(idx_ref, df, {}, top_n = 6)

In [None]:
idx_ref = 3524

# Recommendations: get_recommender() itself draws the reference item and its
# top matches, so no unpacking or extra plotting code is needed here.
get_recommender(idx_ref, df, {}, top_n = 6)