# Task 0 - Load dataset 

In [None]:
import numpy as np
# Use numpy array as a read-only table matrix
# Use lists to do processing (find, sort..)
# Load the attribute table; every CSV cell is kept as a raw byte string.
# np.bytes_ is the same scalar type as the old np.string_ alias, which was
# removed in NumPy 2.0.
# NOTE(review): on Python 3 these values are bytes while os.listdir() returns
# str, so comparisons against image ids would need decoding - confirm the
# target interpreter.
location = 'attributes.csv'
tmp = np.genfromtxt(location, delimiter=',', dtype=np.bytes_)  # load csv
attributes = tmp[1:, :]  # data rows (header row removed)
attributes_headers = tmp[0]  # header row
# Plain Python lists of the first two columns, used for find/index operations.
attributesIds = attributes[:, 0].tolist()
attributesNames = attributes[:, 1].tolist()
print(attributes.shape)
print(attributes_headers)
print(attributes[545])  # one sample row as a sanity check

In [None]:
# Load the product table as raw byte strings (np.bytes_ replaces the
# np.string_ alias removed in NumPy 2.0).
# Two options were needed for the import to work:
#   comments=None  - do not treat '#' inside a field as the start of a comment
#   usecols=0..11  - read only the first 12 columns of every row
location = 'products.csv'
tmp = np.genfromtxt(
    location,
    delimiter=',',
    dtype=np.bytes_,
    comments=None,
    usecols=np.arange(0, 12),
)
products = tmp[1:, :]  # data rows (header row removed)
products_headers = tmp[0]  # header row
productsIds = products[:, 0].tolist()  # first column as a plain list for lookups
print(products.shape)
print(products_headers)
print(products[545])  # one sample row as a sanity check

In [None]:
import os
# os.listdir() returns entries in arbitrary order; sorting makes every later
# "first image for a product" / "first 10 images" selection reproducible.
imageFullNames = sorted(os.listdir('images'))
# The ProductId is the part of the file name before the first underscore.
imageIds = [a.split("_", 1)[0] for a in imageFullNames]
print(imageFullNames[56])
print(imageIds[56])
print(len(imageFullNames))
print(len(imageIds))

# Task 1 - Get comfortable with manipulating the data

In [None]:
from collections import Counter
# How often can the same ProductId repeat in each source?
# Counter.values() works on both Python 2 and 3 (itervalues() is Python 2 only).
print(any(count > 3 for count in Counter(attributesIds).values()))  # some product with more than 3 attribute rows?
print(any(count > 4 for count in Counter(attributesIds).values()))  # ... with more than 4?
print(any(count > 1 for count in Counter(productsIds).values()))  # duplicated product rows?
print(any(count > 1 for count in Counter(imageIds).values()))  # several images for one product?

In [None]:
from IPython.display import Image
# Functions to manipulate the data - Better to pre-store everything with a dictionary - Complexity/Computation tradeoff (to do later)
# Get image file name from a ProductId ->  'images/' + imageFullNames[ imageIds.index( myProductId ) ] 
# Get attributes from a ProductId -> attributes[ attributesIds.index( myProductId ) ] 
# Get description from a ProductId - > products[ productsIds.index( myProductId ) ] 
# Worked example: pick one ProductId and show its attribute row, its product
# row and its image, each located by a linear search in the matching id list.
myProductId = attributesIds[587]

attributeRow = attributesIds.index(myProductId)
print(attributes[attributeRow])

productRow = productsIds.index(myProductId)
print(products[productRow])

imageSlot = imageIds.index(myProductId)
imagePath = 'images/' + imageFullNames[imageSlot]
Image(imagePath)

# Task 2 - Generate attributes for products missing some (supervised learning)

# A - Build the train, test, application datasets

In [None]:
# Distinct attribute names and the number of rows carrying each one.
# (Original note: change to 2 to get all sub-attributes.)
attributesNamesUnique, counts = np.unique(
    attributesNames,
    return_counts=True,
)
# Displayed as the cell result: attribute name -> row count.
{name: count for name, count in zip(attributesNamesUnique, counts)}

In [None]:
#Find products with an image + attribute -> build dataset from them
attributesIdsUnique = list(set(attributesIds)) # (7362) List of ProductIds with at least one attribute and no ProductIds repetition
#attributesIdsUniqueWithImage = [x for x in attributesIdsUnique if x in imageIds] # (7251) #check available image (not for 111)
# Test membership against a set instead of scanning the imageIds list for every row.
imageIdSet = set(imageIds)
attributesIdsWithImage = [x for x in attributesIds if x in imageIdSet] # (13271) one entry per attribute row whose product has an image

In [None]:
#Find products with an image + no attribute + add their filename -> apply the algorithm on those products
# Sets / dict built once so each product costs a constant-time lookup instead
# of a scan through the whole id list.
idsWithAttribute = set(attributesIdsUnique)
# First image file name seen for each ProductId - the same file that
# imageFullNames[imageIds.index(id)] returns.
firstImageById = {}
for fileName, imageId in zip(imageFullNames, imageIds):
    firstImageById.setdefault(imageId, fileName)

productsIdsNoAttribute = [x for x in productsIds if x not in idsWithAttribute] #(5269)
productsIdsNoAttributeWithImage = [x for x in productsIdsNoAttribute if x in firstImageById] #(5185)
productsIdsNoAttributeWithImageAddress = [firstImageById[x] for x in productsIdsNoAttributeWithImage]

In [None]:
# Need to build the dataset -> For each of the 24 attributes, list all ProductIds associated & possessing an image

from collections import defaultdict

# Group the attribute rows by attribute name, keeping only products that have an image.
dico = defaultdict(list) # dico: key is an attribute, value is a list of ProductId with this attribute & an image
dicoAddress = defaultdict(list) #same but with image file names instead of just ProductId

# First image file name seen for each ProductId. One dict lookup replaces the
# per-row list scans (`in` + `.index`) and yields the same file that
# imageFullNames[imageIds.index(id)] would.
firstImageById = {}
for fileName, imageId in zip(imageFullNames, imageIds):
    firstImageById.setdefault(imageId, fileName)

for (index, myProductId) in enumerate(attributesIds):
    filename = firstImageById.get(myProductId)
    if filename is None:  # product has no image: leave it out
        continue
    key = attributesNames[index]
    dico[key].append(myProductId)
    dicoAddress[key].append(filename)

#ctr = sum(map(len, dico.values())); print(ctr) #used to count number of items in dico (13271)
#length_dico = {key: len(value) for key, value in dico.items()}; print(length_dico) #create another dictionary containing length of lists of values

dicoSmall = defaultdict(list) # Unbalanced training issue and high training time so take only first 10 images of each 24 classes.
for key in dicoAddress:
    dicoSmall[key] = dicoAddress[key][:10]

In [None]:
# 7:3 train/test split of the (at most 10) image file names kept per attribute.
dicoSmallTrain = defaultdict(list)
dicoSmallTest = defaultdict(list)
for attributeName, fileNames in dicoSmall.items():
    dicoSmallTrain[attributeName] = fileNames[:7]   # first seven -> training
    dicoSmallTest[attributeName] = fileNames[7:10]  # next three -> test
#print(dicoSmall['Denim Fit']);print(dicoSmallTrain['Denim Fit']);print(dicoSmallTest['Denim Fit'])

In [None]:
# Build the labelled dataset for the 'Sleeve Length' attribute.

# First image file name seen for each ProductId; replaces the repeated linear
# scans of imageIds (`in` / `.index`) with constant-time lookups and returns
# the same file as imageFullNames[imageIds.index(id)].
firstImageById = {}
for fileName, imageId in zip(imageFullNames, imageIds):
    firstImageById.setdefault(imageId, fileName)

indices = [index for (index, x) in enumerate(attributesNames) if x == 'Sleeve Length'] # (3056) Get indices of products with Sleeve length attribute
indices = [index for index in indices if attributesIds[index] in firstImageById] # (3009) Remove indices (products) with no image available
a = attributes[indices,0] #ProductIds
b = attributes[indices,2] #sub-attributes
c = np.asarray( [ firstImageById[ attributesIds[index] ] for index in indices ] ) #image filenames
dataset = np.column_stack((a,b, c)) #Create a numpy array with ProductId | sub-attribute (label) | image filename
classesList = np.unique(dataset[:,1]) # (7) number of sub-attributes
print(classesList)

In [None]:
import os

def _classFolderName(label):
    """Return *label* as a single folder name.

    A '/' inside a label (e.g. '3/4 sleeves') would otherwise be read as a
    path separator and create nested folders, so it is replaced by '_'.
    """
    return label.replace('/', '_')


# One folder per sub-attribute under both the train and the validation root.
for subattribute in classesList:
    for root in ('data/train/', 'data/validation/'):
        directory = root + _classFolderName(subattribute)
        if not os.path.exists(directory):
            os.makedirs(directory)

#problem with 3/4 sleeves, tell the engineering team -> changed to 3_4

from shutil import copyfile
# Copy every dataset image into the train folder of its label (skip files already there).
# NOTE(review): nothing is copied into data/validation here, so those folders stay empty.
for x in dataset:
    src = 'images/' + x[2]
    dst = 'data/train/' + _classFolderName(x[1]) + '/' + x[2]
    if not os.path.exists(dst):
        copyfile(src, dst)

# B - Run the algorithm on the datasets

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

# Convolutional feature extractor: three Conv(3x3) -> ReLU -> MaxPool(2x2)
# stages with 32, 32 and 64 filters. Only the first stage declares the input
# shape (340 x 255 images, 3 channels).
model = Sequential()
for stage, n_filters in enumerate((32, 32, 64)):
    first_layer_kwargs = {'input_shape': (340, 255, 3)} if stage == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), **first_layer_kwargs))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# the model so far outputs 3D feature maps (height, width, features)

In [None]:
# Classifier head. The labels are the sub-attributes in classesList (several
# classes), so the network ends in one softmax output per class instead of a
# single sigmoid unit, which can only separate two classes.
num_classes = len(classesList)

model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# sparse_categorical_crossentropy expects one integer class index per sample.
# NOTE(review): confirm the data generators emit integer class indices.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
#Data pre-processing, augmentation, dimensionality reduction

# ImageDataGenerator was used without ever being imported.
from keras.preprocessing.image import ImageDataGenerator

batch_size = 16
target_size = (340, 255) #can reduce dimension here
color_mode = 'rgb' #can reduce dimension here
# One integer class index per image; 'sparse' rather than 'binary' because
# data/train holds one folder per sub-attribute (more than two classes).
class_mode = 'sparse'

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

# this is the augmentation configuration we will use for testing: only rescaling
test_datagen = ImageDataGenerator(rescale=1./255)

# this is a generator that will read pictures found in
# subfolders of 'data/train', and indefinitely generate
# batches of augmented image data
train_generator = train_datagen.flow_from_directory(
        'data/train',  # this is the target directory
        target_size=target_size,  # all images will be resized to target_size
        batch_size=batch_size,
        class_mode=class_mode,
        color_mode=color_mode)

# this is a similar generator, for validation data
validation_generator = test_datagen.flow_from_directory(
        'data/validation',
        target_size=target_size,
        batch_size=batch_size,
        class_mode=class_mode,
        color_mode=color_mode)

In [None]:
# TODO: make the above work: put the datasets in the right directories (train and validation), extend the model to the 7 classes, and copy in the rest of the full code available on GitHub