# Featurize Images for Image Similarity Model

This notebook will do the following:

For each image in the provided Azure Blob container:
    - download the image
    - resize the image to the pre-defined img_width & img_height
    - featurize the image using the Keras pre-trained ResNet50 model (trained on ImageNet)
    - save the featurized images to a preprocessedimages.pkl file in the provided data directory
    - save a corresponding targets.pkl file with a table of the [name, url] for each image 


In [None]:
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from urllib.request import urlopen
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
from azure.storage.blob import BlockBlobService, PublicAccess

# Define Constants

In [None]:
# Used to turn the per-image print statements on/off.
debug = False

# --- Azure Blob Storage settings -------------------------------------------
# Root folder url to where the images are stored in Azure Blob Storage.
url_base = ''
# Azure storage account name.
blob_account_name = ''
# SAS token to access the blob.
blob_sas_token = ''
# Container where the images are located.
blob_container = ''
# Path to any sub-folders within the container.
blob_prefix = ''

# --- Processing settings ---------------------------------------------------
# Number of images to pre-process.
sample_length = 10
# Print an update every print_every iterations.
print_every = 100
# Input size the images will be re-sized to.
img_width = 512
img_height = 512

# --- Output locations ------------------------------------------------------
# Root folder filepath. All data from this notebook will be saved to this
# directory.
filename_root = ''
# Base name under which the np array of size (sample_length, img_length) is
# saved. These are the featurized versions of the images.
pre_processed_filename = '{}preprocessedimages'.format(filename_root)
# Helper table that tracks the name & URL for each row.
targets_filename = '{}targets'.format(filename_root)
# Save and record any failures.
failed_filename = '{}failed'.format(filename_root)
# Save the total number of iterations processed.
total_processed = '{}total_i.pkl'.format(filename_root)


# Check that all the required variables have been set before any work starts.
# Explicit raises are used instead of assert statements because asserts are
# stripped when Python runs with -O, which would silently skip these checks.
# Each entry pairs a setting's current value with the message shown when it
# is still the empty-string placeholder; checks run in the order listed.
_required_settings = (
    (url_base, 'Please provide the root url for all the images. Example: if all images are at https://test.com/image1.jpg, provide https://test.com/'),
    (blob_account_name, 'Please provide the Azure storage account name where the images are stored'),
    (blob_sas_token, 'Please provide the SAS token for accessing the blob account'),
    (blob_container, 'Please provide the container name where the images are stored'),
    (blob_prefix, 'Please provide any additional path compnoents for the imates. Example if the iamges are stored in containername/data/images the prefix is data/images'),
    (filename_root, 'Please provide a filepath for where the data should be saved. Example: /data/'),
)
for _value, _message in _required_settings:
    if _value == '':
        raise ValueError(_message)

# Initialize Resnet50 Model

In [None]:
# Initialize the pre-trained ResNet50 model used to featurize the images.
# include_top=False drops the classification head and pooling='avg' applies
# global average pooling to the last feature map.
img_length = 2048 #size of output from model
# Keras expects input_shape as (height, width, channels) for channels-last
# data, which is also the layout of np.array(<PIL image>). The dimensions were
# previously passed width-first, which only worked because
# img_width == img_height.
keras_model = ResNet50(input_shape=[img_height, img_width, 3],
                       weights='imagenet',
                       include_top=False,
                       pooling='avg')

# Featurize images with ResNet50 Model & Save

In [None]:
import time


def _dump_pickle(obj, path):
    """Pickle obj to path, closing the file handle even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


start = time.time()

#Connect to the blob
block_blob_service = BlockBlobService(account_name=blob_account_name, sas_token=blob_sas_token)
files = block_blob_service.list_blobs(blob_container, prefix=blob_prefix)

# [name, url] for every image that was featurized successfully. Failed images
# are NOT appended, so targets lines up with preprocessed_images only after the
# rows listed in failed_idx have been removed.
targets = []
# Iteration indices whose image could not be downloaded or featurized.
failed_idx = []
# One row per iteration; rows for failed iterations are left as zeros.
preprocessed_images = np.zeros((sample_length,img_length))


#Loop through all files in the blob container location provided. We will download each image & run it through the ResNet50 model.
i = 0
for blob in files:
    name = blob.name.rsplit('/',1)[-1].split('.')[0]  #this is the object ID
    url = url_base + name + '.jpg'
    try:
        #download image (the handle gets its own name so it no longer shadows the loop variable)
        with urlopen(url) as response:
            img = Image.open(response)
            #non RGB images won't have the right number of channels
            if img.mode != 'RGB':
                img = img.convert('RGB')
            #resize while the stream is still open: Image.open is lazy, so the
            #pixel data must be decoded before the response is closed
            img = img.resize((img_width, img_height))

        #expand dims and run through the ResNet50 model.
        #np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
        features = np.expand_dims(np.array(img), axis=0).astype(np.float64)
        features = keras_model.predict(preprocess_input(features))
        #add to master table
        preprocessed_images[i,:] = features
        targets.append([name, url])
    except Exception as e:
        #best effort: record the failure and carry on with the next image
        print('failed to process: %s' % url)
        print('iteration %d' % i)
        print(e)
        failed_idx.append(i)

    #periodic checkpoint so a long run can be inspected or recovered
    if i%print_every == 0:
        print('completed iteration: %d' % i)
        print('saving model to file ')
        _dump_pickle(preprocessed_images, pre_processed_filename + str(i) +'.pkl')
        _dump_pickle(targets, targets_filename + str(i) +'.pkl')
        _dump_pickle(failed_idx, failed_filename + str(i) + '.pkl')
        current_time = time.time()
        print('elapsed time %0.2f min' % ((current_time - start)/60))

    i += 1
    if i>= sample_length:
        print('Reached the end, breaking-loop')
        break
    if debug:
        print(name)
        print(url)

print('completed processing, saving files')
_dump_pickle(preprocessed_images, pre_processed_filename + '.pkl')
_dump_pickle(targets, targets_filename + '.pkl')
_dump_pickle(failed_idx, failed_filename + '.pkl')
_dump_pickle(i, total_processed)

end = time.time()

#report the full run time; the last checkpoint time is stale and is undefined
#when the container listing is empty
print('elapsed time %0.2f' % ((end - start)/60))
print('total processed: %d' % i)
