# Featurize Images for Image Similarity Model

This notebook will do the following:

For each image in the provided Azure Blob container:
    - download the image
    - resize the image to the pre-defined img_width & img_height
    - Featurize the image using the Keras pre-trained ResNet50 model trained on imagenet
    - save the featurized images to a preprocessedimages.pkl file in the provide data directory
    - save a corresponding targets.pkl file with a table of the [name, url] for each image 


In [2]:
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from urllib.request import urlopen
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
from azure.storage.blob import BlockBlobService, PublicAccess

Using TensorFlow backend.


# Define Constants

In [7]:
blob_account_name = 'mmlsparkdemo' #Azure storage account name
blob_sas_token='?st=2019-10-09T18%3A54%3A49Z&se=2020-10-10T18%3A54%3A00Z&sp=rl&sv=2018-03-28&sr=c&sig=sC4kVoSxN93Wd2x4PUCfodMHs2VG6p5%2BEDdIkNUrpTA%3D' # SAS token to access the blob
blob_container = 'met' #container where the images are located
blob_prefix = 'thumbnails/' #path to any sub-folders within the container 

batch_size = 64 
img_width = 512 #input sizes images will be re-szied to
img_height = 512

output_root = '/mnt/met-results/' #root folder filepath. All data from this notebookwill be saved to this directory
features_fn = output_root + 'features.pkl' #name of the np array of size (sample_length, img_length) will be saved. These are the featurized versions of the images
files_fn = output_root + 'filenames.pkl' # helper table that tracks the name & URL for each row


#check that the all the variables have been set
assert blob_account_name != '', 'Please provide the Azure storage account name where the images are stored'
assert blob_sas_token != '', 'Please provide the SAS token for accessing the blob account'
assert blob_container != '', 'Please provide the container name where the images are stored'
assert blob_prefix !='', 'Please provide any additional path compnoents for the imates. Example if the iamges are stored in containername/data/images the prefix is data/images'
assert output_root != '', 'Please provide a filepath for where the data should be saved. Example: /data/'

# Initialize Resnet50 Model

In [8]:
#initialize model
img_length = 2048 #size of output from model
keras_model = ResNet50(input_shape=[img_width,img_height,3], 
                     weights='imagenet', 
                     include_top=False, 
                     pooling='avg')

# Featurize images with ResNet50 Model & Save

In [9]:
import time
start = time.time()

#Connect to the blob
block_blob_service = BlockBlobService(account_name=blob_account_name, sas_token=blob_sas_token)
files = list(block_blob_service.list_blobs(blob_container, prefix=blob_prefix))
n_files = len(files)
print("Found {} files".format(n_files))

Found 170664 files


In [None]:
import math 

urls = ("https://{}.blob.core.windows.net/{}/{}".format(blob_account_name, blob_container, file.name) for file in files)

def preprocess_image(url):
    try:
        with urlopen(url) as file:
            img = Image.open(file)

        #non RGB images won't have the right number of channels
        if img.mode != 'RGB': 
            img = img.convert('RGB') 

        #re-size, expand dims and run through the ResNet50 model
        img = np.array(img.resize((img_width, img_height)))
        img = preprocess_input(img.astype(np.float))
        obj_id = url.split("/")[-1].split(".")[0]
        return (img, obj_id)
    except Exception as e:
        print(e)
        return None

preprocessed = (preprocess_image(url) for url in urls)

obj_ids = []
def batch(iterable, n):
    current_batch = []
    for item in iterable:
        if item is not None:
            current_batch.append(item[0])
            obj_ids.append(item[1])
            if len(current_batch) == n:
                yield np.array(current_batch)
                current_batch = []
    if current_batch:
        yield np.array(current_batch)


batches = batch(preprocessed, batch_size)

predictions = keras_model.predict_generator(batches, steps = math.ceil(n_files/batch_size), verbose=1)


  34/2667 [..............................] - ETA: 1:30:52

In [None]:
import os
if not os.path.exists(output_root): os.makedirs(output_root)
pickle.dump(predictions, open(features_fn, 'wb'))
pickle.dump(obj_ids,open(files_fn,'wb'))
