# Featurize Images for Image Similarity Model

This notebook will do the following:

For each image in the provided Azure Blob container:
    - download the image
    - resize the image to the pre-defined img_width & img_height
    - Featurize the image using the Keras pre-trained ResNet50 model trained on imagenet
    - save the featurized images to a preprocessedimages.pkl file in the provide data directory
    - save a corresponding targets.pkl file with a table of the [name, url] for each image 


In [1]:
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from urllib.request import urlopen
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input

Using TensorFlow backend.


# Define Constants

In [11]:
import os 
source_url = 'https://mmlsparkdemo.blob.core.windows.net/met/thumbnails/' #Azure storage account name

batch_size = 128
img_width = 512 #input sizes images will be re-szied to
img_height = 512

output_root = '/mnt/met-results/' #root folder filepath. All data from this notebookwill be saved to this directory
features_fn = os.path.join(output_root, "features", 'features.pkl') #name of the np array of size (sample_length, img_length) will be saved. These are the featurized versions of the images
files_fn = os.path.join(output_root, "features", 'filenames.pkl') # helper table that tracks the name & URL for each row
images_folder = os.path.join(output_root, "images")

#check that the all the variables have been set
assert source_url != '', 'Please provide a filepath for where the data should be saved. Example: /data/'
assert output_root != '', 'Please provide a filepath for where the data should be saved. Example: /data/'

# Initialize Resnet50 Model

In [3]:
#initialize model
img_length = 2048 #size of output from model
keras_model = ResNet50(input_shape=[img_width,img_height,3], 
                     weights='imagenet', 
                     include_top=False, 
                     pooling='avg')

W1216 20:52:26.261084 139917278947072 deprecation.py:506] From /data/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W1216 20:52:27.684316 139917278947072 module_wrapper.py:139] From /data/anaconda/envs/py35/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



# Featurize images with ResNet50 Model & Save

In [28]:
%%sh -s $source_url $images_folder
azcopy --source $1 --destination $2 --quiet --recursive

[2019/12/16 21:10:17] Transfer summary:
-----------------
Total files transferred: 170677
Transfer successfully:   170677
Transfer skipped:        0
Transfer failed:         0
Elapsed time:            00.00:02:03


In [32]:
import glob

files = list(glob.glob(os.path.join(images_folder, "*")))
print(len(files))

170677


In [39]:
import math 
import random

                  
def batch(iterable, n):
    current_batch = []
    for item in iterable:
        if item is not None:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
    if current_batch:
        yield current_batch


batches = list(batch(files, batch_size))

obj_ids = []
def prep_image_inner(url):
    with open(url, "rb") as file:
        img = Image.open(file)

        #non RGB images won't have the right number of channels
        if img.mode != 'RGB': 
            img = img.convert('RGB') 

        #re-size, expand dims and run through the ResNet50 model
        img = np.array(img.resize((img_width, img_height)))
    img = preprocess_input(img.astype(np.float), mode="tf")
    obj_id = url.split("/")[-1].split(".")[0]
    obj_ids.append((obj_id, url))
    return img


def load_images(urls):
    batch = []
    for url in urls:
        try:
            batch.append(prep_image_inner(url))
        except Exception as e:
            print(e)
            try:
                batch.append(prep_image_inner(url))
            except Exception as e:
                print("Failing a second time", e)
    return np.array(batch)

data_iterator = (load_images(batch) for batch in batches)

predictions = keras_model.predict_generator(data_iterator, steps = len(batches), verbose=1)


cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/344292.jpg'>
Failing a second time cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/344292.jpg'>
  14/1334 [..............................] - ETA: 1:29:46cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/248354.jpg'>
Failing a second time cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/248354.jpg'>
  89/1334 [=>............................] - ETA: 1:23:22cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/282157.jpg'>
Failing a second time cannot identify image file <_io.BufferedReader name='/mnt/met-results/images/282157.jpg'>
 146/1334 [==>...........................] - ETA: 1:19:24

KeyboardInterrupt: 

In [None]:
import os
if not os.path.exists(output_root): os.makedirs(output_root)
pickle.dump(predictions, open(features_fn, 'wb'))
pickle.dump(obj_ids,open(files_fn,'wb'))

print(predictions.shape[0], len(obj_ids))
assert(predictions.shape[0] == len(obj_ids))
