This script is an example of how you could generate features from the product images. It also shows a simple way to read the images using the ImageReader class.

Here, we use the output of the last layer of a convolutional network named ResNet50 as the representation of the products.
To run this script you need tensorflow installed in your environment, as well as the numpy, pandas and PIL packages.

For this script to work, we assume that the dataset images folder was unpacked in ../data/images

In [10]:
import os
import time
from pathlib import Path
from typing import Any, Dict, Set, Tuple, Union, List

import numpy as np
import pandas as pd
from PIL import UnidentifiedImageError
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array

IMAGES_ROOT_PATH = "../data/images"

class ImageReader:
    """Read (and optionally store) product images kept on the local disk.

    Every image lives under the root directory, inside nested folders derived
    from the digits of its product ID (see ``_get_folder_path``), in a file
    named ``<product_id>.jpg``.
    """

    def __init__(self, images_root_path=IMAGES_ROOT_PATH):
        """Create a reader rooted at ``images_root_path``.

        Args:
            images_root_path: directory where the images file was unpacked.
        Raises:
            ValueError: if ``images_root_path`` does not exist.
        """
        if not Path(images_root_path).exists():
            raise ValueError(f"Path {images_root_path} does not exist")
        self._images_root_path = images_root_path
        # Target size handed to ``load_img`` by ``get_batch``.
        self.image_size = (224, 224)

    def exists(self, product_id: int) -> bool:
        """Tell whether the image file of the given product ID is on disk.

        Args:
            product_id: int that represents the product id
        Returns: True if the image file exists.
        Raises:
            ValueError: if ``product_id`` is not an integer.
        """
        return self._local_image_path(product_id=product_id).exists()

    def get(self, product_id: int, target_size: Tuple[int, int]) -> Any:
        """Load the image corresponding to the given product ID.

        Args:
            product_id: int that represents the product id
            target_size: Tuple[int, int] size the image is loaded with
        Returns: the image object returned by ``image.load_img`` (not raw
            bytes).
        Raises:
            ValueError: if the file is missing, cannot be identified as an
                image, or ``product_id`` is not an integer.
        """
        path = self._local_image_path(product_id=product_id)
        try:
            return image.load_img(path=path, target_size=target_size)
        except UnidentifiedImageError as image_error:
            print(f"error opening {path}")
            # NOTE(review): the unreadable file is deleted from disk here, so
            # this "read" has a destructive side effect on the images folder.
            path.unlink()
            raise ValueError(image_error) from image_error
        except FileNotFoundError as image_error:
            print(f"FileNotFoundError {path}")
            raise ValueError(image_error) from image_error

    def get_batch(self, product_ids: List[int]) -> Tuple[np.ndarray, List[int]]:
        """Load every available image among ``product_ids`` as one array.

        Products whose image cannot be loaded are skipped, so the result may
        hold fewer entries than ``product_ids``.

        Args:
            product_ids: product ids whose images should be loaded.
        Returns: a tuple ``(images, products_with_images)`` where ``images``
            stacks the loaded images along a new first axis and
            ``products_with_images`` lists, in the same order, the ids that
            were actually loaded.
        Raises:
            ValueError: if none of the requested products could be loaded.
        """
        images = []
        products_with_images = []
        for product_id in product_ids:
            try:
                # Named ``img`` so the module-level ``image`` import is not shadowed.
                img = self.get(
                    product_id=product_id,
                    target_size=self.image_size,
                )
                img_array = img_to_array(img)
            except (OSError, ValueError):
                # Best effort: a missing or broken image only drops that product.
                continue
            images.append(img_array)
            products_with_images.append(product_id)

        if not images:
            # np.stack would fail on an empty list with a cryptic message;
            # keep the same exception type but say what actually happened.
            raise ValueError("None of the requested products has a readable image")
        return np.stack(images), products_with_images

    def store(self, product_id: int, data: bytes) -> None:
        """Store the image corresponding to the given product ID.

        Missing parent folders are created. A ``PermissionError`` while
        writing is reported with ``print`` and not re-raised.

        Args:
            product_id: int that represents the product id
            data: bytes representing the image to be stored.
        Raises:
            ValueError: if ``data`` is empty or ``product_id`` is not an
                integer.
        """
        if not data:
            raise ValueError("Trying to store a empty image file")
        path = self._local_image_path(product_id=product_id)

        path.parent.mkdir(parents=True, exist_ok=True)
        try:
            path.write_bytes(data)
        except PermissionError:
            print(f"Error while writing image {path}")

    def _local_image_path(
        self, product_id: Union[int, np.int64]
    ) -> Path:
        """Return the path of the image file for ``product_id``.

        Raises:
            ValueError: if ``product_id`` is not a Python or numpy integer.
        """
        if not isinstance(product_id, (int, np.integer)):
            raise ValueError(
                f"product_id should be an integer, not {type(product_id)}"
            )
        name = f"{product_id}.jpg"
        folder = self._get_folder_path(product_id=product_id)
        return Path(self._images_root_path, folder, name)

    @staticmethod
    def _get_folder_path(product_id: int) -> str:
        """Build the nested folder of a product ID from chunks of two digits.

        Args:
            product_id: int that represents the product id
        Returns: the chunks of the decimal representation joined with "/";
            an odd number of digits leaves a final one-digit chunk.
        Examples:
            11501771 -> 11/50/17/71
            100 -> 10/0
        """
        product_id_str = str(product_id)
        return "/".join(
            product_id_str[i : i + 2] for i in range(0, len(product_id_str), 2)
        )

In [6]:
from tensorflow.keras import Sequential
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Flatten

def get_all_product_ids(
    metadata_path: str = "../data/items_metadata.parquet",
) -> List[int]:
    """Return the distinct product ids listed in the items metadata file.

    Args:
        metadata_path: parquet file holding a ``product_id`` column; defaults
            to the location this script expects the dataset at.
    Returns: the unique values of the ``product_id`` column, in order of
        first appearance.
    """
    # Only the id column is needed, so avoid loading the other columns.
    items_metadata = pd.read_parquet(metadata_path, columns=["product_id"])
    return items_metadata["product_id"].unique().tolist()

def get_cnn_model(image_size):
    """Build the ResNet50-based model used to turn images into embeddings.

    The network is created with ImageNet weights, without its top layers and
    with average pooling, and is followed by a ``Flatten`` layer.

    Args:
        image_size: value forwarded to ResNet50 as ``input_shape``.
    Returns: a ``Sequential`` model wrapping ResNet50 and ``Flatten``.
    """
    backbone = ResNet50(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=image_size,
    )
    return Sequential([backbone, Flatten()])

def batch_predict(
    product_ids: List[int],
    resnet50: Sequential,
    cache: ImageReader,
    batch_size: int = 128,
) -> Tuple[List[int], np.ndarray]:
    """Compute the embeddings of the given products, batch by batch.

    Products without a loadable image are left out of the result.

    Args:
        product_ids: ids of the products to embed.
        resnet50: model whose ``predict`` output is used as the embedding.
        cache: reader used to load the images of each batch.
        batch_size: number of product ids handled per ``predict`` call.
    Returns: a tuple ``(product_ids_with_embeddings, product_embeddings)``
        where row ``i`` of ``product_embeddings`` belongs to
        ``product_ids_with_embeddings[i]``.
    Raises:
        ValueError: if ``batch_size`` is not positive or if no embedding
            could be computed at all.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    product_ids_with_embeddings = []
    product_embeddings = []
    for start in range(0, len(product_ids), batch_size):
        batch = product_ids[start : start + batch_size]
        try:
            batch_images, batch_product_ids = cache.get_batch(batch)
        except ValueError:
            # get_batch raises ValueError when no image of the batch could be
            # loaded; skip that batch. Only this call is guarded so that
            # errors raised by the model itself are not silently hidden.
            continue
        batch_images = preprocess_input(batch_images)
        product_embeddings.append(resnet50.predict(batch_images))
        product_ids_with_embeddings += batch_product_ids

    if not product_embeddings:
        # np.vstack would fail on an empty list with a cryptic message.
        raise ValueError(
            "No embeddings were computed: no product image could be loaded"
        )
    return product_ids_with_embeddings, np.vstack(product_embeddings)

In [12]:
# ImageReader is the image-loading class defined in this file (the original
# cell referenced an undefined ``LocalCache``).
cache = ImageReader()
product_ids = get_all_product_ids()
# The model input shape is the reader's image size plus 3 channels.
resnet50 = get_cnn_model(image_size=(*cache.image_size, 3))

product_ids_with_embeddings, product_embeddings = batch_predict(product_ids, resnet50, cache)

# One row per embedded product, one ``embedding_<i>`` column per dimension.
embeddings = pd.DataFrame(
    product_embeddings,
    columns=[f"embedding_{x}" for x in range(product_embeddings.shape[1])],
)
embeddings["product_id"] = product_ids_with_embeddings

FileNotFoundError ../data/images/10/0/100.jpg
FileNotFoundError ../data/images/10/1/101.jpg
FileNotFoundError ../data/images/10/2/102.jpg
FileNotFoundError ../data/images/10/3/103.jpg
FileNotFoundError ../data/images/10/4/104.jpg
FileNotFoundError ../data/images/10/5/105.jpg
FileNotFoundError ../data/images/10/6/106.jpg
FileNotFoundError ../data/images/10/7/107.jpg
FileNotFoundError ../data/images/10/8/108.jpg
FileNotFoundError ../data/images/10/9/109.jpg
FileNotFoundError ../data/images/11/0/110.jpg
FileNotFoundError ../data/images/11/1/111.jpg
FileNotFoundError ../data/images/11/2/112.jpg
FileNotFoundError ../data/images/11/3/113.jpg
FileNotFoundError ../data/images/11/4/114.jpg
FileNotFoundError ../data/images/11/5/115.jpg
FileNotFoundError ../data/images/11/6/116.jpg
FileNotFoundError ../data/images/11/7/117.jpg
FileNotFoundError ../data/images/11/8/118.jpg
FileNotFoundError ../data/images/11/9/119.jpg
FileNotFoundError ../data/images/12/0/120.jpg
FileNotFoundError ../data/images/1

FileNotFoundError ../data/images/35/6/356.jpg
FileNotFoundError ../data/images/35/7/357.jpg
FileNotFoundError ../data/images/35/8/358.jpg
FileNotFoundError ../data/images/35/9/359.jpg
FileNotFoundError ../data/images/36/0/360.jpg
FileNotFoundError ../data/images/36/1/361.jpg
FileNotFoundError ../data/images/36/2/362.jpg
FileNotFoundError ../data/images/36/3/363.jpg
FileNotFoundError ../data/images/36/4/364.jpg
FileNotFoundError ../data/images/36/5/365.jpg
FileNotFoundError ../data/images/36/6/366.jpg
FileNotFoundError ../data/images/36/7/367.jpg
FileNotFoundError ../data/images/36/8/368.jpg
FileNotFoundError ../data/images/36/9/369.jpg
FileNotFoundError ../data/images/37/0/370.jpg
FileNotFoundError ../data/images/37/1/371.jpg
FileNotFoundError ../data/images/37/2/372.jpg
FileNotFoundError ../data/images/37/3/373.jpg
FileNotFoundError ../data/images/37/4/374.jpg
FileNotFoundError ../data/images/37/5/375.jpg
FileNotFoundError ../data/images/37/6/376.jpg
FileNotFoundError ../data/images/3

In [13]:
# Preview the first rows of the embeddings table.
embeddings.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_2039,embedding_2040,embedding_2041,embedding_2042,embedding_2043,embedding_2044,embedding_2045,embedding_2046,embedding_2047,product_id
0,0.0,0.394525,0.050905,0.127024,0.0,0.0,0.422399,0.496537,0.0,0.0,...,0.003527,0.469738,0.0,0.175133,0.749388,0.543183,0.0,0.0,0.252123,277
1,0.007891,0.24291,0.028251,0.015308,0.342107,0.787701,1.069703,0.3577,0.0,0.004031,...,0.039697,0.414626,0.0,0.003298,0.276602,0.192753,0.013603,0.010138,0.708869,488
2,0.0,0.138877,0.039775,0.026529,0.0,0.205778,0.572292,1.575165,0.012644,0.082656,...,0.118607,1.400151,0.031379,0.42399,0.585092,0.846554,0.264949,0.097781,0.305391,599
3,0.278602,1.582474,0.040108,0.604458,0.293284,0.06203,0.533228,0.000668,0.111376,0.01137,...,1.048824,0.0,0.0,0.028882,0.0,0.038599,0.006738,1.082916,0.987023,927


In [15]:
import time

# Suffix the file name with the current Unix time in whole seconds, so each
# run writes to its own file.
unique_name = int(time.time())
output_path = f"../data/image_embeddings_{unique_name}.parquet"
embeddings.to_parquet(output_path)