In [1]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from torchvision import transforms
from torchvision.models.feature_extraction import create_feature_extractor

# Note: Use pytorch=1.10.0 with "conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 -c pytorch"
# as later versions cause Jupyter to crash.

In [2]:
INPUT_DIRECTORY = './data/raw'
OUTPUT_DIRECTORY = './image_embeddings'

In [3]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet34', pretrained=True)
model.eval()

Using cache found in /Users/pandabear/.cache/torch/hub/pytorch_vision_v0.10.0


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [4]:
# Update model to return embeddings from penultimate layer.
return_nodes = {
    "avgpool": "embedding"
}
model = create_feature_extractor(model, return_nodes=return_nodes)

In [5]:
preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

# Create a function to load and image and convert it to an image embedding
def image_embedding(input_image):
    input_tensor = preprocess(input_image)
    input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

    # move the input and model to GPU for speed if available
    if torch.cuda.is_available():
        input_batch = input_batch.to('cuda')
        model.to('cuda')

    with torch.no_grad():
        output = model(input_batch)

    # Convert tensor to numpy array and save to a list
    return output['embedding'][0][:, 0, 0].numpy()

In [6]:
def extract_image_embeddings():
    directories = os.listdir(INPUT_DIRECTORY)
    for directory in sorted(directories):
        if directory.startswith('.'):
            continue
        image_embeddings = []
        image_titles = []
        directory_path = os.path.join(INPUT_DIRECTORY, directory)
        embeddings_path = os.path.join(OUTPUT_DIRECTORY, directory + '.csv')
        if os.path.exists(embeddings_path):
            print(f'Skipping directory {directory} as output already exists')
            continue
        files = os.listdir(directory_path)
        for image in files:
            image_titles.append(image.split('.')[0])
            image_path = directory_path + '/' + image
            input_image = Image.open(image_path)
            if input_image.mode != 'RGB':
                input_image = input_image.convert('RGB')
            image_embeddings.append(image_embedding(input_image))
        embeddings_df = pd.DataFrame(image_embeddings, index=image_titles)
        embeddings_df.to_csv(embeddings_path)
        print(f'Extracted {len(image_embeddings)} embeddings for directory {directory}')

In [7]:
extract_image_embeddings()

Skipping directory 010 as output already exists
Skipping directory 011 as output already exists
Skipping directory 012 as output already exists
Skipping directory 013 as output already exists
Skipping directory 014 as output already exists
Skipping directory 015 as output already exists
Skipping directory 016 as output already exists
Skipping directory 017 as output already exists
Skipping directory 018 as output already exists
Skipping directory 019 as output already exists
Skipping directory 020 as output already exists
Skipping directory 021 as output already exists
Skipping directory 022 as output already exists
Skipping directory 023 as output already exists
Skipping directory 024 as output already exists
Skipping directory 025 as output already exists
Skipping directory 026 as output already exists
Skipping directory 027 as output already exists
Skipping directory 028 as output already exists
Skipping directory 029 as output already exists
Skipping directory 030 as output already

In [8]:
!pwd

/Users/pandabear/springboard/H&M_product_category_cassification


In [16]:
# Merge all the csv files in the image_embeddings folder

df = pd.DataFrame()
for file in os.listdir(OUTPUT_DIRECTORY):
    df2 = pd.read_csv(os.path.join(OUTPUT_DIRECTORY,file))
    df = df.append(df2)
    
df.head()
    

  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)


  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)


  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)
  df = df.append(df2)


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,217207047,2.679361,0.306564,1.212155,1.18826,0.056715,0.022303,1.147643,0.293646,0.605903,...,0.674172,1.723143,0.104001,0.674032,1.619033,0.617591,0.405708,0.362968,0.49393,2.490882
1,213691080,0.632062,0.070936,1.370101,3.107503,0.912835,0.019927,0.017866,0.426144,0.084189,...,2.272056,0.204228,0.337832,0.126854,0.394283,1.328873,0.304165,0.058076,0.985178,0.030038
2,212629036,1.380411,0.093138,0.224101,0.026163,0.569458,0.200051,0.257274,0.116943,0.425302,...,1.543374,0.145237,0.74695,0.62466,0.352762,0.159102,0.097694,1.121258,0.312964,0.272676
3,214844001,1.017763,0.094177,2.294733,1.414627,0.029835,1.086756,1.917363,0.509459,0.11757,...,3.446632,1.106723,0.128347,1.110429,0.588763,0.000299,0.630921,0.190633,1.509852,3.986516
4,216081011,0.22895,0.253633,0.134512,0.161084,0.62137,0.850047,2.252668,1.000498,0.221402,...,1.13527,0.017909,0.772394,0.775876,3.70126,0.668516,0.094508,0.876413,1.510044,1.602928


In [17]:
df.shape

(105100, 513)

In [18]:
# save to a single csv file
df.to_csv('combined_image_embeddings.csv')