This notebook prepares a DataFrame that can be used to train the Inception network for multi-label classification. It keeps all images and converts each image's labels from a list of integer class indices into a fixed-length multi-hot vector.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os

In [2]:
# Load the label DataFrame written by the label-processing notebook.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you produced yourself or otherwise trust.
if not os.path.exists('data/prcsd_labels_df'):
    # Fail right here: merely printing a hint and carrying on would defer the
    # failure to a confusing NameError on `df` in the following cell.
    raise FileNotFoundError(
        "'data/prcsd_labels_df' not found. "
        'Please run the notebook for processing the labels first!'
    )
df = pd.read_pickle('data/prcsd_labels_df')

In [3]:
# Preview the first ten rows of the freshly loaded frame.
df.iloc[:10]

Unnamed: 0,img_url,labels
0,https://blok-production.imgix.net/photos/e7a3e...,[3]
1,https://blok-production.imgix.net/photos/e7a3e...,[3]
2,https://blok-production.imgix.net/photos/e7a3e...,[11]
3,https://blok-production.imgix.net/photos/e7a3e...,[1]
4,https://blok-production.imgix.net/photos/e7a3e...,[1]
5,https://blok-production.imgix.net/photos/e7a3e...,"[5, 7]"
6,https://blok-production.imgix.net/photos/e7a3e...,"[5, 7]"
7,https://blok-production.imgix.net/photos/e7a3e...,"[5, 7]"
8,https://blok-production.imgix.net/photos/e7a3e...,"[0, 2]"
9,https://blok-production.imgix.net/photos/e7a3e...,"[0, 2]"


Add file path and drop URLs:

In [4]:
# Derive each file name from the row index: row i -> 'i.png'.
# (Assumes the images on disk follow that naming -- TODO confirm against the
# step that downloaded them.)
df['paths'] = df.index.astype(str) + '.png'
# Rebind instead of mutating in place, and tolerate an already-removed column
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['img_url'], errors='ignore')
df.head(10)

Unnamed: 0,labels,paths
0,[3],0.png
1,[3],1.png
2,[11],2.png
3,[1],3.png
4,[1],4.png
5,"[5, 7]",5.png
6,"[5, 7]",6.png
7,"[5, 7]",7.png
8,"[0, 2]",8.png
9,"[0, 2]",9.png


In [5]:
# Total number of rows in the frame.
df.shape[0]

14033

In [6]:
# Collect every distinct label id that occurs in any row. The per-row lists
# are flattened first; np.unique already returns its result in sorted order,
# so no additional np.sort is needed.
all_labels = np.unique(
    [label for row_labels in df.labels for label in row_labels]
)
all_labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [7]:
def list_to_categorical(labels, num_classes=None):
    """Turn a list of integer class indices into a multi-hot vector.

    Parameters
    ----------
    labels : iterable of int
        Class indices that are active for one sample. Each value is used
        directly as a position in the output vector.
    num_classes : int, optional
        Length of the returned vector. When omitted it falls back to
        ``len(all_labels)``, i.e. the number of distinct labels collected in
        the module-level ``all_labels`` array. That fallback is only correct
        when the label ids are exactly ``0 .. len(all_labels) - 1``.

    Returns
    -------
    numpy.ndarray
        A float vector of length ``num_classes`` holding 1.0 at every index
        listed in ``labels`` and 0.0 everywhere else.

    Raises
    ------
    IndexError
        If a label is negative or not smaller than ``num_classes``.
    """
    if num_classes is None:
        num_classes = len(all_labels)

    ret = np.zeros(num_classes)
    for label in labels:
        # Reject out-of-range ids explicitly; plain NumPy indexing would let a
        # negative id wrap around and silently flag the wrong class.
        if not 0 <= label < num_classes:
            raise IndexError(
                'label {!r} is outside the valid range [0, {})'.format(
                    label, num_classes
                )
            )
        ret[label] = 1.0
    return ret


# Replace each row's list of label ids with its multi-hot vector.
df['labels'] = df['labels'].map(list_to_categorical)

In [8]:
# Inspect the first ten rows to check the encoded label vectors.
df.iloc[:10]

Unnamed: 0,labels,paths
0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.png
1,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.png
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.png
3,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.png
4,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.png
5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...",5.png
6,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...",6.png
7,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...",7.png
8,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.png
9,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.png


Nice! Write to file.

In [9]:
# Persist the processed frame (label vectors and image paths) as a pickle file.
df.to_pickle('data/inception_df_multilabel_v1')