In [11]:
import pandas as pd

In [12]:
# Load the Open Images class table; `names` supplies the two column labels.
classes_df = pd.read_csv(
    "data/open_images/train/metadata/classes.csv",
    names=["id", "class"],
)
classes_df

Unnamed: 0,id,class
0,/m/011k07,Tortoise
1,/m/011q46kg,Container
2,/m/012074,Magpie
3,/m/0120dh,Sea turtle
4,/m/01226z,Football
...,...,...
596,/m/0qmmr,Wheelchair
597,/m/0wdt60w,Rugby ball
598,/m/0xfy,Armadillo
599,/m/0xzly,Maracas


In [13]:
# Pull the "class" column out as a plain Python list of class names
class_list = classes_df.loc[:, "class"].to_list()
class_list[:10]

['Tortoise',
 'Container',
 'Magpie',
 'Sea turtle',
 'Football',
 'Ambulance',
 'Ladder',
 'Toothbrush',
 'Syringe',
 'Sink']

In [14]:
# Read the food vocabulary file into a list with one entry per line
with open("data/food_list.txt") as f:
    raw_text = f.read()
    food_list = raw_text.splitlines()
food_list[:10]

['absinth',
 'absinthe',
 'absinthes',
 'absinths',
 'acerola',
 'acerolas',
 'acetum',
 'afters',
 'ail',
 'ails']

In [15]:
# Keep the Open Images classes whose lower-cased name appears in the food vocabulary.
# The vocabulary is converted to a set first so each membership test is a hash
# lookup instead of a scan over the whole food list.
food_lookup = set(food_list)
open_images_food_list = [
    class_name for class_name in class_list if class_name.lower() in food_lookup
]
open_images_food_list

['Apple',
 'Beer',
 'Cucumber',
 'Radish',
 'Waffle',
 'Pancake',
 'Pretzel',
 'Bagel',
 'Popcorn',
 'Cheese',
 'Muffin',
 'Snack',
 'Juice',
 'Cookie',
 'Cocktail',
 'Dessert',
 'Drink',
 'Zucchini',
 'Coffee',
 'Food',
 'Grape',
 'Artichoke',
 'Milk',
 'Mushroom',
 'Potato',
 'Pasta',
 'Pumpkin',
 'Pear',
 'Mixer',
 'Pizza',
 'Rose',
 'Seafood',
 'Tea',
 'Strawberry',
 'Tomato',
 'Wine',
 'Cream',
 'Bread',
 'Chicken',
 'Squid',
 'Lemon',
 'Banana',
 'Hamburger',
 'Lobster',
 'Orange',
 'Coconut',
 'Vegetable',
 'Shellfish',
 'Cabbage',
 'Carrot',
 'Mango',
 'Pineapple',
 'Candy',
 'Salad',
 'Grinder',
 'Broccoli',
 'Pastry',
 'Pomegranate',
 'Doughnut',
 'Watermelon',
 'Cantaloupe',
 'Sandwich']

In [16]:
# Flag each Open Images class that was matched against the food vocabulary
classes_df = classes_df.assign(
    is_food=classes_df["class"].isin(open_images_food_list)
)
classes_df

Unnamed: 0,id,class,is_food
0,/m/011k07,Tortoise,False
1,/m/011q46kg,Container,False
2,/m/012074,Magpie,False
3,/m/0120dh,Sea turtle,False
4,/m/01226z,Football,False
...,...,...,...
596,/m/0qmmr,Wheelchair,False
597,/m/0wdt60w,Rugby ball,False
598,/m/0xfy,Armadillo,False
599,/m/0xzly,Maracas,False


In [17]:
# How many classes are food vs. not food
classes_df["is_food"].value_counts()

False    539
True      62
Name: is_food, dtype: int64

In [19]:
# Keep only the food classes, then save their IDs and names sorted by class name
# (written without a header row and without the index).
open_images_food_df = classes_df[classes_df["is_food"]]
(
    open_images_food_df[["id", "class"]]
    .sort_values("class")
    .to_csv("data/open_images_food_classes.csv", header=False, index=False)
)

In [9]:
# Next: read all images from the Open Images folder and work out whether each
# image is food or not.

# Go through the Open Images folders (e.g. open_images/test, train, validation)
# Find all images (in each split's "data" folder)
# Check whether any of an image's classification labels is a food class
# Classification labels are in "data/open_images/*/labels/classifications.csv"
# If food, copy to food_images; if not, copy to not_food_images

In [10]:
# Collect every .jpg found two folder levels below data/open_images/
import pathlib
image_path_list = [*pathlib.Path("data/open_images/").glob("*/*/*.jpg")]
len(image_path_list)


300

In [11]:
# Peek at the first ten collected image paths
image_path_list[0:10]

[PosixPath('data/open_images/test/data/a63be9e4ea067c1d.jpg'),
 PosixPath('data/open_images/test/data/5a82b29382d32edf.jpg'),
 PosixPath('data/open_images/test/data/504a7622ec7c1fbb.jpg'),
 PosixPath('data/open_images/test/data/c07f825ff2cf4af1.jpg'),
 PosixPath('data/open_images/test/data/65fbccdd5c1071a3.jpg'),
 PosixPath('data/open_images/test/data/90e9af3509ed1263.jpg'),
 PosixPath('data/open_images/test/data/26c8d263abd19e4d.jpg'),
 PosixPath('data/open_images/test/data/980d2ed43c5d94ad.jpg'),
 PosixPath('data/open_images/test/data/d2b6b0832ef9e1d9.jpg'),
 PosixPath('data/open_images/test/data/030918a2abf658f3.jpg')]

In [12]:
# The image ID is the file name up to its first "."
image_path_list[0].name.partition(".")[0]

'a63be9e4ea067c1d'

In [13]:
# IDs of the downloaded images: each file name up to its first "."
image_ids = [jpg_path.name.partition(".")[0] for jpg_path in image_path_list]
image_ids[:5]

['a63be9e4ea067c1d',
 '5a82b29382d32edf',
 '504a7622ec7c1fbb',
 'c07f825ff2cf4af1',
 '65fbccdd5c1071a3']

In [14]:
# Lookup table from Open Images label ID (the "id" column) to readable class name
class_label_dict = classes_df.set_index("id")["class"].to_dict()
class_label_dict

{'/m/011k07': 'Tortoise',
 '/m/011q46kg': 'Container',
 '/m/012074': 'Magpie',
 '/m/0120dh': 'Sea turtle',
 '/m/01226z': 'Football',
 '/m/012n7d': 'Ambulance',
 '/m/012w5l': 'Ladder',
 '/m/012xff': 'Toothbrush',
 '/m/012ysf': 'Syringe',
 '/m/0130jx': 'Sink',
 '/m/0138tl': 'Toy',
 '/m/013y1f': 'Organ (Musical Instrument)',
 '/m/01432t': 'Cassette deck',
 '/m/014j1m': 'Apple',
 '/m/014sv8': 'Human eye',
 '/m/014trl': 'Cosmetics',
 '/m/014y4n': 'Paddle',
 '/m/0152hh': 'Snowman',
 '/m/01599': 'Beer',
 '/m/01_5g': 'Chopsticks',
 '/m/015h_t': 'Human beard',
 '/m/015p6': 'Bird',
 '/m/015qbp': 'Parking meter',
 '/m/015qff': 'Traffic light',
 '/m/015wgc': 'Croissant',
 '/m/015x4r': 'Cucumber',
 '/m/015x5n': 'Radish',
 '/m/0162_1': 'Towel',
 '/m/0167gd': 'Doll',
 '/m/016m2d': 'Skull',
 '/m/0174k2': 'Washing machine',
 '/m/0174n1': 'Glove',
 '/m/0175cv': 'Tick',
 '/m/0176mf': 'Belt',
 '/m/017ftj': 'Sunglasses',
 '/m/018j2': 'Banjo',
 '/m/018p4k': 'Cart',
 '/m/018xm': 'Ball',
 '/m/01940j': 'Backpa

## Read in Open Images Label Files with Pandas

Reading the label files with `pandas` is faster than parsing them with the standard-library `csv` module.

In [15]:
# Load the classification labels of every Open Images split into one dataframe
import pathlib
import pandas as pd

data_path = "data/open_images/"
labels_list = list(pathlib.Path(data_path).glob("*/labels/classifications.csv"))

# Only ImageID and LabelName are used downstream, so skip parsing the "Source"
# and "Confidence" columns instead of loading them and dropping them afterwards.
labels_df_list = [
    pd.read_csv(labels_csv, usecols=["ImageID", "LabelName"])
    for labels_csv in labels_list
]

# Stack the per-split frames and renumber the rows from 0
labels_df = pd.concat(labels_df_list, axis=0, ignore_index=True)
labels_df.head()

Unnamed: 0,ImageID,LabelName
0,000026e7ee790996,/m/0cgh4
1,000026e7ee790996,/m/04hgtk
2,000026e7ee790996,/m/0d5gx
3,000026e7ee790996,/m/07j7r
4,000026e7ee790996,/m/04rky


In [16]:
# Total number of label rows across all loaded label files
labels_df.shape[0]

10026278

In [17]:
# Flag every label row whose image file was found among the downloaded images
labels_df["downloaded"] = labels_df["ImageID"].isin(image_ids)

# Keep only the rows for downloaded images and attach two helper columns:
# the readable class name and whether that class is one of the food classes.
downloaded_labels = labels_df.loc[labels_df["downloaded"]].assign(
    class_name=lambda rows: rows["LabelName"].map(class_label_dict),
    is_food=lambda rows: rows["class_name"].isin(open_images_food_list),
)
downloaded_labels

Unnamed: 0,ImageID,LabelName,downloaded,class_name,is_food
9597,02f2eb66e7befc62,/m/09j2d,True,Clothing,False
9598,02f2eb66e7befc62,/m/08dz3q,True,Auto part,False
9599,02f2eb66e7befc62,/m/0k4j,True,Car,False
9600,02f2eb66e7befc62,/m/02dgv,True,Door,False
9601,02f2eb66e7befc62,/m/083wq,True,Wheel,False
...,...,...,...,...,...
10023445,fd32639086efad76,/m/083wq,True,Wheel,False
10024551,fe44691e02b61fb1,/m/04gth,True,Lavender (Plant),False
10024552,fe44691e02b61fb1,/m/07j7r,True,Tree,False
10024553,fe44691e02b61fb1,/m/05s2s,True,Plant,False


In [18]:
# Number of label rows that belong to downloaded images
downloaded_labels.shape[0]

1681

In [19]:
# Number of distinct downloaded images that have at least one label row
downloaded_labels["ImageID"].nunique(dropna=False)

296

In [20]:
def get_food_and_not_food_image_path_lists(dataframe, data_dir=None):
    """Split the labelled images into food and not-food image paths.

    Every row of ``dataframe`` is one (image, label) pair, so the same image
    can appear many times. An image counts as food when at least one of its
    rows is flagged as food; it is then removed from the not-food result.

    Args:
        dataframe: Label rows. The image ID is read from the ``ImageID``
            column and the food flag from the ``is_food`` column. When a
            column of that name is missing, the positional layout is used
            instead (column 0 for the ID, column 4 for the flag).
        data_dir: Root folder searched for ``*/*/<image_id>.jpg``. Defaults
            to the notebook-level ``data_path``.

    Returns:
        A ``(food_image_paths, not_food_image_paths)`` tuple of sorted,
        duplicate-free lists of ``pathlib.Path`` objects.

    Raises:
        FileNotFoundError: If a row refers to an image ID with no matching
            ``.jpg`` file under the root folder.
    """
    root = pathlib.Path(data_path if data_dir is None else data_dir)

    # Walk the image folders once and index the files by ID, rather than
    # running a fresh filesystem glob for every label row.
    path_by_image_id = {}
    for jpg_path in root.glob("*/*/*.jpg"):
        # setdefault keeps the first file seen when an ID exists in several folders
        path_by_image_id.setdefault(jpg_path.stem, jpg_path)

    # Prefer named columns; fall back to positions for frames without those names.
    columns = dataframe.columns
    image_id_values = dataframe["ImageID"] if "ImageID" in columns else dataframe.iloc[:, 0]
    is_food_values = dataframe["is_food"] if "is_food" in columns else dataframe.iloc[:, 4]

    food_image_path_list = []
    not_food_image_path_list = []
    for image_id, is_food in zip(image_id_values, is_food_values):
        try:
            image_path = path_by_image_id[image_id]
        except KeyError:
            raise FileNotFoundError(
                f"No .jpg file found for image ID {image_id!r} under {root}"
            ) from None

        # One entry per label row; duplicates are collapsed below
        if is_food:
            food_image_path_list.append(image_path)
        else:
            not_food_image_path_list.append(image_path)

    print(f"Found {len(food_image_path_list)} food image label paths")
    print(f"Found {len(not_food_image_path_list)} not food image label paths")
    print("Removing duplicates and food image paths from not food paths...")
    food_image_path_set = set(food_image_path_list)
    # Remove food image paths from not food paths (give food label priority)
    not_food_image_path_set = set(not_food_image_path_list) - food_image_path_set
    print(f"Updated lengths:"
        f"\n-> Food images: {len(food_image_path_set)}"
        f"\n-> Not food images: {len(not_food_image_path_set)}"
        f"\n-> Total images: {len(food_image_path_set) + len(not_food_image_path_set)}")

    # Sort so the returned order is the same on every run (set order is not)
    return sorted(food_image_path_set), sorted(not_food_image_path_set)

In [21]:
# Split the downloaded images into food / not-food paths and show both counts
food_image_path_list, not_food_image_path_list = get_food_and_not_food_image_path_lists(downloaded_labels)
(len(food_image_path_list), len(not_food_image_path_list))

Found 50 food image label paths
Found 1631 not food image label paths
Removing duplicates and food image paths from not food paths...
Updated lengths:
-> Food images: 29
-> Not food images: 267
-> Total images: 296


(29, 267)

In [22]:
# Move images to food or not food folder
from shutil import copy2
import os
def copy_image_to_folder(dir_name, image_path, is_food):
    """Copy one image into the food or not-food subfolder of ``dir_name``.

    Args:
        dir_name: Output root folder; the class subfolder is created below it
            when it does not exist yet.
        image_path: Image to copy, as a ``pathlib.Path`` or a plain string.
        is_food: Truthy copies into ``food_images``, falsy into
            ``not_food_images``.
    """
    # Accept plain strings as well as Path objects (``.name`` is needed below)
    image_path = pathlib.Path(image_path)

    targ_class = "food_images" if is_food else "not_food_images"
    destination_folder = pathlib.Path(dir_name).joinpath(targ_class)
    # exist_ok makes a separate "does it exist?" check unnecessary
    destination_folder.mkdir(parents=True, exist_ok=True)

    destination_path = destination_folder.joinpath(image_path.name)
    print(f"Copying {image_path} to {destination_path}...")
    copy2(image_path, destination_path)

In [23]:
def copy_image_list_to_folder(dir_name, image_path_list, is_food):
    """Copy every image in ``image_path_list`` and print how many were copied.

    Args:
        dir_name: Output root folder, passed through to ``copy_image_to_folder``.
        image_path_list: Iterable of image paths to copy; may be empty.
        is_food: Passed through to ``copy_image_to_folder`` and used to label
            the printed summary.
    """
    # Start at 0 so the summary still works when there is nothing to copy
    copied_count = 0
    for copied_count, image_path in enumerate(image_path_list, start=1):
        copy_image_to_folder(dir_name=dir_name,
            image_path=image_path,
            is_food=is_food
        )
    print(f"Total {'food' if is_food else 'not_food'} images copied: {copied_count}")

In [24]:
# Copy both groups of images into their class subfolder below SAVE_DIR
SAVE_DIR = "data/open_images_extracted"

copy_image_list_to_folder(SAVE_DIR, food_image_path_list, True)

copy_image_list_to_folder(SAVE_DIR, not_food_image_path_list, False)

Copying data/open_images/validation/data/c1d5dd8b38a5e706.jpg to data/open_images_extracted/food_images/c1d5dd8b38a5e706.jpg...
Copying data/open_images/train/data/28c4da7eb2924364.jpg to data/open_images_extracted/food_images/28c4da7eb2924364.jpg...
Copying data/open_images/test/data/0fc0097be2976acb.jpg to data/open_images_extracted/food_images/0fc0097be2976acb.jpg...
Copying data/open_images/validation/data/468781ab781d74dc.jpg to data/open_images_extracted/food_images/468781ab781d74dc.jpg...
Copying data/open_images/validation/data/69d4899d910c1478.jpg to data/open_images_extracted/food_images/69d4899d910c1478.jpg...
Copying data/open_images/validation/data/115150a6ba4eb0ca.jpg to data/open_images_extracted/food_images/115150a6ba4eb0ca.jpg...
Copying data/open_images/test/data/14cba8a0c786d297.jpg to data/open_images_extracted/food_images/14cba8a0c786d297.jpg...
Copying data/open_images/train/data/3e386a35b26170be.jpg to data/open_images_extracted/food_images/3e386a35b26170be.jpg..

In [9]:
# Report how many entries each folder below the extracted-images directory holds.
# The walk variable is named `parent_dir` so the builtin `dir` is not shadowed.
for parent_dir, subdirs, files in os.walk("data/open_images_extracted/"):
    for subdir in subdirs:
        print(f"Total {subdir} images: {len(os.listdir(os.path.join(parent_dir, subdir)))}")

Total food_images images: 29
Total not_food_images images: 267
