# Notebook to explore the naming of the class names for FoodVision

* **Goal:** Get the class names for FoodVision and see which ones can be broken down into something more specific.


In [7]:
# Append the upper level directory to sys
import os
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

import torch

from utils.misc import sort_dict_by_values

# Get config
from configs.default_config import config

# `args` is an alias for the project config; the wandb project, job type and
# labels artifact name below are all read from it.
args = config

# Connect to GCP
# NOTE(review): the key path is relative to the notebook's working directory,
# so this cell presumably has to be run from the notebook's own folder — confirm.
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
# (`wandb_load_artifact` is imported here but not used in the cells visible below.)
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

# NOTE(review): the notes/tags mention auto-labelling while this notebook's stated
# goal is exploring class names — looks carried over from another notebook; confirm.
notes = f"autolabel new images"

run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['auto_label_new_images'],
                 notes=notes)

# Load the labels artifact for this run; every later cell works from these names
# (`annotations`, `class_names`, `class_dict`, `reverse_class_dict`, `labels_path`).
annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)

# Number of distinct label values in the loaded annotations.
len(annotations.label.unique())

[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[INFO] Labels directory: ./artifacts/food_vision_labels:v82
[INFO] Labels path: artifacts/food_vision_labels:v82/annotations.csv
[INFO] Working with: 361 classes


361

In [8]:
# # Get list of images in foodvision/artifacts/food_vision_199_classes_images:v19
# image_path_list = list(Path("../artifacts/food_vision_199_classes_images:v19").rglob("*.jp*g"))

# # Convert image paths to a list of image paths name
# image_path_list = [str(image_path.name) for image_path in image_path_list]
# len(image_path_list)

In [9]:
# # Remove the labels that aren't paired with an image
# updated_annotations = annotations.copy()
# print(f"Length of annotations: {len(updated_annotations)}")

# # Remove updated_annotations that aren't in image_path_list
# updated_annotations = updated_annotations[updated_annotations.image_name.isin(image_path_list)]
# print(f"Length of annotations after removing labels that aren't in image_path_list: {len(updated_annotations)}")

In [10]:
GS_BUCKET = config.gs_bucket_name

# Build the Google Cloud Storage URI with explicit "/" separators.
# `os.path.join` is meant for local filesystem paths and uses the platform's
# separator, which would produce an invalid "gs://" URI on Windows.
food_not_food_image_path = f"gs://{GS_BUCKET}/food_not_food/images"
food_not_food_image_path

'gs://food_vision_bucket_with_object_versioning/food_not_food/images'

In [11]:
# # Track Food Not Food images in Wandb
# from utils.wandb_utils import wandb_add_artifact_with_reference

# wandb_add_artifact_with_reference(wandb_run=run,
#                                   artifact_name="food_not_food_images",
#                                   artifact_type="dataset",
#                                   description="Images of food and not food",
#                                   reference_path=food_not_food_image_path)

In [12]:
class_dict

{0: 'achacha',
 1: 'almond_butter',
 2: 'almonds',
 3: 'apple_custard',
 4: 'apple_green',
 5: 'apple_red',
 6: 'apricot',
 7: 'artichoke',
 8: 'asparagus',
 9: 'avocado',
 10: 'bacon',
 11: 'bacon_and_egg_burger',
 12: 'bagel',
 13: 'baklava',
 14: 'banana',
 15: 'banana_bread',
 16: 'barbecue_sauce',
 17: 'basil',
 18: 'bay_leaves',
 19: 'bean_snake',
 20: 'bean_sprouts',
 21: 'beans',
 22: 'beef_corned',
 23: 'beef_curry',
 24: 'beef_diced',
 25: 'beef_kebab',
 26: 'beef_mince',
 27: 'beef_patty',
 28: 'beef_roast',
 29: 'beef_stir_fry',
 30: 'beer',
 31: 'beetroot',
 32: 'biltong',
 33: 'biscuits',
 34: 'black_pepper',
 35: 'blackberries',
 36: 'blueberries',
 37: 'bok_choy',
 38: 'brazil_nuts',
 39: 'bread',
 40: 'bread_naan',
 41: 'breadfruit',
 42: 'broccoli',
 43: 'broccolini',
 44: 'brownie',
 45: 'brussel_sprouts',
 46: 'burrito',
 47: 'butter',
 48: 'cabbage_green',
 49: 'cabbage_red',
 50: 'cabbage_savoy',
 51: 'calamari',
 52: 'candy',
 53: 'cape_gooseberries',
 54: 'capsi

In [13]:
annotations.image_source.value_counts()

vegfru_dataset                    71090
clip_retrieval_laion_5b_knn       46274
internet_download                 23992
manual_upload                     15371
bing_image_search_api_download     7833
Name: image_source, dtype: int64

In [14]:
# Classes with fewer than 100 manually uploaded images, largest count first.
# NOTE(review): the counts come from groupby().size(), so a class with no
# "manual_upload" rows at all presumably does not appear here — confirm.
image_source = "manual_upload"

# One row per (class_name, image_source) pair with its number of annotations
class_image_source_counts = (
    annotations
    .groupby(["class_name", "image_source"])
    .size()
    .reset_index(name="counts")
)

# Keep only the chosen image source, then only the classes under the threshold
is_selected_source = class_image_source_counts["image_source"] == image_source
class_image_source_counts_filtered = class_image_source_counts.loc[is_selected_source, ["class_name", "counts"]]
is_under_100 = class_image_source_counts_filtered["counts"] < 100
class_image_source_counts_filtered = class_image_source_counts_filtered.loc[is_under_100].sort_values(by="counts", ascending=False)

# Turn the filtered counts into a {class_name: counts} dictionary
class_image_source_counts_filtered_dict = dict(
    zip(class_image_source_counts_filtered["class_name"],
        class_image_source_counts_filtered["counts"])
)
class_image_source_counts_filtered_dict

{'plum': 95,
 'lime': 90,
 'coffee': 85,
 'muffin': 82,
 'steak': 81,
 'eggs': 80,
 'water': 79,
 'pumpkin': 79,
 'watermelon': 78,
 'orange_juice': 77,
 'sandwich_toasted': 70,
 'onion_red': 70,
 'chicken_thighs': 66,
 'milk': 64,
 'rice_white': 63,
 'dates': 63,
 'coleslaw': 62,
 'tea': 60,
 'cookies': 60,
 'apple_green': 59,
 'cheese': 58,
 'potato_royal_blue': 58,
 'sausages': 58,
 'potato_white_sweet': 57,
 'salt': 57,
 'olives': 57,
 'mushroom_shiitake': 54,
 'potato_purple_congo': 53,
 'butter': 51,
 'pickles': 50,
 'wombok': 50,
 'nectarine': 50,
 'bacon': 49,
 'sushi': 48,
 'potato_kipfler': 48,
 'bacon_and_egg_burger': 47,
 'beef_curry': 45,
 'pomegranate': 42,
 'turnip': 41,
 'chocolate': 41,
 'chicken_bbq': 40,
 'curry_chicken': 40,
 'melon_candy': 40,
 'brownie': 40,
 'yoghurt': 37,
 'honey': 37,
 'kohlrabi': 36,
 'peach': 36,
 'green_beans': 36,
 'chicken_breast': 35,
 'biltong': 35,
 'potato_purple_sweet': 33,
 'mushroom_king_trumpet': 31,
 'melon_bitter': 31,
 'potato_b

In [7]:
# Per-class counts for the selected image source (no threshold this time),
# sorted from most to fewest images, plus a {class_name: counts} lookup.
is_selected_source = class_image_source_counts["image_source"] == image_source
class_image_source_counts_filtered = class_image_source_counts.loc[is_selected_source, ["class_name", "counts"]]
class_image_source_countes_manual_upload = class_image_source_counts_filtered.sort_values(by="counts", ascending=False)
class_image_source_countes_manual_upload_dict = dict(
    zip(class_image_source_countes_manual_upload["class_name"],
        class_image_source_countes_manual_upload["counts"])
)

In [22]:
# Write class_dict out as a CSV: the dict keys become the index column and
# the class names go in a single "class_name" column.
class_dict_df = pd.Series(class_dict, name="class_name").to_frame()
class_dict_df.to_csv("class_dict.csv")

## Import class names

In [26]:
harris_farm = pd.read_csv("nutrify_food_icons_to_create.csv")
print(f"Num rows: {len(harris_farm)}")
harris_farm.head()

Num rows: 146


Unnamed: 0,icon_name,category,done,notes,folder_link,image_link,last_edited,revisions,comments
0,eggs,animal_product,1.0,,https://drive.google.com/drive/folders/1S_Jx6C...,nutrify_icon_egg.png,2/18/2023,nutrify_icon_egg_revision01.png,
1,apple_green,fruit,1.0,,https://drive.google.com/drive/folders/1kR3etD...,nutrify_icon_apple_green.png,2/18/2023,nutrify_icon_apple_green02_revision01.png,Could we please get slices of Apple/green appl...
2,apple_red,fruit,1.0,,https://drive.google.com/drive/folders/1EtnwDJ...,nutrify_icon_apple_red.png,2/18/2023,nutrify_icon_apple_red02_revision01.svg,
3,apricot,fruit,1.0,,https://drive.google.com/drive/folders/1QLw9dl...,nutrify_icon_apricot02.png,2/18/2023,nutrify_icon_apricot02_revision01.png,
4,avocado,fruit,1.0,,https://drive.google.com/drive/folders/1ZEMBcW...,nutrify_icon_avocado02.png,2/18/2023,nutrify_icon_avocado02_revision01.png,


In [27]:
# Keep only the rows that actually have an icon_name (drop the NaN ones)
has_icon_name = harris_farm["icon_name"].notna()
harris_farm = harris_farm[has_icon_name]
print(f"Num rows: {len(harris_farm)}")

Num rows: 142


In [28]:
# Only keep columns "icon_name"
harris_farm = harris_farm[["icon_name"]]
harris_farm.head()

Unnamed: 0,icon_name
0,eggs
1,apple_green
2,apple_red
3,apricot
4,avocado


In [45]:
harris_farm_classes = harris_farm.icon_name.unique()

# Icon names that are not compared as food classes: placeholder rows containing
# "example", the four seasons, and the egg preparation variants.
seasons = ["spring", "summer", "autumn", "winter"]
egg_variants = ['egg_fried',
 'egg_hard_boiled',
 'egg_raw',
 'egg_soft_boiled',]
excluded_icon_names = set(seasons) | set(egg_variants)
harris_farm_classes = [x for x in harris_farm_classes
                       if "example" not in x and x not in excluded_icon_names]

# Rename icon names so they line up with the names used in `class_names`
# (icon name -> name to compare with).
# The renames are applied to whole names only. The previous substring-based
# `str.replace` would also rewrite a name that was already in its target form
# (e.g. "brussel_sprouts" -> "brussel_sproutss", "parsnips" -> "parsnipss").
harris_farm_renames = {
    "brussel_sprout": "brussel_sprouts",
    "enoki_mushrooms": "enoki_mushroom",
    "parsnip": "parsnips",
    "kiwifruit": "kiwi_fruit",
    "taroroot": "taro",
    "star_fruit": "starfruit",
    "snowpeas": "snow_peas",
    "lettuce_cos": "cos_lettuce",
    "lettuce_iceberg": "iceberg_lettuce",
    "hazelnut": "hazelnuts",
}
harris_farm_classes = sorted(harris_farm_renames.get(x, x) for x in harris_farm_classes)

# Compare the unique class names to the class names in the annotations
known_class_names = set(class_names)  # set membership instead of scanning the list per name
harris_farm_class_names_not_in_nutrify = sorted([class_name for class_name in harris_farm_classes if class_name not in known_class_names])
len(harris_farm_class_names_not_in_nutrify), harris_farm_class_names_not_in_nutrify

(28,
 ['artichoke',
  'bay_leaves',
  'bean_sprouts',
  'cabbage_green',
  'capsicum_green',
  'capsicum_orange',
  'capsicum_red',
  'capsicum_yellow',
  'chervil',
  'chicory',
  'choy_sum',
  'curry_leaves',
  'dill',
  'jalapeno',
  'lemongrass',
  'lime_leaves',
  'marjoram',
  'oregano',
  'peanuts',
  'quince',
  'sage',
  'silverbeet',
  'swede',
  'tarragon',
  'thyme',
  'turmeric',
  'witlof',
  'wombok'])

In [46]:
# Notes on Nutrify classes
# Spread out: ✅
# cabbage -> cabbage_green/cabbage_red
# radish -> radish_red/radish_white (also called daikon_radish)
# lettuce -> lettuce_cos/lettuce_iceberg/lettuce_oakleaf 
# capsicum -> capsicum_green/capsicum_red/capsicum_yellow/capsicum_orange

# Merge: ✅
# nectarine + nectarines -> nectarine
# beef + steak -> steak (steak = cut of meat, beef = whole animal)

# Rename: ✅
# white_wine -> wine_white
# red_wine -> wine_red
# cherry_tomato -> tomato_cherry
# mashed_potato -> potato_mashed
# sweet_potato -> potato_sweet
# purple_sweet_potato -> potato_purple_sweet 
# nuts -> mixed_nuts
# roast_beef -> beef_roast
# roast_pork -> pork_roast
# roast_potatoes -> potato_roast
# enoki_mushroom -> mushroom_enoki

# New:
# rice -> rice_white, rice_brown, rice_fried
# beef_pattie -> beef_patty

# Multi names:
# onion_green -> spring_onion, bunching_onion
# shallots -> shallot, eschalot (also called eschallot), french shallot, scallion
# soda -> soft drink


cabbage_options = ["cabbage_green", "cabbage_red"]
radish_options = ["radish_red", "radish_white"]
lettuce_options = ["lettuce_cos", "lettuce_iceberg", "lettuce_oakleaf"]

lettuce_rename = {"oakleaf_lettuce": "lettuce_oakleaf",
                  "cos_lettuce": "lettuce_cos",
                  "iceberg_lettuce": "lettuce_iceberg"}

capsicum_options = ["capsicum_green", "capsicum_red", "capsicum_yellow", "capsicum_orange"]


# For VegFru
# "globe_artichoke" in VegFru is the same as "artichoke" in Harris Farm
# "chicory" in VegFru is the same as "chicory" in Harris Farm

In [47]:
# Union of the existing class names and the Harris Farm names that are missing
# from them, de-duplicated and sorted alphabetically.
combined_class_names = sorted(set(class_names).union(harris_farm_class_names_not_in_nutrify))
len(combined_class_names), combined_class_names

(263,
 ['almond_butter',
  'almonds',
  'apple_green',
  'apple_red',
  'apricot',
  'artichoke',
  'asparagus',
  'avocado',
  'bacon',
  'bacon_and_egg_burger',
  'bagel',
  'baklava',
  'banana',
  'banana_bread',
  'barbecue_sauce',
  'basil',
  'bay_leaves',
  'bean_sprouts',
  'beans',
  'beef',
  'beef_curry',
  'beef_mince',
  'beef_stir_fry',
  'beer',
  'beetroot',
  'biltong',
  'blackberries',
  'blueberries',
  'bok_choy',
  'bread',
  'bread_naan',
  'broccoli',
  'broccolini',
  'brownie',
  'brussel_sprouts',
  'burrito',
  'butter',
  'cabbage',
  'cabbage_green',
  'cabbage_red',
  'calamari',
  'candy',
  'capsicum',
  'capsicum_green',
  'capsicum_orange',
  'capsicum_red',
  'capsicum_yellow',
  'carrot',
  'cashews',
  'cauliflower',
  'celery',
  'cheese',
  'cheeseburger',
  'cherries',
  'cherry_tomato',
  'chervil',
  'chicken_breast',
  'chicken_thighs',
  'chicken_wings',
  'chicory',
  'chilli',
  'chimichurri',
  'chives',
  'chocolate',
  'chocolate_cake'

## Update annotations

In [65]:
original_annotations = annotations.copy()
updated_annotations = original_annotations.copy()

In [66]:
# Select the rows whose class name is exactly "lettuce" and remember their index labels
is_plain_lettuce = updated_annotations['class_name'].eq("lettuce")
lettuce_rows = updated_annotations.loc[is_plain_lettuce]
lettuce_indices = lettuce_rows.index

# Every generic "lettuce" row is relabelled as "lettuce_iceberg"
updated_annotations.loc[lettuce_indices, 'class_name'] = "lettuce_iceberg"

In [67]:
# How many distinct class names are there now?
updated_annotations["class_name"].nunique(dropna=False)

235

In [68]:
# Old lettuce class name -> new "lettuce_<variety>" name
lettuce_rename = {"oakleaf_lettuce": "lettuce_oakleaf",
                  "cos_lettuce": "lettuce_cos",
                  "iceberg_lettuce": "lettuce_iceberg"}

# Rename every row whose class name contains one of the old lettuce names.
# regex=False matches the key as literal text rather than as a regular expression;
# na=False treats a missing class name as "no match" instead of producing an NA
# mask, which boolean indexing rejects.
for key, value in lettuce_rename.items():
    contains_old_name = updated_annotations['class_name'].str.contains(key, regex=False, na=False)
    updated_annotations.loc[contains_old_name, 'class_name'] = value

# How many rows contain "lettuce"?
int(updated_annotations['class_name'].str.contains("lettuce", regex=False, na=False).sum())


898

In [69]:
# Split the generic "capsicum" class: each row named exactly "capsicum" gets one
# of the colour-specific names below.
# NOTE(review): the colour is drawn at random, not taken from the image, so these
# rows presumably still need manual relabelling — confirm.
capsicum_options = ["capsicum_green", "capsicum_red", "capsicum_yellow", "capsicum_orange"]

capsicum_rows = updated_annotations[updated_annotations['class_name'] == "capsicum"]
capsicum_indices = capsicum_rows.index

# Use a seeded generator so a fresh Restart & Run All reproduces the same assignment
capsicum_rng = np.random.default_rng(seed=42)
updated_annotations.loc[capsicum_indices, 'class_name'] = capsicum_rng.choice(capsicum_options, size=len(capsicum_indices))

# How many rows contain "capsicum"?
len(updated_annotations[updated_annotations['class_name'].str.contains("capsicum")])

188

In [70]:
# cabbage -> cabbage_green/cabbage_red
# Each row named exactly "cabbage" gets one of the two specific names.
# NOTE(review): the choice is random, not based on the image, so these rows
# presumably still need manual relabelling — confirm.
cabbage_options = ["cabbage_green", "cabbage_red"]

cabbage_rows = updated_annotations[updated_annotations['class_name'] == "cabbage"]
cabbage_indices = cabbage_rows.index

# Use a seeded generator so a fresh Restart & Run All reproduces the same assignment
cabbage_rng = np.random.default_rng(seed=42)
updated_annotations.loc[cabbage_indices, 'class_name'] = cabbage_rng.choice(cabbage_options, size=len(cabbage_indices))

# How many rows contain "cabbage"?
print(len(updated_annotations[updated_annotations['class_name'].str.contains("cabbage")]))
updated_annotations[updated_annotations['class_name'].str.contains("cabbage")]



570


Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
4324,test/cabbage/be1e1d5a-ffac-4765-93bf-b1166e887...,be1e1d5a-ffac-4765-93bf-b1166e887690.jpg,cabbage_red,33,test,,,,,,internet_download
4325,test/cabbage/04ebdc28-b7e3-4fdd-898d-ffee91fc9...,04ebdc28-b7e3-4fdd-898d-ffee91fc94a6.jpg,cabbage_red,33,test,,,,,,internet_download
4326,test/cabbage/0850c00d-d095-48f5-9487-b0bbb4770...,0850c00d-d095-48f5-9487-b0bbb4770db0.jpg,cabbage_green,33,test,,,,,,internet_download
4327,test/cabbage/a522de68-0465-4a41-b4ff-080d5c77a...,a522de68-0465-4a41-b4ff-080d5c77a1b0.jpg,cabbage_red,33,test,,,,,,internet_download
4328,test/cabbage/d76464ac-af9b-4c35-aa77-193eba9a4...,d76464ac-af9b-4c35-aa77-193eba9a4576.jpg,cabbage_green,33,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
106417,all_nutrify_vegfru/cabbage_red/1057a6e5-2642-4...,1057a6e5-2642-4d92-be09-42549d9cf8f8.jpeg,cabbage_red,234,test,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
106418,all_nutrify_vegfru/cabbage_red/4339fbcc-eeee-4...,4339fbcc-eeee-4add-ae8f-47d2fa9c12ea.jpeg,cabbage_red,234,test,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
106419,all_nutrify_vegfru/cabbage_red/2bccd51a-cfdd-4...,2bccd51a-cfdd-48ab-bb7a-7ed09db137b9.jpeg,cabbage_red,234,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
106420,all_nutrify_vegfru/cabbage_red/e19dbffb-f18e-4...,e19dbffb-f18e-4c83-ae7a-57c073cad300.jpeg,cabbage_red,234,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset


In [71]:
# radish -> radish_red/radish_white (also called daikon_radish)
# Each row named exactly "radish" gets one of the two specific names.
# NOTE(review): the choice is random, not based on the image, so these rows
# presumably still need manual relabelling — confirm.
radish_options = ["radish_red", "radish_white"]

radish_rows = updated_annotations[updated_annotations['class_name'] == "radish"]
radish_indices = radish_rows.index

# Use a seeded generator so a fresh Restart & Run All reproduces the same assignment
radish_rng = np.random.default_rng(seed=42)
updated_annotations.loc[radish_indices, 'class_name'] = radish_rng.choice(radish_options, size=len(radish_indices))

# Show the rows whose class name contains "radish"
# (the bare `len(...)` that used to sit here was never displayed, so it was removed)
updated_annotations[updated_annotations['class_name'].str.contains("radish")]

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
2097,test/daikon_radish/ff48d84c-d20f-4166-a12f-cb1...,ff48d84c-d20f-4166-a12f-cb1158bac3cd.jpg,daikon_radish,64,test,,,,,,internet_download
2098,test/daikon_radish/fbe10a12-5171-4f91-8789-8d8...,fbe10a12-5171-4f91-8789-8d8790a4ad07.jpg,daikon_radish,64,test,,,,,,internet_download
2099,test/daikon_radish/aebe0abd-fe6f-41db-b83d-d26...,aebe0abd-fe6f-41db-b83d-d26fc327a724.jpg,daikon_radish,64,test,,,,,,internet_download
2100,test/daikon_radish/29c636b5-9bfd-4299-8e25-c09...,29c636b5-9bfd-4299-8e25-c09728ee7edf.jpg,daikon_radish,64,test,,,,,,internet_download
2101,test/daikon_radish/2e27848e-858e-45d7-8495-5c8...,2e27848e-858e-45d7-8495-5c8fc91d8e89.jpg,daikon_radish,64,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
96543,all_nutrify_vegfru/radish_red/9ef1a88c-2282-43...,9ef1a88c-2282-430c-a785-4b53fb2f79bc.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96544,all_nutrify_vegfru/radish_red/39777b14-fc09-48...,39777b14-fc09-485f-916d-037270d45586.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96545,all_nutrify_vegfru/radish_red/ca1a4aa5-4924-49...,ca1a4aa5-4924-4991-bfb9-90c35cbfc9d5.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96546,all_nutrify_vegfru/radish_red/9954879b-6392-41...,9954879b-6392-41f2-bfb7-8883fcb4e6b1.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset


In [72]:
# daikon_radish -> radish_white (also called daikon_radish)
daikon_radish_options = ["radish_white"]

daikon_radish_rows = updated_annotations[updated_annotations['class_name'] == "daikon_radish"]
daikon_radish_indices = daikon_radish_rows.index

# There is a single target name, so assign it directly instead of drawing a
# "random" choice from a one-element list.
updated_annotations.loc[daikon_radish_indices, 'class_name'] = daikon_radish_options[0]

# Show the rows whose class name contains "radish"
# (the bare `len(...)` that used to sit here was never displayed, so it was removed)
updated_annotations[updated_annotations['class_name'].str.contains("radish")]

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
2097,test/daikon_radish/ff48d84c-d20f-4166-a12f-cb1...,ff48d84c-d20f-4166-a12f-cb1158bac3cd.jpg,radish_white,64,test,,,,,,internet_download
2098,test/daikon_radish/fbe10a12-5171-4f91-8789-8d8...,fbe10a12-5171-4f91-8789-8d8790a4ad07.jpg,radish_white,64,test,,,,,,internet_download
2099,test/daikon_radish/aebe0abd-fe6f-41db-b83d-d26...,aebe0abd-fe6f-41db-b83d-d26fc327a724.jpg,radish_white,64,test,,,,,,internet_download
2100,test/daikon_radish/29c636b5-9bfd-4299-8e25-c09...,29c636b5-9bfd-4299-8e25-c09728ee7edf.jpg,radish_white,64,test,,,,,,internet_download
2101,test/daikon_radish/2e27848e-858e-45d7-8495-5c8...,2e27848e-858e-45d7-8495-5c8fc91d8e89.jpg,radish_white,64,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
96543,all_nutrify_vegfru/radish_red/9ef1a88c-2282-43...,9ef1a88c-2282-430c-a785-4b53fb2f79bc.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96544,all_nutrify_vegfru/radish_red/39777b14-fc09-48...,39777b14-fc09-485f-916d-037270d45586.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96545,all_nutrify_vegfru/radish_red/ca1a4aa5-4924-49...,ca1a4aa5-4924-4991-bfb9-90c35cbfc9d5.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
96546,all_nutrify_vegfru/radish_red/9954879b-6392-41...,9954879b-6392-41f2-bfb7-8883fcb4e6b1.jpeg,radish_red,215,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset


In [74]:
# Which class names contain "radish"?
# The names are read straight from `updated_annotations`: `updated_class_names`
# is only assigned in a later cell, so relying on it here breaks a fresh
# Restart & Run All.
for class_name in updated_annotations['class_name'].unique():
    if "radish" in class_name:
        print(class_name)

radish_white
radish_red


In [75]:
# Fold the plural "nectarines" class into the singular "nectarine" class
is_nectarines = updated_annotations['class_name'].eq("nectarines")
updated_annotations.loc[is_nectarines, 'class_name'] = "nectarine"


In [79]:
# Fold "beef" into "steak" (steak = cut of meat, beef = whole animal)
is_beef = updated_annotations['class_name'].eq("beef")
updated_annotations.loc[is_beef, 'class_name'] = "steak"

In [90]:
# Exact-name renames (old class name -> new class name).
# No new name is also an old name, so the order of application does not matter.
class_renames = {
    "white_wine": "wine_white",
    "red_wine": "wine_red",
    "cherry_tomato": "tomato_cherry",
    "mashed_potato": "potato_mashed",
    "sweet_potato": "potato_sweet",
    "purple_sweet_potato": "potato_purple_sweet",
    "nuts": "mixed_nuts",
    "roast_beef": "beef_roast",
    "roast_pork": "pork_roast",
    "roast_potatoes": "potato_roast",
    "enoki_mushroom": "mushroom_enoki",
}

# Apply each rename to the rows whose class name matches the old name exactly
for old_name, new_name in class_renames.items():
    has_old_name = updated_annotations['class_name'] == old_name
    updated_annotations.loc[has_old_name, 'class_name'] = new_name

In [93]:
# Get unique class_names
def get_unique_class_names(df=None):
    """Print and return the sorted unique class names of an annotations DataFrame.

    Args:
        df: DataFrame with a ``class_name`` column. When omitted, the notebook-level
            ``updated_annotations`` is used, so existing zero-argument calls behave
            as before.

    Returns:
        A sorted list of the unique ``class_name`` values.
    """
    if df is None:
        # Fall back to the notebook state the original version was hard-wired to.
        df = updated_annotations
    unique_class_names = sorted(df["class_name"].unique())
    print(len(unique_class_names), unique_class_names)
    return unique_class_names

updated_class_names = get_unique_class_names()

233 ['almond_butter', 'almonds', 'apple_green', 'apple_red', 'apricot', 'asparagus', 'avocado', 'bacon', 'bacon_and_egg_burger', 'bagel', 'baklava', 'banana', 'banana_bread', 'barbecue_sauce', 'basil', 'beans', 'beef_curry', 'beef_mince', 'beef_roast', 'beef_stir_fry', 'beer', 'beetroot', 'biltong', 'blackberries', 'blueberries', 'bok_choy', 'bread', 'bread_naan', 'broccoli', 'broccolini', 'brownie', 'brussel_sprouts', 'burrito', 'butter', 'cabbage_green', 'cabbage_red', 'calamari', 'candy', 'capsicum_green', 'capsicum_orange', 'capsicum_red', 'capsicum_yellow', 'carrot', 'cashews', 'cauliflower', 'celery', 'cheese', 'cheeseburger', 'cherries', 'chicken_breast', 'chicken_thighs', 'chicken_wings', 'chilli', 'chimichurri', 'chives', 'chocolate', 'chocolate_cake', 'coconut', 'coffee', 'coleslaw', 'cookies', 'coriander', 'corn', 'corn_chips', 'cream', 'croissant', 'crumbed_chicken', 'cucumber', 'cupcake', 'curry_chicken', 'dates', 'donuts', 'dragonfruit', 'edamame', 'eggplant', 'eggs', 'en

In [95]:
# Updated class names plus the Harris Farm names missing from the original
# class names, de-duplicated and sorted alphabetically.
updated_class_names_and_harris_farm = sorted(set(updated_class_names).union(harris_farm_class_names_not_in_nutrify))
len(updated_class_names_and_harris_farm), updated_class_names_and_harris_farm

(256,
 ['almond_butter',
  'almonds',
  'apple_green',
  'apple_red',
  'apricot',
  'artichoke',
  'asparagus',
  'avocado',
  'bacon',
  'bacon_and_egg_burger',
  'bagel',
  'baklava',
  'banana',
  'banana_bread',
  'barbecue_sauce',
  'basil',
  'bay_leaves',
  'bean_sprouts',
  'beans',
  'beef_curry',
  'beef_mince',
  'beef_roast',
  'beef_stir_fry',
  'beer',
  'beetroot',
  'biltong',
  'blackberries',
  'blueberries',
  'bok_choy',
  'bread',
  'bread_naan',
  'broccoli',
  'broccolini',
  'brownie',
  'brussel_sprouts',
  'burrito',
  'butter',
  'cabbage_green',
  'cabbage_red',
  'calamari',
  'candy',
  'capsicum_green',
  'capsicum_orange',
  'capsicum_red',
  'capsicum_yellow',
  'carrot',
  'cashews',
  'cauliflower',
  'celery',
  'cheese',
  'cheeseburger',
  'cherries',
  'chervil',
  'chicken_breast',
  'chicken_thighs',
  'chicken_wings',
  'chicory',
  'chilli',
  'chimichurri',
  'chives',
  'chocolate',
  'chocolate_cake',
  'choy_sum',
  'coconut',
  'coffee',

In [146]:
# Brand-new classes to introduce
classes_to_add = ["hash_brown", "rice_white", "rice_brown", "rice_fried"]

# Drop the generic "rice" name (it is replaced by the three rice_* classes above),
# then merge in the new classes and sort the result alphabetically.
class_names_without_rice = set(updated_class_names_and_harris_farm) - {"rice"}
updated_class_names_and_harris_farm = sorted(class_names_without_rice | set(classes_to_add))

len(updated_class_names_and_harris_farm), updated_class_names_and_harris_farm

(259,
 ['almond_butter',
  'almonds',
  'apple_green',
  'apple_red',
  'apricot',
  'artichoke',
  'asparagus',
  'avocado',
  'bacon',
  'bacon_and_egg_burger',
  'bagel',
  'baklava',
  'banana',
  'banana_bread',
  'barbecue_sauce',
  'basil',
  'bay_leaves',
  'bean_sprouts',
  'beans',
  'beef_curry',
  'beef_mince',
  'beef_roast',
  'beef_stir_fry',
  'beer',
  'beetroot',
  'biltong',
  'blackberries',
  'blueberries',
  'bok_choy',
  'bread',
  'bread_naan',
  'broccoli',
  'broccolini',
  'brownie',
  'brussel_sprouts',
  'burrito',
  'butter',
  'cabbage_green',
  'cabbage_red',
  'calamari',
  'candy',
  'capsicum_green',
  'capsicum_orange',
  'capsicum_red',
  'capsicum_yellow',
  'carrot',
  'cashews',
  'cauliflower',
  'celery',
  'cheese',
  'cheeseburger',
  'cherries',
  'chervil',
  'chicken_breast',
  'chicken_thighs',
  'chicken_wings',
  'chicory',
  'chilli',
  'chimichurri',
  'chives',
  'chocolate',
  'chocolate_cake',
  'choy_sum',
  'coconut',
  'coffee',

In [None]:
# New
# hash_brown 
# rice -> rice_white, rice_brown, rice_fried
# beef_pattie -> beef_patty

# Next
# Merge updated class_names with Nutrify original class names (replace and rename and train another model)
# Get images for new Harris Farm class names and upload to model
# Combine all class names into largest model yet
# Start creating a dictionary/JSON of class names and their synonyms

In [142]:
from typing import List, Dict, Tuple

def get_updated_class_names_class_dict_and_reverse_class_dict(df: pd.DataFrame) -> Tuple[List[str], Dict[int, str], Dict[str, int]]:
    """Build the class name list and both lookup dicts from ``df["class_name"]``.

    Args:
        df: DataFrame with a ``class_name`` column.

    Returns:
        A tuple of (sorted unique class names, {index: class_name},
        {class_name: index}), where the index is the position of the name in
        the sorted list.
    """
    updated_class_names = sorted(df["class_name"].unique())
    # Number the names by their position in the sorted list...
    updated_class_dict = dict(enumerate(updated_class_names))
    # ...and invert that numbering for the name -> index lookup.
    updated_reverse_class_dict = {name: index for index, name in updated_class_dict.items()}
    assert len(updated_class_dict) == len(updated_reverse_class_dict), "Class dict and reverse class dict are not the same length"
    return updated_class_names, updated_class_dict, updated_reverse_class_dict

def map_updated_class_dict_to_updated_annotations(df: pd.DataFrame, updated_reverse_class_dict: Dict[str, int]) -> pd.DataFrame:
    """Map updated class dict to updated annotations dataframe.

    For example, go from {"apple_red": 1} -> df["label"] = 1

    Args:
        df: Annotations dataframe with a `class_name` column.
        updated_reverse_class_dict: Mapping of class name -> integer label.

    Returns:
        A copy of `df` whose `label` column holds the mapped integer label for
        each row's `class_name`. The input dataframe is not modified.

    Raises:
        AssertionError: If any `class_name` in `df` has no entry in
            `updated_reverse_class_dict`, or if the number of unique labels
            does not match the number of entries in `updated_reverse_class_dict`.
    """
    mapped_labels = df['class_name'].map(updated_reverse_class_dict)

    # Series.map yields NaN for names missing from the dict. NaN also counts as a
    # "unique" value, so the length check below could pass with NaN labels present.
    # Check for unmapped names explicitly before assigning.
    unmapped_class_names = sorted(df.loc[mapped_labels.isna(), 'class_name'].astype(str).unique())
    assert not unmapped_class_names, f"Class names missing from updated reverse class dict: {unmapped_class_names}"

    updated_annotations = df.copy()
    # Replace the column outright so its dtype comes from the mapped labels
    # rather than from whatever dtype an existing `label` column had.
    updated_annotations['label'] = mapped_labels
    assert len(updated_annotations.label.unique()) == len(updated_reverse_class_dict), "Number of unique labels in updated annotations does not match number of unique class names in updated reverse class dict"
    return updated_annotations

In [143]:
# Rebuild the class name list and both lookup dicts from the current annotations
updated_class_names, updated_class_dict, updated_reverse_class_dict = get_updated_class_names_class_dict_and_reverse_class_dict(updated_annotations)

# Rewrite the `label` column so it matches the rebuilt class name -> index mapping
# NOTE: this rebinds `updated_annotations` to the dataframe returned by the helper
updated_annotations = map_updated_class_dict_to_updated_annotations(updated_annotations, updated_reverse_class_dict)

# Show the number of classes and the first 10 class names
len(updated_class_names), updated_class_names[:10]

(233,
 ['almond_butter',
  'almonds',
  'apple_green',
  'apple_red',
  'apricot',
  'asparagus',
  'avocado',
  'bacon',
  'bacon_and_egg_burger',
  'bagel'])

In [144]:
# Display the updated annotations dataframe (the notebook renders a truncated preview)
updated_annotations

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,144,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,144,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,144,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,144,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,144,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
108734,all_nutrify_vegfru/onion_brown/dcfb12aa-529a-4...,dcfb12aa-529a-4981-bd04-bb89b1fec91e.jpeg,onion_brown,137,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
108735,all_nutrify_vegfru/onion_brown/56092377-8538-4...,56092377-8538-46db-a270-930b11cd0798.jpeg,onion_brown,137,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
108736,all_nutrify_vegfru/onion_brown/2abe0c8a-d934-4...,2abe0c8a-d934-4a13-8105-13d5b69ab95f.jpeg,onion_brown,137,test,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset
108737,all_nutrify_vegfru/onion_brown/dc0cccea-9c94-4...,dc0cccea-9c94-41c9-9664-02be13017ae0.jpeg,onion_brown,137,train,,,,2023-02-27_14-38-31,vegfru_dataset_with_manual_filtering,vegfru_dataset


In [145]:
# Count how many rows carry each integer label (most frequent first)
updated_annotations.label.value_counts()

91     1828
146    1815
74     1812
84     1697
121    1633
       ... 
39       50
38       50
41       45
40       43
174       1
Name: label, Length: 233, dtype: int64

In [124]:
# Find the class names with under 1000 images (sorted ascending by image count, returned as a dict)
updated_annotations.class_name.value_counts()[updated_annotations.class_name.value_counts() < 1000].sort_values(ascending=True).to_dict()

{'potato_purple_sweet': 1,
 'capsicum_red': 43,
 'capsicum_yellow': 45,
 'capsicum_green': 50,
 'capsicum_orange': 50,
 'fruit_smoothie': 60,
 'prosciutto': 62,
 'curry_chicken': 64,
 'bread_naan': 67,
 'cabbage_green': 80,
 'potato_red': 93,
 'guacamole': 100,
 'beans': 101,
 'cream': 104,
 'lettuce_oakleaf': 112,
 'chocolate': 123,
 'butter': 124,
 'kimchi': 125,
 'bagel': 131,
 'pawpaw': 133,
 'turnip': 136,
 'bacon': 140,
 'toast': 142,
 'popcorn': 142,
 'corn_chips': 146,
 'olive_oil': 147,
 'starfruit': 148,
 'tofu': 152,
 'cheese': 153,
 'sushi': 154,
 'honey': 156,
 'yoghurt': 164,
 'salt': 165,
 'mushroom_oyster': 166,
 'fries': 167,
 'salami': 169,
 'salmon': 171,
 'rice': 171,
 'soy_sauce': 172,
 'baklava': 172,
 'soda': 173,
 'pork_loins': 173,
 'burrito': 174,
 'ham': 175,
 'squash': 175,
 'maple_syrup': 175,
 'sports_drink': 176,
 'pork_chop': 177,
 'rockmelon': 177,
 'chocolate_cake': 178,
 'sandwich': 179,
 'wine_white': 179,
 'croissant': 180,
 'seaweed': 180,
 'salsa'

In [130]:
# Calculate the class weights for each class
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# np.bincount returns one count per integer label id in 0..max(label);
# any label id with no rows gets a count of 0.
class_counts = np.bincount(updated_annotations.label)

# Inverse-frequency weight per label id. A zero count would otherwise divide by
# zero and yield an `inf` weight, so ids with no samples get a weight of 0.
class_weights = {i: (1. / int(count) if count > 0 else 0.) for i, count in enumerate(class_counts)}

# Weights in label-id order, moved to the selected device
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
class_weights_tensor

tensor([5.4054e-03, 1.1682e-03, 1.5221e-03, 7.5930e-04, 1.1806e-03, 1.0000e-03,
        1.0020e-03, 7.1429e-03, 4.9505e-03, 7.6336e-03, 5.8140e-03, 6.4392e-04,
        4.6296e-03, 5.0505e-03, 1.4925e-03, 9.9010e-03, 4.8780e-03, 5.4945e-03,
        5.1546e-03, 4.7619e-03, 5.2632e-03, 1.7953e-03, 5.2356e-03, 5.0000e-03,
        1.2870e-03, 9.8619e-04, 4.0323e-03, 1.4925e-02, 1.7422e-03, 2.9070e-03,
        5.4348e-03, 1.2469e-03, 5.7471e-03, 8.0645e-03, 1.2500e-02, 2.0408e-03,
        5.1546e-03, 5.4945e-03, 2.0000e-02, 2.0000e-02, 2.3256e-02, 2.2222e-02,
        9.1827e-04, 1.2136e-03, 1.4045e-03, 1.2563e-03, 6.5359e-03, 5.3191e-03,
        6.6845e-04, 4.5662e-03, 4.8077e-03, 4.6296e-03, 1.0152e-03, 4.6512e-03,
        4.2194e-03, 8.1301e-03, 5.6180e-03, 1.6694e-03, 5.1813e-03, 3.8462e-03,
        4.4053e-03, 1.3004e-03, 8.2919e-04, 6.8493e-03, 9.6154e-03, 5.5556e-03,
        4.6083e-03, 8.9286e-04, 5.3763e-03, 1.5625e-02, 4.9505e-03, 5.4054e-03,
        5.0761e-03, 1.2500e-03, 5.5188e-

In [105]:
# How many unique integer labels are there in the updated annotations?
len(updated_annotations.label.unique())

233

In [15]:
# from utils.misc import check_for_differences_between_df

# Keep a copy of the loaded `annotations` to compare against `updated_annotations`
original_annotations = annotations.copy()

def check_for_differences_between_df(df1, df2, columns_to_exclude: list=None):
    """Checks for differences between two dataframes, returns the number of differences.

    Only the columns present in both dataframes are compared.

    Args:
        df1: First dataframe.
        df2: Second dataframe.
        columns_to_exclude: Optional list of column names to leave out of the comparison.

    Returns:
        An int, always:
        * the number of differing rows reported by `pandas.DataFrame.compare`, or
        * if that comparison fails and the lengths differ, the absolute difference
          in row counts, or
        * if that comparison fails and the lengths match (e.g. the index labels
          differ), the number of differing rows when compared by row position.
    """
    # Find the intersection of the columns
    intersecting_columns = list(df1.columns.intersection(df2.columns))

    print(f"Number of intersecting columns: {len(intersecting_columns)}")
    print(f"Checking for differences accross the following columns: {intersecting_columns}")

    try:
        # Remove columns_to_exclude from intersecting_columns
        if columns_to_exclude is not None:
            intersecting_columns = [column for column in intersecting_columns if column not in columns_to_exclude]

        # Compare the values in the intersecting columns
        # See here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.compare.html
        differences = df1[intersecting_columns].compare(df2[intersecting_columns])
        return len(differences)
    except Exception as e:
        # DataFrame.compare requires identically-labeled frames; fall back instead of failing
        print(f"Error: {e}")
        print(f"Couldn't compare via pandas.DataFrame.compare, trying via lengths...")
        # Compare the lengths of the dataframes
        if len(df1) != len(df2):
            differences = abs(len(df1) - len(df2))
            print(f"Difference in dataframe lengths: {differences} (aboslute value of {len(df1)} - {len(df2)})")
            return differences

        # Same length but mismatched index labels: previously this path fell through
        # and returned None, which breaks callers that do `num_differences > 0`.
        # Drop the index on both sides and compare row-by-row by position instead.
        print("Dataframe lengths match, comparing rows by position instead...")
        positional_differences = df1[intersecting_columns].reset_index(drop=True).compare(
            df2[intersecting_columns].reset_index(drop=True)
        )
        return len(positional_differences)

# Count the differences between the updated and original annotations
# (the upload cell only exports/uploads when `num_differences > 0`)
num_differences = check_for_differences_between_df(updated_annotations, original_annotations)
num_differences

Number of intersecting columns: 11
Checking for differences accross the following columns: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
Error: Can only compare identically-labeled DataFrame objects
Couldn't compare via pandas.DataFrame.compare, trying via lengths...
Difference in dataframe lengths: 326 (aboslute value of 132198 - 132524)


326

In [16]:
# Show the annotation columns listed in the config for export
config.annotations_columns_to_export

['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

In [17]:
# Count how many rows each class name has (most frequent first)
updated_annotations["class_name"].value_counts()

choy_sum               2085
chervil                2040
quince                 2013
lemongrass             1971
waffles                1878
                       ... 
tarragon                  1
turmeric                  1
beef_diced                1
lettuce_oakleaf_red       1
jalapeno                  1
Name: class_name, Length: 278, dtype: int64

In [18]:
# Upload the updated annotations to Google Storage and track the changes
import os
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"
GS_BUCKET_NAME = config.gs_bucket_name

# Export the updated annotations to a CSV
columns_to_export = config.annotations_columns_to_export
print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")

# TODO: Check if the updated_annotations_reset_index and the original_annotations actually differ, if so save them and upload them, else exit
if num_differences > 0:
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Export the updated_annotations_reset_index to a csv
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload the updated CSV to Google Storage
    upload_to_gs(bucket_name=GS_BUCKET_NAME, 
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME, 
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Rename the old CSV on Google Storage
    bucket_to_move_old_annotations_to = "old_annotations"
    name_to_rename_old_annotations = os.path.join(bucket_to_move_old_annotations_to, f"{get_now_time()}_old_annotations.csv")

    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # Rename the "updated_annotations.csv" on Google Storage to "annotations.csv" 
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 326 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 26401373 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-03-13_15-40-25_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...
