# Rename classes in true labels

Some class names in the true labels are ambiguous.

For example, "potatoes" could be "potato_white", "potato_brown", "potato_red", etc.

Similarly, "onion" could be "onion_brown", "onion_white", "onion_red", etc.

This notebook will serve as a place to rename labels.

To start, I'll try "onion" -> "onion_brown".

## Download original labels from GCP/Weights & Biases

In [1]:
# Append the upper level directory to sys
# (this has to happen before the `configs` / `utils` imports below, which live one directory up)
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

# Get config
from configs.default_config import config

# Alias: the W&B calls below read their settings from `args`
args = config

# Connect to GCP
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

notes = "Changing class names to be more reflective of their food type."

run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['manual_photo_upload'],
                 notes=notes)

# Download the labels artifact named in the config and unpack the five values the helper returns;
# later cells use `annotations` as a DataFrame, `class_names` as a list of strings and
# `reverse_class_dict` as the inverse of `class_dict`
annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)


[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrdbourke[0m. Use [1m`wandb login --relogin`[0m to force relogin


[INFO] Labels directory: ./artifacts/food_vision_labels:v43
[INFO] Labels path: artifacts/food_vision_labels:v43/annotations.csv
[INFO] Working with: 269 classes


In [2]:
class_names

['almond_butter',
 'almonds',
 'apple_green',
 'apple_red',
 'apricot',
 'artichoke',
 'asparagus',
 'avocado',
 'bacon',
 'bacon_and_egg_burger',
 'bagel',
 'baklava',
 'banana',
 'banana_bread',
 'barbecue_sauce',
 'basil',
 'bean_sprouts',
 'beans',
 'beef_curry',
 'beef_diced',
 'beef_kebab',
 'beef_mince',
 'beef_patty',
 'beef_roast',
 'beef_stir_fry',
 'beer',
 'beetroot',
 'biltong',
 'blackberries',
 'blueberries',
 'bok_choy',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'brownie',
 'brussel_sprouts',
 'burrito',
 'butter',
 'cabbage_green',
 'cabbage_red',
 'calamari',
 'candy',
 'capsicum_green',
 'capsicum_orange',
 'capsicum_red',
 'capsicum_yellow',
 'carrot',
 'carrot_purple',
 'cashews',
 'cauliflower',
 'celery',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_breast',
 'chicken_thighs',
 'chicken_wings',
 'chicory',
 'chilli',
 'chimichurri',
 'chives',
 'chocolate',
 'chocolate_cake',
 'choko',
 'coconut',
 'coffee',
 'coleslaw',
 'cookies',
 'coriander'

In [3]:
# Find the class_names with "lettuce" in them
[class_name for class_name in class_names if "lettuce" in class_name]

['lettuce_cos',
 'lettuce_iceberg',
 'lettuce_oakleaf_green',
 'lettuce_oakleaf_red']

In [4]:
len(class_dict)

269

In [5]:
len(reverse_class_dict)

269

In [6]:
# See the annotations
annotations.head()

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,165,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,165,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,165,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,165,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,165,test,,,,,,internet_download


In [7]:
# Check to see if reverse_class_dict is the same as the reverse of class_dict
reverse_class_dict == {v: k for k, v in class_dict.items()}

True

In [8]:
# Check that class_names equals the sorted keys of reverse_class_dict (its keys are the class names)
class_names == sorted(list(reverse_class_dict.keys()))

True

In [9]:
# Report every index at which class_names disagrees with the sorted reverse_class_dict keys
sorted_class_dict_keys = sorted(reverse_class_dict)
for i in range(min(len(class_names), len(sorted_class_dict_keys))):
    class_name, class_dict_key = class_names[i], sorted_class_dict_keys[i]
    if class_name == class_dict_key:
        continue  # the two entries agree at this index, nothing to report
    print(f"Class name {class_name} at index {i} is different from class_dict_key {class_dict_key}")

In [10]:
# See the class names
class_names

['almond_butter',
 'almonds',
 'apple_green',
 'apple_red',
 'apricot',
 'artichoke',
 'asparagus',
 'avocado',
 'bacon',
 'bacon_and_egg_burger',
 'bagel',
 'baklava',
 'banana',
 'banana_bread',
 'barbecue_sauce',
 'basil',
 'bean_sprouts',
 'beans',
 'beef_curry',
 'beef_diced',
 'beef_kebab',
 'beef_mince',
 'beef_patty',
 'beef_roast',
 'beef_stir_fry',
 'beer',
 'beetroot',
 'biltong',
 'blackberries',
 'blueberries',
 'bok_choy',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'brownie',
 'brussel_sprouts',
 'burrito',
 'butter',
 'cabbage_green',
 'cabbage_red',
 'calamari',
 'candy',
 'capsicum_green',
 'capsicum_orange',
 'capsicum_red',
 'capsicum_yellow',
 'carrot',
 'carrot_purple',
 'cashews',
 'cauliflower',
 'celery',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_breast',
 'chicken_thighs',
 'chicken_wings',
 'chicory',
 'chilli',
 'chimichurri',
 'chives',
 'chocolate',
 'chocolate_cake',
 'choko',
 'coconut',
 'coffee',
 'coleslaw',
 'cookies',
 'coriander'

In [56]:
len(class_names)

245

In [57]:
"onion_red" in class_names

True

In [11]:
# Make a copy of the original annotations
original_annotations = annotations.copy()

In [12]:
# Set up a sentence-embedding model to similarity match the class names (i.e. find which class names are most like a given string)
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import dot_score
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_list_of_classes(class_names: list, model: SentenceTransformer):
    """
    Encode every class name and pair it with its embedding.

    Args:
        class_names: Class names to embed.
        model: Model whose `encode` method is called once with the full list.

    Returns:
        Dictionary mapping each class name to the matching item of
        `model.encode(class_names)`.
    """
    # Encode the whole list in a single call, then pair names with results by position
    embeddings = model.encode(class_names)
    return dict(zip(class_names, embeddings))

class_name_embeddings = embed_list_of_classes(class_names, model = model)

# Create a function to similarity match the class names via their embeddings (i.e. find which class names are most like a given string)
def find_most_similar_class_name(target_class_name, class_name_embedding_dict, embedding_model=None, top_k=3):
    """
    Finds the class names whose embeddings are most similar to the target class name.

    Args:
        target_class_name: String to find similar class names for.
        class_name_embedding_dict: Dictionary mapping class name -> embedding
            (e.g. the output of embed_list_of_classes).
        embedding_model: Optional model with an `encode` method used to embed
            target_class_name. When None, the notebook-level `model` is used,
            so existing two-argument calls behave as before.
        top_k: Maximum number of class names to return (default 3).

    Returns:
        List of up to top_k class names, ranked by dot product with the
        target embedding, most similar first.

    Raises:
        ValueError: If top_k is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Fall back to the notebook-level model only when no model is passed explicitly
    encoder = model if embedding_model is None else embedding_model

    # Get the embedding of the target_class_name (encode takes a list, so unwrap the single result)
    target_class_name_embedding = encoder.encode([target_class_name])[0]

    def similarity_to_target(class_name):
        # dot_score yields a single-element tensor for two 1-D embeddings; use a plain float as the sort key
        return float(dot_score(class_name_embedding_dict[class_name], target_class_name_embedding))

    # Highest score first; the sort is stable, so ties keep the dictionary's order
    most_similar_class_names = sorted(class_name_embedding_dict, key=similarity_to_target, reverse=True)[:top_k]

    return most_similar_class_names
    

In [13]:
# Create a function to string match the class names (i.e. find which strings are most like a given string)
from difflib import SequenceMatcher

def match_string_via_sequence_matcher(target_string, string_list, top_k=3):
    """
    Finds the strings in string_list that are most similar to target_string.

    Similarity is difflib's SequenceMatcher ratio (1.0 for identical strings,
    0.0 when nothing matches).

    Args:
        target_string: String to compare every candidate against.
        string_list: Iterable of candidate strings.
        top_k: Maximum number of matches to return (default 3).

    Returns:
        List of up to top_k strings from string_list, most similar first.
        Candidates with equal ratios keep their original relative order.

    Raises:
        ValueError: If top_k is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    def similarity_to_target(string):
        # Argument order matters for ratio(), so keep (candidate, target) as in the original implementation
        return SequenceMatcher(None, string, target_string).ratio()

    # Highest ratio first, then keep only the best top_k candidates
    most_similar_strings = sorted(string_list, key=similarity_to_target, reverse=True)[:top_k]

    return most_similar_strings

In [14]:
match_string_via_sequence_matcher("kiwi_fruit", class_names)

['kiwi_fruit', 'kiwifruit', 'passionfruit']

In [15]:
len(class_name_embeddings)

269

In [64]:
# For each class name of interest, print its closest matches among the existing class names,
# once via embedding similarity and once via string matching (handy for spotting near-duplicate labels)
for class_name in ["lettuce_iceberg"]:
    top_3_similar_class_embeddings = find_most_similar_class_name(class_name, class_name_embeddings)
    top_3_similar_class_string_matching = match_string_via_sequence_matcher(class_name, class_names)
    print(f"Similar class names to '{class_name}' | Embedding match: {top_3_similar_class_embeddings} | String match: {top_3_similar_class_string_matching}")

Similar class names to 'lettuce_iceberg' | Embedding match: ['lettuce_iceberg', 'lettuce_iceburg', 'lettuce_cos'] | String match: ['lettuce_iceberg', 'lettuce_iceburg', 'lettuce_cos']


## Update annotations

In [16]:
target_class_name_to_relabel = "kiwifruit"
target_class_name_new_label = "kiwi_fruit"

# Report how many annotation rows currently carry each of the two labels
for label_name in (target_class_name_to_relabel, target_class_name_new_label):
    num_rows_with_label = int((annotations["class_name"] == label_name).sum())
    print(f"Num rows with label '{label_name}': {num_rows_with_label}")

Num rows with label 'kiwifruit': 4
Num rows with label 'kiwi_fruit': 1295


In [17]:
# Create updated_annotations
# Start from the untouched copy so re-running this cell always applies the rename to the original labels
updated_annotations = original_annotations.copy()

# Rename every row labelled target_class_name_to_relabel to target_class_name_new_label
updated_annotations.loc[updated_annotations["class_name"] == target_class_name_to_relabel, "class_name"] = target_class_name_new_label

In [18]:
# Apply the reverse class dict (class_name -> label) to the updated_annotations
# so the renamed rows pick up the integer label of their new class name
updated_annotations['label'] = updated_annotations['class_name'].map(reverse_class_dict)

In [19]:
len(updated_annotations.class_name.unique()), len(updated_annotations.label.unique())

(268, 268)

In [20]:
# Collect the distinct class names that remain after relabelling, in sorted order
updated_class_names_list = sorted(updated_annotations["class_name"].unique())

# Give each class name its position in that sorted list as a contiguous integer label
reverse_class_dict_updated = dict(zip(updated_class_names_list, range(len(updated_class_names_list))))

# Overwrite the label column using the new class_name -> label mapping
updated_annotations["label"] = updated_annotations["class_name"].map(reverse_class_dict_updated)

# Show the unique labels of the updated_annotations in order
np.sort(updated_annotations["label"].unique())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [30]:
# Next
# See how many differences there are between updated_annotations and original_annotations
# Upload the new annotations to GCP
# Merge new images if their class_name is in the existing class_names (of the new labels)
# Upload images to GCP
# Track images and labels in W&B
# Train a model and evaluate on new data
# Make a way in data_loader.py to load data from specific sources, e.g. manual_download etc

In [22]:
# updated_annotations.class_name.unique()

In [23]:
# NOTE(review): the helper is already imported from utils.misc, so the earlier
# "move this into utils folder" TODO looks done — confirm and drop it
from utils.misc import check_for_differences_between_df

# Count the differences between the relabelled and the original annotations
# NOTE(review): presumably this also counts rows whose integer `label` shifted when the
# labels were re-enumerated, not only the renamed rows — confirm against the helper
num_differences = check_for_differences_between_df(updated_annotations, original_annotations)
num_differences

59573

In [24]:
updated_annotations.head()

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,164,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,164,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,164,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,164,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,164,test,,,,,,internet_download


In [25]:
config.annotations_columns_to_export

['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

In [26]:
# Show the value counts of the class_name column
updated_annotations['class_name'].value_counts()

grapes_red    1830
papaya        1817
eggplant      1815
garlic        1713
mango         1642
              ... 
jalapeno         1
thyme            1
marjoram         1
tarragon         1
beef_diced       1
Name: class_name, Length: 268, dtype: int64

In [27]:
# Upload the updated annotations to Google Storage and track the changes
import os
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"
GS_BUCKET_NAME = config.gs_bucket_name

# Export the updated annotations to a CSV
columns_to_export = config.annotations_columns_to_export
print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")

# Only export and upload when the updated annotations actually differ from the originals
if num_differences > 0:
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Export the selected columns of updated_annotations to a local csv
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload under a temporary name first, so annotations.csv is only replaced
    # once the new file is already in the bucket
    upload_to_gs(bucket_name=GS_BUCKET_NAME, 
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME, 
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Archive the old CSV on Google Storage under a timestamped name.
    # Despite its name, this variable is an object-name prefix ("folder") inside the same bucket.
    bucket_to_move_old_annotations_to = "old_annotations"
    # Object names in Google Storage always use "/" as the separator, so build the name
    # explicitly instead of with os.path.join (which would insert "\" on Windows)
    name_to_rename_old_annotations = f"{bucket_to_move_old_annotations_to}/{get_now_time()}_old_annotations.csv"

    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # Rename the "updated_annotations.csv" on Google Storage to "annotations.csv" 
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 59573 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 22214063 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-03-13_09-28-34_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...


In [28]:
# Map each class_name to its integer label (one entry per unique class_name)
class_name_to_label_dict = dict(zip(updated_annotations["class_name"], updated_annotations["label"]))

# Invert the mapping so it reads label -> class_name
label_to_class_name = {label: class_name for class_name, label in class_name_to_label_dict.items()}

# Order the entries by ascending integer label
class_dict_updated = {label: label_to_class_name[label] for label in sorted(label_to_class_name)}

class_dict_updated

{0: 'almond_butter',
 1: 'almonds',
 2: 'apple_green',
 3: 'apple_red',
 4: 'apricot',
 5: 'artichoke',
 6: 'asparagus',
 7: 'avocado',
 8: 'bacon',
 9: 'bacon_and_egg_burger',
 10: 'bagel',
 11: 'baklava',
 12: 'banana',
 13: 'banana_bread',
 14: 'barbecue_sauce',
 15: 'basil',
 16: 'bean_sprouts',
 17: 'beans',
 18: 'beef_curry',
 19: 'beef_diced',
 20: 'beef_kebab',
 21: 'beef_mince',
 22: 'beef_patty',
 23: 'beef_roast',
 24: 'beef_stir_fry',
 25: 'beer',
 26: 'beetroot',
 27: 'biltong',
 28: 'blackberries',
 29: 'blueberries',
 30: 'bok_choy',
 31: 'bread',
 32: 'bread_naan',
 33: 'broccoli',
 34: 'broccolini',
 35: 'brownie',
 36: 'brussel_sprouts',
 37: 'burrito',
 38: 'butter',
 39: 'cabbage_green',
 40: 'cabbage_red',
 41: 'calamari',
 42: 'candy',
 43: 'capsicum_green',
 44: 'capsicum_orange',
 45: 'capsicum_red',
 46: 'capsicum_yellow',
 47: 'carrot',
 48: 'carrot_purple',
 49: 'cashews',
 50: 'cauliflower',
 51: 'celery',
 52: 'cheese',
 53: 'cheeseburger',
 54: 'cherries',

In [29]:
# Export class_dict to JSON
import json

# Serialise first, then write the text in one go
# (JSON object keys are always strings, so the integer labels are stored as "0", "1", ...)
class_dict_json = json.dumps(class_dict_updated)
with open("class_dict.json", "w") as json_file:
    json_file.write(class_dict_json)

In [30]:
class_dict_updated

{0: 'almond_butter',
 1: 'almonds',
 2: 'apple_green',
 3: 'apple_red',
 4: 'apricot',
 5: 'artichoke',
 6: 'asparagus',
 7: 'avocado',
 8: 'bacon',
 9: 'bacon_and_egg_burger',
 10: 'bagel',
 11: 'baklava',
 12: 'banana',
 13: 'banana_bread',
 14: 'barbecue_sauce',
 15: 'basil',
 16: 'bean_sprouts',
 17: 'beans',
 18: 'beef_curry',
 19: 'beef_diced',
 20: 'beef_kebab',
 21: 'beef_mince',
 22: 'beef_patty',
 23: 'beef_roast',
 24: 'beef_stir_fry',
 25: 'beer',
 26: 'beetroot',
 27: 'biltong',
 28: 'blackberries',
 29: 'blueberries',
 30: 'bok_choy',
 31: 'bread',
 32: 'bread_naan',
 33: 'broccoli',
 34: 'broccolini',
 35: 'brownie',
 36: 'brussel_sprouts',
 37: 'burrito',
 38: 'butter',
 39: 'cabbage_green',
 40: 'cabbage_red',
 41: 'calamari',
 42: 'candy',
 43: 'capsicum_green',
 44: 'capsicum_orange',
 45: 'capsicum_red',
 46: 'capsicum_yellow',
 47: 'carrot',
 48: 'carrot_purple',
 49: 'cashews',
 50: 'cauliflower',
 51: 'celery',
 52: 'cheese',
 53: 'cheeseburger',
 54: 'cherries',

In [31]:
# Build a two-column table (label, class_name) from the label -> class_name mapping
class_dict_df = pd.DataFrame(
    {
        "label": list(class_dict_updated.keys()),
        "class_name": list(class_dict_updated.values()),
    }
)

# Write the table to csv without the DataFrame index
class_dict_df.to_csv("class_dict.csv", index=False)

In [32]:
len(class_dict_df)

268

In [33]:
class_dict_df

Unnamed: 0,label,class_name
0,0,almond_butter
1,1,almonds
2,2,apple_green
3,3,apple_red
4,4,apricot
...,...,...
263,263,wine_red
264,264,wine_white
265,265,wombok
266,266,yoghurt
