# Rename classes in true labels

Some classes in the true labels are confusing.

For example, "potatoes" could mean "potato_white", "potato_brown", "potato_red", etc.

Similarly, "onion" could mean "onion_brown", "onion_white", "onion_red", etc.

This notebook will serve as a place to rename labels.

To start, I'll try "onion" -> "onion_brown".

## Download original labels from GCP/Weights & Biases

In [70]:
# Add the parent directory to sys.path so the project packages imported below
# (configs, utils) can be found when the notebook runs from its own folder
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

# Get config
from configs.default_config import config

# Alias for the config object; the W&B settings in this cell are read through it
args = config

# Connect to GCP
# NOTE(review): the key path is hard-coded relative to this notebook's directory —
# confirm the key file itself is kept out of version control
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

# Free-text description attached to the W&B run
notes = "Changing class names to be more reflective of their food type."

# Start the W&B run; project name and job type come from the config
run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['manual_photo_upload'],
                 notes=notes)

# Download the labels artifact named in the config and unpack the five returned values
# (class_dict / reverse_class_dict are presumably name<->id lookups — TODO confirm in utils.wandb_utils)
annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)


[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[INFO] Labels directory: ./artifacts/food_vision_labels:v22
[INFO] Labels path: artifacts/food_vision_labels:v22/annotations.csv
[INFO] Working with: 204 classes


In [71]:
# Preview the first rows of the downloaded annotations to check the columns and values
annotations.head()

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,121,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,121,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,121,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,121,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,121,test,,,,,,internet_download


In [72]:
# Inspect the class names that were returned alongside the annotations
class_names

['almond_butter',
 'almonds',
 'apple_green',
 'apple_red',
 'apricot',
 'asparagus',
 'avocado',
 'bacon',
 'bacon_and_egg_burger',
 'bagel',
 'baklava',
 'banana',
 'banana_bread',
 'barbecue_sauce',
 'beans',
 'beef',
 'beef_curry',
 'beef_mince',
 'beef_stir_fry',
 'beer',
 'beetroot',
 'biltong',
 'blackberries',
 'blueberries',
 'bok_choy',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'brownie',
 'brussel_sprouts',
 'burrito',
 'butter',
 'cabbage',
 'calamari',
 'candy',
 'capsicum',
 'carrot',
 'cashews',
 'cauliflower',
 'celery',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_breast',
 'chicken_thighs',
 'chicken_wings',
 'chilli',
 'chimichurri',
 'chocolate',
 'chocolate_cake',
 'coconut',
 'coffee',
 'coleslaw',
 'cookies',
 'coriander',
 'corn',
 'corn_chips',
 'cream',
 'croissant',
 'crumbed_chicken',
 'cucumber',
 'cupcake',
 'curry_chicken',
 'daikon_radish',
 'dates',
 'donuts',
 'dragonfruit',
 'eggplant',
 'eggs',
 'enoki_mushroom',
 'fennel',
 'figs',

In [73]:
# Class names to check against the downloaded labels; includes the more
# specific variants such as 'onion_brown' and 'potato_white'
new_class_names = ['apple_green',
 'apple_red',
 'avocado',
 'bacon',
 'banana',
 'banana_bread',
 'beef_stir_fry',
 'biltong',
 'blueberries',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'butter',
 'capsicum',
 'carrot',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_thighs',
 'coffee',
 'coleslaw',
 'corn',
 'cucumber',
 'curry_chicken',
 'dates',
 'eggs',
 'fries',
 'garlic',
 'grapes',
 'green_beans',
 'honey',
 'ice_coffee',
 'kiwi_fruit',
 'lemon',
 'lime',
 'lychee',
 'mango',
 'milk',
 'mushrooms',
 'nectarines',
 'omelette',
 'onion_brown',
 'onion_red',
 'onion_white',
 'orange_juice',
 'passionfruit',
 'peach',
 'plum',
 'pomegranate',
 'porridge',
 'potato_bake',
 'potato_brown',
 'potato_white',
 'pumpkin',
 'rice',
 'roast_pork',
 'roast_potatoes',
 'steak',
 'tea',
 'tomato',
 'watermelon',
 'yoghurt',
 'zucchini']

In [74]:
# Find the classes in new_class_names that are missing from the downloaded class_names.
# The set difference is sorted so the result is deterministic: iterating a set of
# strings can produce a different order on every kernel restart, which made the
# downstream printouts hard to compare between runs.
missing_class_names = sorted(set(new_class_names) - set(class_names))
missing_class_names

[]

In [75]:
# Check how many classes the downloaded labels contain
len(class_names)

204

In [67]:
"onion_red" in class_names

True

In [68]:
# Keep an untouched snapshot of the downloaded annotations to compare edits against
original_annotations = annotations.copy()

# Report, for every missing class name, how many annotation rows currently use it
for class_name in missing_class_names:
    print(f"Number of rows with class name '{class_name}': {original_annotations['class_name'].eq(class_name).sum()}")

Number of rows with class name 'lychee': 0
Number of rows with class name 'bread_naan': 0
Number of rows with class name 'apple_green': 0
Number of rows with class name 'curry_chicken': 0
Number of rows with class name 'potato_white': 0


In [50]:
# Load the sentence-embedding model used to embed class names for similarity matching
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import dot_score
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_list_of_classes(class_names: list, model: SentenceTransformer):
    """
    Builds a lookup from each class name to its embedding.

    Args:
        class_names: Class names to embed.
        model: Object whose `encode` method is called once with the whole list.

    Returns:
        A dict mapping class name -> the embedding `model.encode` produced for it.
    """
    # Encode all names in one call, then pair each name with its embedding by position
    embeddings = model.encode(class_names)
    return dict(zip(class_names, embeddings))

# Embed the existing class names once so similarity lookups can reuse the embeddings
class_name_embeddings = embed_list_of_classes(class_names, model = model)

# Create a function to similarity match the class names (e.g. code which string is most like another string)
def find_most_similar_class_name(target_class_name, class_name_embedding_dict, top_k=3):
    """
    Finds the class names whose embeddings score highest against the target class name.

    The target is embedded with the notebook-level `model` and scored against every
    stored embedding with the notebook-level `dot_score`, so both names must already
    be defined when this function is called.

    Args:
        target_class_name: Class name to find similar names for.
        class_name_embedding_dict: Mapping of class name -> embedding, e.g. the
            output of `embed_list_of_classes`.
        top_k: Maximum number of class names to return. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        A list of at most `top_k` class names, highest score first.

    Raises:
        ValueError: If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Get the embedding of the target_class_name (encode is given a one-item list, so take item 0)
    target_class_name_embedding = model.encode([target_class_name])[0]

    # Rank every known class name by its score against the target, highest first
    ranked_class_names = sorted(
        class_name_embedding_dict,
        key=lambda key: dot_score(class_name_embedding_dict[key], target_class_name_embedding),
        reverse=True,
    )

    return ranked_class_names[:top_k]
    

In [51]:
# Create a function to string match the class names (e.g. code which string is most like another string)
from difflib import SequenceMatcher

def match_string_via_sequence_matcher(target_string, string_list, top_k=3):
    """
    Finds the strings in string_list that are most similar to target_string.

    Similarity is `difflib.SequenceMatcher(...).ratio()` with the candidate as the
    first sequence and the target as the second. Ties keep their original order
    from string_list.

    Args:
        target_string: String to compare every candidate against.
        string_list: Candidate strings.
        top_k: Maximum number of strings to return. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        A list of at most `top_k` strings, most similar first.

    Raises:
        ValueError: If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # SequenceMatcher caches its analysis of the second sequence, so set the target
    # once as seq2 and only swap seq1 for each candidate
    matcher = SequenceMatcher(None)
    matcher.set_seq2(target_string)

    def similarity_to_target(candidate):
        matcher.set_seq1(candidate)
        return matcher.ratio()

    most_similar_strings = sorted(string_list, key=similarity_to_target, reverse=True)[:top_k]

    return most_similar_strings

In [52]:
# Quick check of the string matcher using one of the new class names
match_string_via_sequence_matcher("apple_green", class_names)

['apple', 'maple_syrup', 'onion_green']

In [53]:
# Check how many class names have an embedding
len(class_name_embeddings)

199

In [54]:
# For each missing class, print the closest existing class names from both matchers
for class_name in missing_class_names:
    string_matches = match_string_via_sequence_matcher(class_name, class_names)
    embedding_matches = find_most_similar_class_name(class_name, class_name_embeddings)
    print(f"Similar class names to '{class_name}' | Embedding match: {embedding_matches} | String match: {string_matches}")

Similar class names to 'lychee' | Embedding match: ['rosemary', 'white_wine', 'beetroot'] | String match: ['cheese', 'leek', 'cherries']
Similar class names to 'bread_naan' | Embedding match: ['banana_bread', 'bread', 'garlic_bread'] | String match: ['bread', 'red_wine', 'beans']
Similar class names to 'apple_green' | Embedding match: ['onion_green', 'green_beans', 'fruit_juice'] | String match: ['apple', 'maple_syrup', 'onion_green']
Similar class names to 'apple_red' | Embedding match: ['onion_red', 'fruit_juice', 'red_wine'] | String match: ['apple', 'maple_syrup', 'garlic_bread']
Similar class names to 'curry_chicken' | Embedding match: ['beef_curry', 'chicken_breast', 'karaage_chicken'] | String match: ['crumbed_chicken', 'karaage_chicken', 'chicken_wings']
Similar class names to 'potato_white' | Embedding match: ['sweet_potato', 'potato_bake', 'mashed_potato'] | String match: ['potato_chips', 'potatoes', 'onion_white']
Similar class names to 'potato_brown' | Embedding match: ['sw

In [55]:
# Old class name -> new, more specific class name
class_name_renames = {
    'apple': 'apple_red',
    'potatoes': 'potato_brown',
}

# Work on a copy so original_annotations stays available for comparison
updated_annotations = original_annotations.copy()

# Apply each rename to the rows that currently carry the old class name.
# Only the 'class_name' column is written here.
# NOTE(review): the integer 'label' column is left as-is — confirm it is regenerated from class_name downstream
for old_class_name, new_class_name in class_name_renames.items():
    rows_to_rename = updated_annotations['class_name'] == old_class_name
    updated_annotations.loc[rows_to_rename, 'class_name'] = new_class_name

In [57]:
# Next
# See how many differences there are between updated_annotations and original_annotations
# Upload the new annotations to GCP
# Merge new images if their class_name is in the existing class_names (of the new labels)
# Upload images to GCP
# Track images and labels in W&B
# Train a model and evaluate on new data
# Make a way in data_loader.py to load data from specific sources, e.g. manual_download etc

In [59]:
# Compare the updated annotations against the original snapshot
# (the comparison helper is imported from utils.misc)
from utils.misc import check_for_differences_between_df

# Result is compared against 0 before uploading, so it is presumably a count of differing entries — TODO confirm in utils.misc
num_differences = check_for_differences_between_df(updated_annotations, original_annotations)
num_differences

185

In [60]:
# Show which annotation columns the config lists for export
config.annotations_columns_to_export

['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

In [61]:
# Upload the updated annotations to Google Storage and track the changes
import os
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"
GS_BUCKET_NAME = config.gs_bucket_name

# Export the updated annotations to a CSV
columns_to_export = config.annotations_columns_to_export
print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")

# Only save and upload when the updated annotations actually differ from the originals
# (num_differences comes from the comparison cell earlier in the notebook)
if num_differences > 0:
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Write the selected columns of updated_annotations to a local CSV
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload the updated CSV to Google Storage under a temporary name
    upload_to_gs(bucket_name=GS_BUCKET_NAME,
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Archive the old CSV on Google Storage under a timestamped name.
    # Despite the variable name, this is a name prefix ("folder") inside GS_BUCKET_NAME,
    # not a separate bucket: rename_blob below is called with the same bucket.
    bucket_to_move_old_annotations_to = "old_annotations"
    # Blob names use "/" as the separator on every platform, so join explicitly
    # instead of os.path.join (which would insert "\" on Windows).
    name_to_rename_old_annotations = f"{bucket_to_move_old_annotations_to}/{get_now_time()}_old_annotations.csv"

    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # Rename the "updated_annotations.csv" on Google Storage to "annotations.csv"
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 185 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 3690016 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-02-13_09-54-16_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...


In [77]:
# Serialise class_dict to a JSON string, then write it to class_dict.json
import json

class_dict_json = json.dumps(class_dict)
with open("class_dict.json", "w") as json_file:
    json_file.write(class_dict_json)

In [78]:
# Build a two-column table from class_dict: keys become class_name, values become class_id
class_dict_df = pd.DataFrame(
    {
        'class_name': list(class_dict.keys()),
        'class_id': list(class_dict.values()),
    }
)

# Save the table as CSV without the positional index column
class_dict_df.to_csv("class_dict.csv", index=False)