# Rename classes in true labels

Some class names in the true labels are ambiguous.

For example, "potatoes" could mean "potato_white", "potato_brown", "potato_red", etc.

Similarly, "onion" could mean "onion_brown", "onion_white", "onion_red", etc.

This notebook will serve as a place to rename labels.

To start, I'll try "onion" -> "onion_brown".

## Download original labels from GCP/Weights & Biases

In [16]:
# Setup cell: make the project packages importable, authenticate with GCP,
# start a Weights & Biases run and download the current labels artifact.
# Statement order matters here: sys.path must be extended before the
# `configs` / `utils` imports below can resolve.

# Append the upper level directory to sys
import sys
sys.path.append("..")

# NOTE(review): numpy and Path are not referenced in the cells visible here - confirm they are still needed
import pandas as pd
import numpy as np

from pathlib import Path

# Get config
from configs.default_config import config

# `args` is just a second name for the same config object
args = config

# Connect to GCP
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
# NOTE(review): the key path is relative, so it depends on the notebook's working directory
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
# NOTE(review): wandb_load_artifact is imported but not called in the cells visible here
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

# Free-text description attached to the W&B run
notes = "Changing class names to be more reflective of their food type."

# NOTE(review): the 'manual_photo_upload' tag may have been carried over from another notebook - confirm it suits a label-renaming run
run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['manual_photo_upload'],
                 notes=notes)

# Download the labels artifact named in the config and unpack the five values the helper returns
# (presumably: the annotations DataFrame, the class name list, name<->id lookups and the local CSV path - verify against utils.wandb_utils)
annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)


[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[INFO] Labels directory: ./artifacts/food_vision_labels:v17
[INFO] Labels path: artifacts/food_vision_labels:v17/annotations.csv
[INFO] Working with: 199 classes


In [17]:
# See the annotations
annotations.head()

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,121,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,121,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,121,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,121,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,121,test,,,,,,internet_download


In [18]:
# See the class names
class_names

['almond_butter',
 'almonds',
 'apple',
 'apricot',
 'asparagus',
 'avocado',
 'bacon',
 'bacon_and_egg_burger',
 'bagel',
 'baklava',
 'banana',
 'banana_bread',
 'barbecue_sauce',
 'beans',
 'beef',
 'beef_curry',
 'beef_mince',
 'beef_stir_fry',
 'beer',
 'beetroot',
 'biltong',
 'blackberries',
 'blueberries',
 'bok_choy',
 'bread',
 'broccoli',
 'broccolini',
 'brownie',
 'brussel_sprouts',
 'burrito',
 'butter',
 'cabbage',
 'calamari',
 'candy',
 'capsicum',
 'carrot',
 'cashews',
 'cauliflower',
 'celery',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_breast',
 'chicken_thighs',
 'chicken_wings',
 'chilli',
 'chimichurri',
 'chocolate',
 'chocolate_cake',
 'coconut',
 'coffee',
 'coleslaw',
 'cookies',
 'coriander',
 'corn',
 'corn_chips',
 'cream',
 'croissant',
 'crumbed_chicken',
 'cucumber',
 'cupcake',
 'daikon_radish',
 'dates',
 'donuts',
 'dragonfruit',
 'eggplant',
 'eggs',
 'enoki_mushroom',
 'fennel',
 'figs',
 'french_toast',
 'fried_rice',
 'fries',
 'fruit_ju

In [19]:
# Hand-written list of target class names. It uses more specific variants
# (e.g. 'onion_brown', 'potato_white', 'apple_red') in place of generic names
# and is compared against the existing `class_names` in the next cell.
new_class_names = ['apple_green',
 'apple_red',
 'avocado',
 'bacon',
 'banana',
 'banana_bread',
 'beef_stir_fry',
 'biltong',
 'blueberries',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'butter',
 'capsicum',
 'carrot',
 'cheese',
 'cheeseburger',
 'cherries',
 'chicken_thighs',
 'coffee',
 'coleslaw',
 'corn',
 'cucumber',
 'curry_chicken',
 'dates',
 'eggs',
 'fries',
 'garlic',
 'grapes',
 'green_beans',
 'honey',
 'ice_coffee',
 'kiwi_fruit',
 'lemon',
 'lime',
 'lychee',
 'mango',
 'milk',
 'mushrooms',
 'nectarines',
 'omelette',
 'onion_brown',
 'onion_red',
 'onion_white',
 'orange_juice',
 'passionfruit',
 'peach',
 'plum',
 'pomegranate',
 'porridge',
 'potato_bake',
 'potato_brown',
 'potato_white',
 'pumpkin',
 'rice',
 'roast_pork',
 'roast_potatoes',
 'steak',
 'tea',
 'tomato',
 'watermelon',
 'yoghurt',
 'zucchini']

In [20]:
# Names listed in new_class_names that the current labels do not contain yet
missing_class_names = set(new_class_names).difference(class_names)
missing_class_names

{'apple_green',
 'apple_red',
 'bread_naan',
 'curry_chicken',
 'lychee',
 'onion_brown',
 'onion_red',
 'onion_white',
 'potato_brown',
 'potato_white'}

In [21]:
# Keep an untouched snapshot of the downloaded annotations for later comparison
original_annotations = annotations.copy()

# Count the rows currently labelled with the generic "onion" class_name
int((original_annotations['class_name'] == 'onion').sum())

92

In [22]:
# Start from a fresh copy so original_annotations stays available for comparison
updated_annotations = original_annotations.copy()

# Swap the generic "onion" class_name for "onion_brown"; every other row keeps its value.
# NOTE(review): only class_name changes - the numeric `label` and the `filename` path are left as-is.
# NOTE(review): the preview a few cells below lists rows stored under train/white_onion/ that
# now read "onion_brown" - confirm that relabelling is intended.
updated_annotations['class_name'] = updated_annotations['class_name'].mask(
    updated_annotations['class_name'] == 'onion', 'onion_brown'
)

In [23]:
# How many class_names are now "onion_brown"?
len(updated_annotations[updated_annotations['class_name'] == 'onion_brown'])

92

In [24]:
# Show all the rows with the class_name "onion_brown"
updated_annotations[updated_annotations['class_name'] == 'onion_brown']

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
765,test/onion/93a96f07-105e-4b52-90f9-bf2883c5ee6...,93a96f07-105e-4b52-90f9-bf2883c5ee6c.jpg,onion_brown,117,test,,,,,,internet_download
766,test/onion/28fb5d6b-c857-4902-bcaf-7238bbf7b5c...,28fb5d6b-c857-4902-bcaf-7238bbf7b5c6.jpg,onion_brown,117,test,,,,,,internet_download
767,test/onion/b7419411-1a24-4bf4-b425-5dd6f8e1072...,b7419411-1a24-4bf4-b425-5dd6f8e1072b.jpg,onion_brown,117,test,,,,,,internet_download
768,test/onion/c442b809-26ad-4327-8e5b-dd4930f629f...,c442b809-26ad-4327-8e5b-dd4930f629f6.jpg,onion_brown,117,test,,,,,,internet_download
769,test/onion/6bb5e323-dc73-482e-8242-cbd1eb8c899...,6bb5e323-dc73-482e-8242-cbd1eb8c899b.jpg,onion_brown,117,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
16065,train/white_onion/bc1c7ae8-af83-4134-b3ab-fce7...,bc1c7ae8-af83-4134-b3ab-fce70b45f05d.jpg,onion_brown,117,train,,,,2023-01-23 10:50:47.306734,auto_labelled_clip_and_blip_match,internet_download
16076,train/white_onion/e548cc63-dfe3-4ef6-9b5e-84ef...,e548cc63-dfe3-4ef6-9b5e-84efef74af2c.jpg,onion_brown,117,train,,,,2023-01-30 14:19:13.441271,auto_labelled_clip_and_blip_match,internet_download
16118,train/white_onion/49957c99-1b8d-4ebf-911c-2335...,49957c99-1b8d-4ebf-911c-2335a51e2c70.jpg,onion_brown,117,train,clear,whole_food,one_food,2023-01-23 15:10:12.302054,manual_label_studio,internet_download
16127,train/white_onion/b80196eb-595f-4a7b-9cda-7f3c...,b80196eb-595f-4a7b-9cda-7f3cd31d7c27.jpg,onion_brown,117,train,,,,2023-01-23 10:50:47.306734,auto_labelled_clip_and_blip_match,internet_download


In [25]:
# Next steps:
# - See how many differences there are between updated_annotations and original_annotations
# - Upload the new annotations to GCP
# - Merge new images if their class_name is in the existing class_names (of the new labels)
# - Upload images to GCP
# - Track images and labels in W&B
# - Train a model and evaluate on new data
# - Add a way in data_loader.py to load data from specific sources, e.g. manual_download

In [28]:
# TODO: move this into utils folder 
# from utils.misc import check_for_differences_between_df

def check_for_differences_between_df(df1, df2, columns_to_exclude: list=None):
    """Count the rows in which two dataframes disagree on their shared columns.

    Args:
        df1: First dataframe.
        df2: Second dataframe, compared against df1 via `DataFrame.compare`.
        columns_to_exclude: Optional column names to leave out of the comparison.

    Returns:
        Number of rows reported by `df1.compare(df2)` over the shared columns,
        i.e. a row that differs in several columns is counted once.
    """
    # Nothing is filtered out when no exclusions were given
    excluded = columns_to_exclude if columns_to_exclude is not None else []

    # Only columns present in both frames (and not excluded) are compared
    shared_columns = [
        name
        for name in list(df1.columns.intersection(df2.columns))
        if name not in excluded
    ]

    # See here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.compare.html
    differing_rows = df1[shared_columns].compare(df2[shared_columns])
    return differing_rows.shape[0]

In [30]:
# Count the rows that differ between the renamed and the original annotations
num_differences = check_for_differences_between_df(df1=updated_annotations,
                                                   df2=original_annotations)
num_differences

92

In [32]:
config.annotations_columns_to_export

['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

In [33]:
# Upload the updated annotations to Google Storage and track the changes
import os
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"
GS_BUCKET_NAME = config.gs_bucket_name

# Columns that are written to the exported CSV
columns_to_export = config.annotations_columns_to_export

# Only export/upload when the comparison above found at least one changed row
if num_differences > 0:
    print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Export the updated annotations to a local CSV (without the dataframe index)
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload the updated CSV to Google Storage under a temporary name
    upload_to_gs(bucket_name=GS_BUCKET_NAME,
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Archive name for the previous annotations: "old_annotations/<timestamp>_old_annotations.csv".
    # Despite the variable name this is an object-name prefix inside GS_BUCKET_NAME, not a separate bucket.
    # Google Storage object names use "/" as the separator, so build the name explicitly
    # instead of with os.path.join (which would produce "\" on Windows).
    bucket_to_move_old_annotations_to = "old_annotations"
    name_to_rename_old_annotations = f"{bucket_to_move_old_annotations_to}/{get_now_time()}_old_annotations.csv"

    # Move the current "annotations.csv" out of the way first...
    # NOTE(review): between this rename and the next one there appears to be no
    # "annotations.csv" blob in the bucket - confirm readers tolerate that window.
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # ...then rename the "updated_annotations.csv" on Google Storage to "annotations.csv"
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 92 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 3517442 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-02-08_16-50-57_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...
