# Notebook to add tags to the various images in the FoodVision dataset

* **Goal:** Add an `image_source` tag to every image currently in the FoodVision dataset so that images can later be selected based on where they came from.

See: https://github.com/mrdbourke/nutrify/issues/58


In [3]:
# Stamp the notebook with the time of this run
import datetime

last_worked_on = datetime.datetime.now()
print(f"Last worked on: {last_worked_on}")

Last worked on: 2023-01-30 14:28:52.046833


In [17]:
# Setup cell: make the project packages importable, load the shared config,
# set GCP credentials and open the Weights & Biases run used by later cells.
import sys
sys.path.append("..") # append the upper path to access the foodvision module

from configs.default_config import config
# NOTE: `args` is just another name for the imported `config` object, so the
# attribute assignments below modify `config` itself.
args = config

# Job type and notes for this run (both are passed to wandb.init below)
args.wandb_job_type = "data labelling"
args.wandb_run_notes = "Add image tags to dataset"

# Set GCP credentials
# NOTE(review): the key file path is relative to the notebook's working directory — confirm it is kept out of version control.
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials("../utils/google-storage-key.json")
test_gcp_connection()

# Start Weights & Biases run
# `run` is reused by later cells to download the labels artifact and to log the updated one.
import wandb
run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type, 
                 tags=["add_image_tags"], 
                 notes=args.wandb_run_notes)

[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668456823875508, max=1.0…

In [18]:
# Fetch the current labels artifact through the active W&B run and load it
from utils.wandb_utils import wandb_download_and_load_labels

(
    annotations,
    class_names,
    class_dict,
    reverse_class_dict,
    labels_path,
) = wandb_download_and_load_labels(
    wandb_run=run,
    wandb_labels_artifact_name=args.wandb_labels_artifact,
)


[34m[1mwandb[0m:   1 of 1 files downloaded.  


[INFO] Labels directory: ./artifacts/food_vision_labels:v15
[INFO] Labels path: artifacts/food_vision_labels:v15/annotations.csv
[INFO] Working with: 199 classes


In [19]:
# Work on an independent copy so the downloaded `annotations` frame stays untouched.
# NOTE: despite its name, `original_annotations` is the frame that gets modified
# further down (the "image_source" column is added to it); `annotations` is the pristine one.
original_annotations = annotations.copy(deep=True)

# Preview the first five rows
original_annotations.head(n=5)

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,121,test,,,,,
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,121,test,,,,,
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,121,test,,,,,
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,121,test,,,,,
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,121,test,,,,,


In [20]:
# How many rows carry each label_source value (missing values are not counted)
original_annotations["label_source"].value_counts()

auto_labelled_clip_and_blip_match    680
manual_label_studio                  194
Name: label_source, dtype: int64

In [21]:
# Tag every existing row with the same "image_source" value.
# `assign` appends the column (or overwrites it on a re-run), so the cell stays idempotent.
original_annotations = original_annotations.assign(image_source="internet_download")

In [22]:
# Sanity check: every row should now have the same image_source value
original_annotations["image_source"].value_counts()

internet_download    23992
Name: image_source, dtype: int64

## Check for differences

In [23]:
# TODO: move to utils?
def check_for_differences_between_df(df1, df2, columns_to_exclude: list=None):
    """Count the rows on which two DataFrames disagree across their shared columns.

    Only columns present in BOTH frames are compared, so a column that exists
    in just one of them (e.g. a newly added one) is never reported as a difference.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame; must be labelled identically to df1 on the rows
            and compared columns, otherwise `DataFrame.compare` raises ValueError.
        columns_to_exclude: Optional collection of shared column names to leave
            out of the comparison. None (the default) compares every shared column.

    Returns:
        The number of rows with at least one differing value, i.e. the length
        of the frame returned by `DataFrame.compare`.
    """
    # An empty tuple excludes nothing, which matches the "None" behaviour
    skipped = () if columns_to_exclude is None else columns_to_exclude

    # Shared columns, in df1's order, minus the ones the caller asked to skip
    shared_columns = [
        name for name in df1.columns.intersection(df2.columns) if name not in skipped
    ]

    # compare() keeps only the rows/columns where the two frames differ
    # See here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.compare.html
    diff_table = df1[shared_columns].compare(df2[shared_columns])
    return len(diff_table)

In [25]:
# Compare the tagged copy against the downloaded labels.
# Only columns present in both frames are compared, so the new "image_source"
# column (which exists only in original_annotations) does not count as a difference.
num_differences = check_for_differences_between_df(df1=original_annotations, df2=annotations)
num_differences

0

## Upload image labels with tags to GCS

In [28]:
# Independent copy of the tagged annotations; this is the frame that gets exported and uploaded
updated_annotations = original_annotations.copy(deep=True)

In [29]:
# Export the updated annotations, swap them in for "annotations.csv" on Google Storage
# (archiving the previous file under a timestamped name) and log the result to W&B.
#
# TODO: Check if updated_annotations and the downloaded annotations actually differ, if so save them and upload them, else exit
# NOTE: check_for_differences_between_df only compares columns both frames share, so on its
# own it cannot detect a newly added column such as "image_source".
from utils.gcp_utils import upload_to_gs, rename_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time
import os

### Upload and save annotations
GS_BUCKET_NAME = args.gs_bucket_name
UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"

# Export updated_annotations to a local CSV (index=False: do not write the DataFrame index as a column)
updated_annotations.to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

# Upload the updated CSV to Google Storage under a temporary name
upload_to_gs(bucket_name=GS_BUCKET_NAME,
             source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
             destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

# Rename the old CSV on Google Storage
# NOTE: despite the variable name, this is a name prefix ("folder") inside the same bucket,
# not a separate bucket: rename_blob below is called with bucket_name=GS_BUCKET_NAME.
bucket_to_move_old_annotations_to = "old_annotations"
# Google Storage object names always use "/" as the separator, so build the name explicitly
# instead of with os.path.join (which would insert "\" when run on Windows).
name_to_rename_old_annotations = f"{bucket_to_move_old_annotations_to}/{get_now_time()}_old_annotations.csv"

rename_blob(bucket_name=GS_BUCKET_NAME,
            blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
            new_name=name_to_rename_old_annotations)

# Rename the "updated_annotations.csv" on Google Storage to "annotations.csv"
# NOTE(review): between the two renames there is no "annotations.csv" in the bucket; if this
# second rename fails, the uploaded file has to be renamed manually.
rename_blob(bucket_name=GS_BUCKET_NAME,
            blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
            new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)


# TODO: move this into another script? 
# TODO: make it easier to track Artifact changes
# TODO: e.g. there's a dedicated Artifact tracker file that gets run after any changes
# Always track changes to W&B (this should automatically detect if there is/isn't changes and track)
annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
wandb_add_artifact_with_reference(wandb_run=run,
                                  artifact_name="food_vision_labels",
                                  artifact_type="labels",
                                  description="Labels for FoodVision project",
                                  reference_path=annotations_path_on_gcs)


[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 3388499 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-01-30_14-49-38_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...


wandb: Network error (ReadTimeout), entering retry loop.
