# Verify annotation images are in Google Storage

We want every annotation to be tied to an image that exists in the Google Storage bucket.

If an annotation's image cannot be found in the bucket, delete that annotation.

In [1]:
import sys
# Make the parent directory importable (presumably where the `configs` and
# `utils` packages imported below live — confirm against the repo layout).
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

# Get config
from configs.default_config import config

# `args` is just an alias for `config`; both names are used further down.
args = config
print(args)

# Name of the Google Storage bucket that holds the images and annotations.
GS_BUCKET = config.gs_bucket_name

# Connect to GCP
# The credentials must be set before the connection test is run.
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

# NOTE(review): the tags/notes below describe an image *download* job, while
# this notebook verifies annotations against the bucket — looks copied from
# another notebook; confirm whether they should be updated.
run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['internet_image_download'],
                 notes="download images using clip-retrieval")

# Download the labels artifact through the run and unpack the five returned
# values; `annotations` is the table that later cells filter and re-upload.
annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)

namespace(annotations_columns_to_export=['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source'], auto_augment=True, batch_size=128, epochs=10, gs_bucket_name='food_vision_bucket_with_object_versioning', gs_image_storage_path='https://storage.cloud.google.com/food_vision_bucket_with_object_versioning/all_images/', input_size=224, label_smoothing=0.1, learning_rate=0.001, model='coatnext_nano_rw_224', num_to_try_and_autocorrect=1000, num_top_n_preds=5, path_to_gcp_credentials='utils/google-storage-key.json', path_to_label_studio_api_key='utils/label_studio_api_key.json', pretrained=True, seed=42, use_mixed_precision=True, wandb_dataset_artifact='food_vision_199_classes_images:latest', wandb_job_type='', wandb_labels_artifact='food_vision_labels:latest', wandb_model_artifact='trained_model:latest', wandb_project='test_wandb_artifacts_by_reference', wandb_run_note

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrdbourke[0m. Use [1m`wandb login --relogin`[0m to force relogin


[INFO] Labels directory: ./artifacts/food_vision_labels:v52
[INFO] Labels path: artifacts/food_vision_labels:v52/annotations.csv
[INFO] Working with: 294 classes


In [None]:
from utils.gcp_utils import upload_to_gs, get_list_of_blobs

# Get list of images already in GCP bucket (every blob under the "all_images" prefix)
gs_image_paths = get_list_of_blobs(bucket_name=GS_BUCKET, prefix="all_images")
print(f"[INFO] There are {len(gs_image_paths)} images in the GCP bucket")

# Get the names of the images already in GCP bucket.
# Take the LAST path segment rather than index 1 of the split: this gives the
# same result for "all_images/<file>", but does not raise IndexError on a blob
# name without a "/" and does not return a folder name for nested blobs.
gs_image_names = [str(blob.name).rsplit("/", 1)[-1] for blob in gs_image_paths]
gs_image_names[:10]

In [19]:
# Get the names of the images already in GCP bucket.
# Take the LAST path segment rather than index 1 of the split: this gives the
# same result for "all_images/<file>", but does not raise IndexError on a blob
# name without a "/" and does not return a folder name for nested blobs.
gs_image_names = [str(blob.name).rsplit("/", 1)[-1] for blob in gs_image_paths]
gs_image_names[:10]

['000000000.jpg',
 '000000003.jpg',
 '000000005.jpg',
 '000000006.jpg',
 '000000008.jpg',
 '000000010.jpg',
 '000000015.jpg',
 '000000019.jpg',
 '000000025.jpg',
 '000000070.jpg']

In [20]:
# Preview the loaded annotations table (the rich display truncates the middle rows)
annotations

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,180,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,180,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,180,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,180,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,180,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
148381,clip_retrieval_image_downloads/2023-03-15_10-3...,9fc7103b-d575-44d0-98d2-8124b84099b0.jpg,radicchio,227,train,,,,2023-03-15_10-47-08,clip_retrieval_laion_5b_knn,clip_retrieval_laion_5b_knn
148382,clip_retrieval_image_downloads/2023-03-15_10-3...,1c040f3f-5709-473a-81a4-eb346adcd407.jpg,radicchio,227,train,,,,2023-03-15_10-47-08,clip_retrieval_laion_5b_knn,clip_retrieval_laion_5b_knn
148383,clip_retrieval_image_downloads/2023-03-15_10-3...,6d1ce2f2-d52d-4dac-b840-64fbbe141e28.jpg,radicchio,227,train,,,,2023-03-15_10-47-08,clip_retrieval_laion_5b_knn,clip_retrieval_laion_5b_knn
148384,clip_retrieval_image_downloads/2023-03-15_10-3...,ce65912f-8638-43f8-afde-c9f97bfd9c50.jpg,radicchio,227,train,,,,2023-03-15_10-47-08,clip_retrieval_laion_5b_knn,clip_retrieval_laion_5b_knn


In [21]:
# Start from a copy so the originally loaded annotations stay available for comparison
updated_annotations = annotations.copy()
# Row count of the copy, shown as the cell output
len(updated_annotations)

148386

In [22]:
def _count_differences_by_label(df1, df2, columns):
    """Counts differing rows between two equally long dataframes whose indexes do not match.

    When both indexes are unique, rows are matched by index label: every label
    present in only one dataframe counts as one difference, and every shared
    label whose values differ in `columns` counts as one difference.
    When an index contains duplicates, label matching is ambiguous, so rows are
    compared by position instead.
    """
    if df1.index.is_unique and df2.index.is_unique:
        shared_labels = df1.index.intersection(df2.index)
        num_unmatched = len(df1.index.symmetric_difference(df2.index, sort=False))
        # Selecting the same labels in the same order makes both sides
        # identically labelled, which is what DataFrame.compare requires.
        num_changed = len(df1.loc[shared_labels, columns].compare(df2.loc[shared_labels, columns]))
        return num_unmatched + num_changed

    left = df1[columns].reset_index(drop=True)
    right = df2[columns].reset_index(drop=True)
    return len(left.compare(right))


def check_for_differences_between_df(df1, df2, columns_to_exclude: list=None):
    """Checks for differences between two dataframes, returns the number of differences.

    Only the columns present in both dataframes (minus `columns_to_exclude`) are
    compared.

    Args:
        df1: First dataframe (typically the original).
        df2: Second dataframe (typically the updated one).
        columns_to_exclude: Optional list of column names to leave out of the
            comparison.

    Returns:
        int: Always an integer, determined as follows:
            * identically labelled dataframes: number of rows that differ
              according to `pandas.DataFrame.compare`;
            * different lengths: the absolute difference in row counts, or 0
              (with a warning printed) when one of the dataframes is empty;
            * same length but different index labels: number of rows that
              differ after matching rows by index label.
    """
    # Find the intersection of the columns
    intersecting_columns = list(df1.columns.intersection(df2.columns))

    print(f"Number of intersecting columns: {len(intersecting_columns)}")
    print(f"Checking for differences accross the following columns: {intersecting_columns}")

    # Remove columns_to_exclude from intersecting_columns
    if columns_to_exclude is not None:
        intersecting_columns = [column for column in intersecting_columns if column not in columns_to_exclude]

    try:
        # Compare the values in the intersecting columns
        # See here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.compare.html
        differences = df1[intersecting_columns].compare(df2[intersecting_columns])
        return len(differences)
    except ValueError as e:
        # DataFrame.compare raises ValueError when the two frames are not
        # identically labelled (e.g. one of them had rows filtered out).
        print(f"Error: {e}")
        print("Couldn't compare via pandas.DataFrame.compare, trying via lengths...")

    # Same number of rows but different labels: lengths alone say nothing, so
    # match rows by label instead of falling through and returning None.
    if len(df1) == len(df2):
        return _count_differences_by_label(df1, df2, intersecting_columns)

    # An empty dataframe on either side most likely means nothing was loaded or
    # updated, so report no differences rather than "everything changed".
    # (Checking emptiness directly avoids wrongly returning 0 when one frame
    # simply happens to be twice as long as the other.)
    if len(df1) == 0 or len(df2) == 0:
        print("Something went wrong, the difference in dataframe lengths is the same as one of the dataframe lengths, potentially there were no new updates?")
        print(f"Returning 0 differences between df lengths: (df1: {len(df1)}, df2: {len(df2)})")
        return 0

    differences = abs(len(df1) - len(df2))
    print(f"Difference in dataframe lengths: {differences} (aboslute value of {len(df1)} - {len(df2)})")
    return differences

In [23]:
# Sanity check: row count of the working copy
len(updated_annotations)

148386

In [24]:
# Sanity check: row count of the originally loaded annotations
len(annotations)

148386

In [28]:
# Make sure annotations only have images in GCP bucket
updated_annotations = annotations[annotations['image_name'].isin(gs_image_names)]

# Check for differences in annotations
print(f"[INFO] There are {len(annotations)} annotations before filtering")
print(f"[INFO] There are {len(updated_annotations)} annotations after filtering (removed: {len(annotations) - len(updated_annotations)}))")

# from utils.misc import check_for_differences_between_df

num_differences = check_for_differences_between_df(annotations, updated_annotations)

[INFO] There are 148386 annotations before filtering
[INFO] There are 147860 annotations after filtering (removed: 526))
Number of intersecting columns: 11
Checking for differences accross the following columns: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
Error: Can only compare identically-labeled DataFrame objects
Couldn't compare via pandas.DataFrame.compare, trying via lengths...
Difference in dataframe lengths: 526 (aboslute value of 148386 - 147860)


In [29]:
# Upload the updated annotations to Google Storage and track the changes
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time
import os

GS_BUCKET_NAME = config.gs_bucket_name

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"

# Export the updated annotations to a CSV
columns_to_export = config.annotations_columns_to_export
print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")

# TODO: Check if the updated_annotations_reset_index and the original_annotations actually differ, if so save them and upload them, else exit
if num_differences > 0:
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Export the updated_annotations_reset_index to a csv
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload the updated CSV to Google Storage
    upload_to_gs(bucket_name=GS_BUCKET_NAME, 
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME, 
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Rename the old CSV on Google Storage
    bucket_to_move_old_annotations_to = "old_annotations"
    name_to_rename_old_annotations = os.path.join(bucket_to_move_old_annotations_to, f"{get_now_time()}_old_annotations.csv")

    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # Rename the "updated_annotations.csv" on Google Storage to "annotations.csv" 
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 526 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 30297322 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-03-15_12-10-32_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...
