## Manually label and upload photos to the database

* **Input:** folder with classifc image classification format:

```
images/
    class_1/
        image_1.jpg
        image_2.jpg
        ...
    class_2/
        image_3.jpeg
        image_4.jpeg
        ...
    ...
```

* **Output:** Labelled images stored in GCP and tracked with Weights & Biases Artifacts. 


In [2]:
# Append the upper level directory to sys
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

In [3]:
# !rm -rf food_photos/*
# !rm -rf _MACOSX
# !rm -rf _MACOSX/*

In [4]:
# !unzip -q 2023-02-15-food_photos.zip

In [5]:
# Get list of all paths in food_photos directory
target_dir = '2023-03-08-food_photos'
path = Path(target_dir)
all_paths = sorted(list(path.glob('*/*.jp*g')))
all_paths[:10]

[PosixPath('2023-03-08-food_photos/almonds/2294afec-0c7e-4c8f-ac92-1ada00567573.jpeg'),
 PosixPath('2023-03-08-food_photos/almonds/62cbd33b-6219-4594-9375-32b177d77a20.jpeg'),
 PosixPath('2023-03-08-food_photos/almonds/85354abd-c96b-4528-9db1-68965590c803.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_green/978041ba-6e9d-4046-8330-e6619b2b7b17.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_green/a07d8896-0dc5-4ea4-9c25-47be12ee0728.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_red/1cd9683f-4da4-48b3-a402-da9d7bc89553.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_red/ff425b65-9c41-466f-8e2a-b0d84af4de87.jpeg'),
 PosixPath('2023-03-08-food_photos/asparagus/0b59353f-a55b-4b2a-bf7d-d6a5b07373aa.jpeg'),
 PosixPath('2023-03-08-food_photos/avocado/0abde6b1-6b32-48a4-a9a0-9c9865bc6de3.jpeg'),
 PosixPath('2023-03-08-food_photos/avocado/3288b26e-5a13-420c-9af0-43edf5914b47.jpeg')]

In [6]:
len(all_paths)

434

In [7]:
import uuid

def is_valid_uuid(string):
    try:
        uuid.UUID(string)
        return True
    except ValueError:
        return False

filename = "some-file-name"
if is_valid_uuid(filename):
    print(f"{filename} is a valid UUID")
else:
    print(f"{filename} is not a valid UUID")

some-file-name is not a valid UUID


In [8]:
# Loop through all_paths, if the path is a UUID, pass, if not, rename the file to a UUID
for path in all_paths:
    if not is_valid_uuid(path.stem):
        print(f"[INFO] {path} isn't valud UUID, renaming to UUID...")
        new_name = f"{uuid.uuid4()}.jpeg"
        new_path_name = path.parent / new_name
        path.rename(new_path_name)
        print(f"[INFO] Renamed {path} to {new_path_name}")

# Get list of all paths in food_photos directory
path = Path(target_dir)
all_paths = sorted(list(path.glob('*/*.jpeg')))
all_paths[:10]

[PosixPath('2023-03-08-food_photos/almonds/2294afec-0c7e-4c8f-ac92-1ada00567573.jpeg'),
 PosixPath('2023-03-08-food_photos/almonds/62cbd33b-6219-4594-9375-32b177d77a20.jpeg'),
 PosixPath('2023-03-08-food_photos/almonds/85354abd-c96b-4528-9db1-68965590c803.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_green/978041ba-6e9d-4046-8330-e6619b2b7b17.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_green/a07d8896-0dc5-4ea4-9c25-47be12ee0728.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_red/1cd9683f-4da4-48b3-a402-da9d7bc89553.jpeg'),
 PosixPath('2023-03-08-food_photos/apple_red/ff425b65-9c41-466f-8e2a-b0d84af4de87.jpeg'),
 PosixPath('2023-03-08-food_photos/asparagus/0b59353f-a55b-4b2a-bf7d-d6a5b07373aa.jpeg'),
 PosixPath('2023-03-08-food_photos/avocado/0abde6b1-6b32-48a4-a9a0-9c9865bc6de3.jpeg'),
 PosixPath('2023-03-08-food_photos/avocado/3288b26e-5a13-420c-9af0-43edf5914b47.jpeg')]

In [9]:
# Find the number of .jpeg images in each subfolder of food_photos and create a dictionary mapping the subfolder name to the number of images
subfolder_to_num_images = {}
for p in all_paths:
    subfolder = p.parent.stem
    if subfolder in subfolder_to_num_images:
        subfolder_to_num_images[subfolder] += 1
    else:
        subfolder_to_num_images[subfolder] = 1
subfolder_to_num_images

{'almonds': 3,
 'apple_green': 2,
 'apple_red': 2,
 'asparagus': 1,
 'avocado': 9,
 'bacon': 4,
 'banana': 8,
 'banana_bread': 2,
 'basil': 5,
 'beef_mince': 7,
 'beer': 1,
 'blackberries': 2,
 'blueberries': 1,
 'bok_choy': 1,
 'bread': 11,
 'bread_naan': 6,
 'broccoli': 1,
 'broccolini': 6,
 'brussel_sprouts': 1,
 'butter': 2,
 'cabbage_green': 4,
 'cabbage_red': 1,
 'capsicum_green': 2,
 'capsicum_red': 5,
 'capsicum_yellow': 4,
 'carrot': 4,
 'carrot_purple': 2,
 'cauliflower': 3,
 'celery': 1,
 'cheese': 7,
 'chilli': 3,
 'chocolate': 2,
 'cookies': 1,
 'coriander': 1,
 'corn_chips': 1,
 'croissant': 1,
 'cucumber': 11,
 'curry_chicken': 3,
 'dill': 1,
 'donuts': 6,
 'dragonfruit': 6,
 'eggplant': 1,
 'eggs': 5,
 'fennel': 1,
 'figs': 1,
 'fruit_salad': 4,
 'fruit_smoothie': 1,
 'garlic': 11,
 'garlic_chives': 2,
 'ginger': 3,
 'grapefruit': 1,
 'grapes_red': 1,
 'grapes_white': 1,
 'green_beans': 4,
 'ham': 4,
 'honey': 6,
 'ice_coffee': 1,
 'ice_cream': 6,
 'kale': 10,
 'kiwifru

In [10]:
len(subfolder_to_num_images)

138

In [11]:
# Get all food_types from all_paths
food_types = sorted(list(set([p.parent.name for p in all_paths])))
food_types

['almonds',
 'apple_green',
 'apple_red',
 'asparagus',
 'avocado',
 'bacon',
 'banana',
 'banana_bread',
 'basil',
 'beef_mince',
 'beer',
 'blackberries',
 'blueberries',
 'bok_choy',
 'bread',
 'bread_naan',
 'broccoli',
 'broccolini',
 'brussel_sprouts',
 'butter',
 'cabbage_green',
 'cabbage_red',
 'capsicum_green',
 'capsicum_red',
 'capsicum_yellow',
 'carrot',
 'carrot_purple',
 'cauliflower',
 'celery',
 'cheese',
 'chilli',
 'chocolate',
 'cookies',
 'coriander',
 'corn_chips',
 'croissant',
 'cucumber',
 'curry_chicken',
 'dill',
 'donuts',
 'dragonfruit',
 'eggplant',
 'eggs',
 'fennel',
 'figs',
 'fruit_salad',
 'fruit_smoothie',
 'garlic',
 'garlic_chives',
 'ginger',
 'grapefruit',
 'grapes_red',
 'grapes_white',
 'green_beans',
 'ham',
 'honey',
 'ice_coffee',
 'ice_cream',
 'kale',
 'kiwifruit',
 'leek',
 'lemon',
 'lettuce_cos',
 'lettuce_iceberg',
 'lime',
 'lychee',
 'mandarin',
 'mango',
 'marjoram',
 'melon_candy',
 'melon_dino',
 'melon_spanish',
 'milk',
 'mint'

## Download original labels from Weights & Biases

In [12]:
# Get config
from configs.default_config import config

args = config
print(args)

namespace(annotations_columns_to_export=['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source'], auto_augment=True, batch_size=128, epochs=10, gs_bucket_name='food_vision_bucket_with_object_versioning', gs_image_storage_path='https://storage.cloud.google.com/food_vision_bucket_with_object_versioning/all_images/', input_size=224, label_smoothing=0.1, learning_rate=0.001, model='coatnext_nano_rw_224', num_to_try_and_autocorrect=1000, num_top_n_preds=5, path_to_gcp_credentials='utils/google-storage-key.json', path_to_label_studio_api_key='utils/label_studio_api_key.json', pretrained=True, seed=42, use_mixed_precision=True, wandb_dataset_artifact='food_vision_199_classes_images:latest', wandb_job_type='', wandb_labels_artifact='food_vision_labels:latest', wandb_model_artifact='trained_model:latest', wandb_project='test_wandb_artifacts_by_reference', wandb_run_note

In [13]:
# Connect to GCP
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


In [14]:
import wandb

# Initialize a new run
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

notes = f"add {len(all_paths)} manually taken photos to the training dataset"

run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['manual_photo_upload'],
                 notes=notes)

annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrdbourke[0m. Use [1m`wandb login --relogin`[0m to force relogin


[INFO] Labels directory: ./artifacts/food_vision_labels:v42
[INFO] Labels path: artifacts/food_vision_labels:v42/annotations.csv
[INFO] Working with: 252 classes


In [15]:
len(annotations.label.unique())

252

In [16]:
# Make a copy of the annotations
original_annotations = annotations.copy()

In [17]:
# Get the columns of the annotations
columns_to_create = list(original_annotations.columns)
columns_to_create

['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

In [18]:
# Get all the image paths from food_photos
image_paths = list(Path(target_dir).glob('*/*.jpeg'))
len(image_paths)

434

In [19]:
# Check to see what food_types are in class_names
food_types_in_class_names = [food_type for food_type in food_types if food_type in class_names]

# Check to see what food_types are not in class_names
food_types_not_in_class_names = [food_type for food_type in food_types if food_type not in class_names]
len(food_types_in_class_names)

121

In [106]:
food_types_in_class_names

['almonds', 'cashews', 'macadamia', 'pecans', 'pistachios', 'walnuts']

In [20]:
food_types_not_in_class_names

['carrot_purple',
 'dill',
 'fruit_salad',
 'garlic_chives',
 'kiwifruit',
 'marjoram',
 'melon_candy',
 'melon_dino',
 'melon_spanish',
 'oregano',
 'plum_sugar',
 'pumpkin_butternut',
 'rocket',
 'sage',
 'silverbeet',
 'tarragon',
 'thyme']

In [21]:
# Check to see if "macadamia" in class_names
[class_name for class_name in class_names if "pis" in class_name]

['pistachio']

In [22]:
# How to slot new classes into existing class_dict?
# Then update the existing annotations with the new class labels (e.g. "apple_green" -> 2, "apple_red" -> 3)

# Add new classes to class_dict
for food_type in food_types_not_in_class_names:
    class_dict[len(class_dict)] = food_type
    class_names.append(food_type)
    reverse_class_dict[food_type] = len(class_dict)-1

In [23]:
len(class_dict)

269

In [24]:
from utils.misc import get_now_time

# Create a list of dictionaries and fill out the columns of the annotations
columns = ['filename',
 'image_name',
 'class_name',
 'label',
 'split',
 'clear_or_confusing',
 'whole_food_or_dish',
 'one_food_or_multiple',
 'label_last_updated_at',
 'label_source',
 'image_source']

food_image_dict_list = []

LABEL_SOURCE = "manual_upload"
IMAGE_SOURCE = "manual_upload"
LABEL_LAST_UPDATED_AT = get_now_time()

for image_path in image_paths:
    food_image_dict = {}
    food_image_dict['filename'] = image_path
    food_image_dict['image_name'] = image_path.name
    food_image_dict['class_name'] = image_path.parent.name
    food_image_dict['label'] = reverse_class_dict[image_path.parent.name]
    # food_image_dict['split'] = 'train'
    # Label 20% of the images as test
    if np.random.random() < 0.2:
        food_image_dict['split'] = 'test'
    else:
        food_image_dict['split'] = 'train'
    food_image_dict['clear_or_confusing'] = 'clear'
    food_image_dict['whole_food_or_dish'] = 'whole_food'
    food_image_dict['one_food_or_multiple'] = 'one_food'
    food_image_dict['label_last_updated_at'] = LABEL_LAST_UPDATED_AT
    food_image_dict['label_source'] = LABEL_SOURCE
    food_image_dict['image_source'] = IMAGE_SOURCE

    food_image_dict_list.append(food_image_dict)

# Create a dataframe from the list of dictionaries
import pandas as pd
new_annotations = pd.DataFrame(food_image_dict_list)
new_annotations.head()

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,2023-03-08-food_photos/wombok/b0ee4f8f-62d5-49...,b0ee4f8f-62d5-490c-9a01-6b0af3f8a66a.jpeg,wombok,249,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
1,2023-03-08-food_photos/wombok/ff42e0c7-d1d4-46...,ff42e0c7-d1d4-46a0-a587-d6529badb591.jpeg,wombok,249,test,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
2,2023-03-08-food_photos/wombok/0713c1f2-51b2-49...,0713c1f2-51b2-49ea-812b-88098aadfbd5.jpeg,wombok,249,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
3,2023-03-08-food_photos/pain_au_chocolat/0f9386...,0f938616-5a71-4248-8856-d785076512bf.jpeg,pain_au_chocolat,155,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
4,2023-03-08-food_photos/pain_au_chocolat/78b873...,78b87304-15b2-444f-96c6-16393d692e8c.jpeg,pain_au_chocolat,155,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload


In [25]:
# Print how many images are train/test in new_annotations
new_annotations['split'].value_counts()

train    363
test      71
Name: split, dtype: int64

In [26]:
# Find a random sample of 10 images where the label value is 199 or higher
new_annotations[new_annotations['label'] >= 199].sample(10)

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
11,2023-03-08-food_photos/melon_candy/5215b380-ac...,5215b380-acf2-4079-8e41-4c1415013c16.jpeg,melon_candy,258,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
253,2023-03-08-food_photos/fruit_salad/f4e01cd8-ec...,f4e01cd8-ec04-495c-9a34-d6969b1a1936.jpeg,fruit_salad,254,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
254,2023-03-08-food_photos/fruit_salad/ecd43b88-8e...,ecd43b88-8e8b-4ad0-bbf9-696334fef50c.jpeg,fruit_salad,254,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
233,2023-03-08-food_photos/garlic_chives/a089ae06-...,a089ae06-0476-4c96-bcf6-4a6893db6b90.jpeg,garlic_chives,255,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
252,2023-03-08-food_photos/fruit_salad/48ffd0e6-81...,48ffd0e6-81fe-40c6-9f9a-8135506a916e.jpeg,fruit_salad,254,test,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
265,2023-03-08-food_photos/sandwich_toasted/3c70b0...,3c70b073-a3d6-42b5-9cc5-ed9930d63e10.jpeg,sandwich_toasted,213,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
152,2023-03-08-food_photos/toast/2f2acf73-4493-4e3...,2f2acf73-4493-4e31-b1ee-3b58d6ac6b2e.jpeg,toast,236,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
143,2023-03-08-food_photos/melon_spanish/6c5e5deb-...,6c5e5deb-23fa-4f58-8674-97f922af68f7.jpeg,melon_spanish,260,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
419,2023-03-08-food_photos/silverbeet/3671b10f-64d...,3671b10f-64d8-43b3-b0f7-8e3b3c69c385.jpeg,silverbeet,266,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
84,2023-03-08-food_photos/tarragon/9172353f-97be-...,9172353f-97be-43d9-bc53-0653bffc7e84.jpeg,tarragon,267,test,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload


In [27]:
len(original_annotations), len(new_annotations)

(114882, 434)

In [28]:
# Append the new_annotations to the original_annotations
updated_annotations = pd.concat([original_annotations, new_annotations], ignore_index=True)

# updated_annotations = original_annotations.(new_annotations, ignore_index=True)
updated_annotations

Unnamed: 0,filename,image_name,class_name,label,split,clear_or_confusing,whole_food_or_dish,one_food_or_multiple,label_last_updated_at,label_source,image_source
0,test/pain_au_chocolat/4fd7cb42-bd7f-48f1-bfdc-...,4fd7cb42-bd7f-48f1-bfdc-607c2f54b788.jpg,pain_au_chocolat,155,test,,,,,,internet_download
1,test/pain_au_chocolat/2062f52a-781c-4e4f-b8a7-...,2062f52a-781c-4e4f-b8a7-0a108934f453.jpg,pain_au_chocolat,155,test,,,,,,internet_download
2,test/pain_au_chocolat/8003e0f6-37e8-460d-9c14-...,8003e0f6-37e8-460d-9c14-e7c6fe44a37f.jpg,pain_au_chocolat,155,test,,,,,,internet_download
3,test/pain_au_chocolat/839437c8-c643-408f-9f04-...,839437c8-c643-408f-9f04-d0d3bec238c3.jpg,pain_au_chocolat,155,test,,,,,,internet_download
4,test/pain_au_chocolat/ca5c13ff-a535-4b69-9144-...,ca5c13ff-a535-4b69-9144-e06275e01e35.jpg,pain_au_chocolat,155,test,,,,,,internet_download
...,...,...,...,...,...,...,...,...,...,...,...
115311,2023-03-08-food_photos/mandarin/be4330ea-d787-...,be4330ea-d787-4f78-a919-2c6cfd3f1860.jpeg,mandarin,128,test,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
115312,2023-03-08-food_photos/ice_coffee/fb51baf8-efd...,fb51baf8-efd1-4138-9ceb-f322062a3814.jpeg,ice_coffee,108,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
115313,2023-03-08-food_photos/snow_peas/03e27f58-e4e4...,03e27f58-e4e4-461b-baee-72b519b4a64c.jpeg,snow_peas,220,test,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload
115314,2023-03-08-food_photos/onion_brown/e09cc129-fb...,e09cc129-fbaf-4ebf-9934-a1e923e08d9c.jpeg,onion_brown,148,train,clear,whole_food,one_food,2023-03-09_08-24-32,manual_upload,manual_upload


In [29]:
# How many unique class_names are in original_annotations?
len(original_annotations['class_name'].unique())

252

In [30]:
# How many unique class_names are in updated_annotations?
len(updated_annotations['class_name'].unique())

269

In [31]:
from typing import List, Dict, Tuple

def get_updated_class_names_class_dict_and_reverse_class_dict(df: pd.DataFrame) -> Tuple[List[str], Dict[int, str], Dict[str, int]]:
    """Get an updated class names list, class dict and reverse class dict from a dataframe."""
    updated_class_names = sorted(df.class_name.unique())
    updated_class_dict = {i: class_name for i, class_name in enumerate(updated_class_names)}
    updated_reverse_class_dict = {class_name: i for i, class_name in enumerate(updated_class_names)}
    assert len(updated_class_dict) == len(updated_reverse_class_dict), "Class dict and reverse class dict are not the same length"
    return updated_class_names, updated_class_dict, updated_reverse_class_dict

def map_updated_class_dict_to_updated_annotations(df: pd.DataFrame, updated_reverse_class_dict: Dict[str, int]) -> pd.DataFrame:
    """Map updated class dict to updated annotations dataframe.
    
    For example, go from {"apple_red": 1} -> df["label"] = 1
    """
    updated_annotations = df.copy()
    updated_annotations.loc[:, 'label'] = updated_annotations['class_name'].map(updated_reverse_class_dict)
    assert len(updated_annotations.label.unique()) == len(updated_reverse_class_dict), "Number of unique labels in updated annotations does not match number of unique class names in updated reverse class dict"
    return updated_annotations

In [32]:
updated_class_names, updated_class_dict, updated_reverse_class_dict = get_updated_class_names_class_dict_and_reverse_class_dict(updated_annotations)

updated_annotations = map_updated_class_dict_to_updated_annotations(updated_annotations, updated_reverse_class_dict)

len(updated_class_names), updated_class_names[:10]

(269,
 ['almond_butter',
  'almonds',
  'apple_green',
  'apple_red',
  'apricot',
  'artichoke',
  'asparagus',
  'avocado',
  'bacon',
  'bacon_and_egg_burger'])

In [33]:
# Check the difference in lengths between the original_annotations and updated_annotations
num_differences = len(updated_annotations) - len(original_annotations)
num_differences

434

In [34]:
# Upload the updated annotations to Google Storage and track the changes
from utils.gcp_utils import upload_to_gs, rename_blob, delete_blob
from utils.wandb_utils import wandb_add_artifact_with_reference
from utils.misc import get_now_time
import os

GS_BUCKET_NAME = config.gs_bucket_name

UPDATED_ANNOTATIONS_TARGET_FILENAME = "updated_annotations.csv"
ORIGINAL_ANNOTATIONS_TARGET_FILENAME = "annotations.csv"

# Export the updated annotations to a CSV
columns_to_export = config.annotations_columns_to_export
print(f"[INFO] Exporting the following columns to {UPDATED_ANNOTATIONS_TARGET_FILENAME}: {columns_to_export}")

# TODO: Check if the updated_annotations_reset_index and the original_annotations actually differ, if so save them and upload them, else exit
if num_differences > 0:
    print(f"[INFO] {num_differences} changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv")

    # Export the updated_annotations_reset_index to a csv
    updated_annotations[columns_to_export].to_csv(UPDATED_ANNOTATIONS_TARGET_FILENAME, index=False)

    # Upload the updated CSV to Google Storage
    upload_to_gs(bucket_name=GS_BUCKET_NAME, 
                 source_file_name=UPDATED_ANNOTATIONS_TARGET_FILENAME, 
                 destination_blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME)

    # Rename the old CSV on Google Storage
    bucket_to_move_old_annotations_to = "old_annotations"
    name_to_rename_old_annotations = os.path.join(bucket_to_move_old_annotations_to, f"{get_now_time()}_old_annotations.csv")

    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME,
                new_name=name_to_rename_old_annotations)

    # Rename the "updated_annotations.csv" on Google Storage to "annotations.csv" 
    rename_blob(bucket_name=GS_BUCKET_NAME,
                blob_name=UPDATED_ANNOTATIONS_TARGET_FILENAME,
                new_name=ORIGINAL_ANNOTATIONS_TARGET_FILENAME)

    # Track the changes in the annotations with Weights & Biases
    annotations_path_on_gcs = f"gs://{GS_BUCKET_NAME}/{ORIGINAL_ANNOTATIONS_TARGET_FILENAME}"
    wandb_add_artifact_with_reference(wandb_run=run,
                                      artifact_name="food_vision_labels",
                                      artifact_type="labels",
                                      description="Labels for FoodVision project",
                                      reference_path=annotations_path_on_gcs)
else:
    print("[INFO] No changes to annotations.csv, updated label files and original annotations are the same, try fixing/updating the label files and try again")

[INFO] Exporting the following columns to updated_annotations.csv: ['filename', 'image_name', 'class_name', 'label', 'split', 'clear_or_confusing', 'whole_food_or_dish', 'one_food_or_multiple', 'label_last_updated_at', 'label_source', 'image_source']
[INFO] 434 changes to annotations.csv, updated label files and original annotations are different, saving the updated annotations.csv
[INFO] Uploading updated_annotations.csv to updated_annotations.csv...
[INFO] Connected to Google Storage bucket: food_vision_bucket_with_object_versioning
[INFO] File updated_annotations.csv uploaded to food_vision_bucket_with_object_versioning/updated_annotations.csv.
[INFO] File size: 22214059 bytes
[INFO] Blob annotations.csv has been renamed to old_annotations/2023-03-09_08-24-57_old_annotations.csv
[INFO] Blob updated_annotations.csv has been renamed to annotations.csv
[INFO] Logging 'food_vision_labels' from 'gs://food_vision_bucket_with_object_versioning/annotations.csv' to Weights & Biases...


## Upload all photos in food_photos to Google Storage

In [35]:
from utils.gcp_utils import upload_to_gs, get_list_of_blobs

# Get list of images already in GCP bucket
gs_image_paths = get_list_of_blobs(bucket_name=GS_BUCKET_NAME, prefix="all_images")
print(f"[INFO] There are {len(gs_image_paths)} images in the GCP bucket")

[INFO] There are 117574 images in the GCP bucket


In [36]:
gs_image_paths_name = [str(blob.name).split("/")[-1] for blob in gs_image_paths]
gs_image_paths_name[:10]

['000226a7-5332-4f45-b0e9-6760e9bd6d3e.jpeg',
 '0003a069-3b76-4cae-9414-80ccaa081e80.jpeg',
 '0003c8a1-7f64-4540-9256-3252f0981035.jpeg',
 '00045a69-b09f-4293-8c2e-a7ba27964fb6.jpg',
 '0004a23a-88b3-4aae-a1b0-c9ebe77d31b8.jpeg',
 '00065302-c0e7-4634-ab63-5ddd16bfdeb8.jpeg',
 '000747c7-e84b-44a3-9c9d-40076efbe7c4.jpeg',
 '00081a0e-c33c-4491-9bac-434afb5968c6.jpeg',
 '0008eda8-cd90-4394-bdac-aa1574f7738d.jpeg',
 '0009bea1-1577-4db5-bc60-b27eaed2d276.jpg']

In [37]:
len(gs_image_paths_name)

117574

In [38]:
print(target_dir)

2023-03-08-food_photos


In [39]:
# Upload the images to Google Storage
!gsutil -m cp -r $target_dir/*/*.jp*g gs://$GS_BUCKET_NAME/all_images/



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

Copying file://2023-03-08-food_photos/almonds/2294afec-0c7e-4c8f-ac92-1ada00567573.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/almonds/85354abd-c96b-4528-9db1-68965590c803.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/apple_green/978041ba-6e9d-4046-8330-e6619b2b7b17.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/apple_green/a07d8896-0dc5-4ea4-9c25-47be12ee0728.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/almonds/62cbd33b-6219-4594-9375-32b177d77a20.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/apple_red/ff425b65-9c41-466f-8e2a-b0d84af4de87.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/apple_red/1cd9683f-4da4-48b3-a402-da9d7bc89553.jpeg [Content-Type=image/jpeg]...
Copying file://2023-03-08-food_photos/asparagus

In [135]:
# # Test uploading of 1 image
# one_image = image_paths[0]

# # Upload the image to Google Storage
# upload_to_gs(bucket_name=GS_BUCKET_NAME,
#              source_file_name=one_image,
#              destination_blob_name=f"all_images/{one_image.name}")

In [40]:
# Get list of images already in GCP bucket
gs_image_paths = get_list_of_blobs(bucket_name=GS_BUCKET_NAME, prefix="all_images")
print(f"[INFO] There are {len(gs_image_paths)} images in the GCP bucket")

[INFO] There are 118008 images in the GCP bucket


In [41]:
gs_image_paths_name = [str(blob.name).split("/")[-1] for blob in gs_image_paths]
gs_image_paths_name[:10]

['000226a7-5332-4f45-b0e9-6760e9bd6d3e.jpeg',
 '0003a069-3b76-4cae-9414-80ccaa081e80.jpeg',
 '0003c8a1-7f64-4540-9256-3252f0981035.jpeg',
 '00045a69-b09f-4293-8c2e-a7ba27964fb6.jpg',
 '0004a23a-88b3-4aae-a1b0-c9ebe77d31b8.jpeg',
 '00065302-c0e7-4634-ab63-5ddd16bfdeb8.jpeg',
 '000747c7-e84b-44a3-9c9d-40076efbe7c4.jpeg',
 '00081a0e-c33c-4491-9bac-434afb5968c6.jpeg',
 '0008eda8-cd90-4394-bdac-aa1574f7738d.jpeg',
 '0009bea1-1577-4db5-bc60-b27eaed2d276.jpg']

In [42]:
for image_name in image_paths:
    if "b322004c-d78e-47e9-a2c1-1d5d116b1601.jpeg" in image_name.name:
        print(f"[INFO] {image_name} in GCP bucket")
        # print(f"[INFO] {image_name} not in GCP bucket")

In [43]:
# Loop through image_paths and upload the image as long as its name isn't in gs_image_paths_name
from tqdm.auto import tqdm

num_images_uploaded = 0
for image_path in tqdm(image_paths):
    if image_path.name not in gs_image_paths_name:
        
        # Upload the image to Google Storage
        upload_to_gs(bucket_name=GS_BUCKET_NAME,
                     source_file_name=image_path,
                     destination_blob_name=f"all_images/{image_path.name}")
        
        num_images_uploaded += 1

print(f"[INFO] Uploaded {num_images_uploaded} images to GCP bucket")

  0%|          | 0/434 [00:00<?, ?it/s]

[INFO] Uploaded 0 images to GCP bucket


In [44]:
# Track the changes to GCP bucket with Weights & Biases
from utils.wandb_utils import wandb_add_artifact_with_reference

In [45]:
# Track updated images_dir in Weights & Biases
food_images_path_on_gs = "gs://food_vision_bucket_with_object_versioning/all_images"

wandb_add_artifact_with_reference(wandb_run=run,
                                  artifact_name="food_vision_199_classes_images",
                                  artifact_type="dataset",
                                  description="Images for FoodVision project",
                                  reference_path=food_images_path_on_gs)

[INFO] Logging 'food_vision_199_classes_images' from 'gs://food_vision_bucket_with_object_versioning/all_images' to Weights & Biases...


[34m[1mwandb[0m: Generating checksum for up to 1000000000 objects with prefix "all_images"... Done. 24.1s


In [46]:
# Get list of images already in GCP bucket
gs_image_paths = get_list_of_blobs(bucket_name=GS_BUCKET_NAME, prefix="all_images")
print(f"[INFO] There are {len(gs_image_paths)} images in the GCP bucket")

[INFO] There are 118008 images in the GCP bucket


In [62]:
gs_image_paths[0].name

'all_images/000226a7-5332-4f45-b0e9-6760e9bd6d3e.jpeg'

In [None]:
# Next:
# Create labels CSV for all photos in food_photos ✅
# Add labels CSV to original annotations ✅
# Upload all photos in food_photos to GCP ✅
# Track updates in W&B ✅

# Train a model and track how it performs (by only upgrading training data)