# Imports

In [1]:
import os
import pandas as pd
from google.cloud import storage
# import numpy as np
# from tqdm.auto import tqdm
# from sklearn.model_selection import train_test_split
# from matplotlib import pyplot as plt
# from PIL import Image
# import cv2
# import tensorflow as tf
# from tensorflow import keras
# from keras_cv import bounding_box
# from keras_cv import visualization
# import keras_cv
# import shutil
#os.chdir allows you to change directories, like cd in the Terminal
#os.chdir('/content/drive/MyDrive/LeWagon/Notebook')


# Load raw data

In [2]:
# Root of the project: the parent of the directory the notebook runs from
project_root = os.path.split(os.getcwd())[0]

# Where the raw images and the bounding-box annotations live locally
test_image_folder = os.path.join(project_root, 'raw_data', 'image_data')
path_annot = os.path.join(project_root, 'raw_data', 'bbox_train.csv')


In [3]:
# Read the bounding-box annotations and preview the first rows
df = pd.read_csv(filepath_or_buffer=path_annot)
df.head(n=5)


Unnamed: 0,Name,width,height,xmin,ymin,xmax,ymax
0,10001.jpg,612,408,192,199,230,235
1,10001.jpg,612,408,247,168,291,211
2,10001.jpg,612,408,321,176,366,222
3,10001.jpg,612,408,355,183,387,214
4,10002.jpg,612,408,339,165,378,202


# Check which images have annotations. Only those are pushed to Google Cloud Storage

In [4]:
# Distinct image file names that appear in the annotations table
annotated_images_names = df['Name'].unique()


# Load data to Cloud Storage Bucket

In [5]:
# Client used for every Cloud Storage call below
storage_client = storage.Client()

# Bucket holding the project's raw data, and a handle on it
bucket_name = 'raw_data_facetally'
bucket = storage_client.bucket(bucket_name=bucket_name)

# Root of the project: the parent of the directory the notebook runs from
project_root = os.path.split(os.getcwd())[0]

# Local sources: the annotations sit in raw_data, the images one level below
local_directory_annotations = os.path.join(project_root, 'raw_data')
local_directory_images = os.path.join(local_directory_annotations, 'image_data')

# Prefix ("folder") the images are stored under inside the bucket
destination_directory_images = 'image_data/'

# Filter images with annotations.
# The annotated names go into a set so each directory entry is checked with one
# hash lookup instead of being compared against every annotated name in turn.
# The result keeps the order returned by os.listdir.
annotated_names_lookup = set(annotated_images_names)
raw_images_names = os.listdir(local_directory_images)
filtered_images_names = [
    image_name for image_name in raw_images_names
    if image_name in annotated_names_lookup
]


In [47]:
# Upload the images.
# The names already stored under the destination prefix are fetched with one
# listing up front, so the loop does not pay a blob.exists() network round trip
# for every single file.
existing_blob_names = {
    existing_blob.name
    for existing_blob in bucket.list_blobs(prefix=destination_directory_images)
}

# Only files with one of these extensions are treated as images
image_extensions = ('.jpg', '.jpeg', '.png')

print("Uploading images")
for count, filename in enumerate(filtered_images_names):
    if not filename.endswith(image_extensions):
        continue

    local_image_path = os.path.join(local_directory_images, filename)
    # destination_directory_images ends with '/', so the join yields '<prefix>/<file>'
    destination_blob_name = os.path.join(destination_directory_images, filename)

    if destination_blob_name in existing_blob_names:
        print(f"The file '{destination_blob_name}' already exists, won't be pushed. Count: {count + 1}")
    else:
        # Create a Blob object and upload the image file to the bucket
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(local_image_path)
        print(f"Image '{filename}' uploaded to Google Cloud Storage: {destination_blob_name}. Count: {count + 1}")
print("Process finished")


Uploading images
The file 'image_data/15603.jpg' already exists, won't be pushed. Count: 1
The file 'image_data/13153.jpg' already exists, won't be pushed. Count: 2
The file 'image_data/12753.jpg' already exists, won't be pushed. Count: 3
The file 'image_data/15241.jpg' already exists, won't be pushed. Count: 4
The file 'image_data/10028.jpg' already exists, won't be pushed. Count: 5
The file 'image_data/16053.jpg' already exists, won't be pushed. Count: 6
The file 'image_data/17084.jpg' already exists, won't be pushed. Count: 7
The file 'image_data/16335.jpg' already exists, won't be pushed. Count: 8
The file 'image_data/16231.jpg' already exists, won't be pushed. Count: 9
The file 'image_data/14072.jpg' already exists, won't be pushed. Count: 10
The file 'image_data/16428.jpg' already exists, won't be pushed. Count: 11
The file 'image_data/17546.jpg' already exists, won't be pushed. Count: 12
The file 'image_data/17717.jpg' already exists, won't be pushed. Count: 13
The file 'image_d

In [12]:
 # List objects in the specified folder path
folder_objects = [*bucket.list_blobs(prefix=destination_directory_images)]

# Report how many objects were found under that prefix
num_objects = len(folder_objects)
print(f"Number of objects in '{destination_directory_images}': {num_objects}")


Number of objects in 'image_data/': 5733


In [13]:
# Push the annotations CSV to the bucket, stored under its own file name
csv_name = 'bbox_train.csv'
local_annotations_path = os.path.join(local_directory_annotations, csv_name)

blob = bucket.blob(blob_name=csv_name)
blob.upload_from_filename(filename=local_annotations_path)

print(f"File '{local_annotations_path}' uploaded to '{csv_name}'.")


File '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/bbox_train.csv' uploaded to 'bbox_train.csv'.


# Download data from Cloud Storage if it doesn't exist locally

In [36]:
def download_images_from_GCP(bucket_name, folder_path, destination_folder, overwrite = False):
    """Download the objects stored under a bucket prefix into a local folder.

    Every object whose name starts with ``folder_path`` is written to
    ``destination_folder`` under the last component of its object name.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        folder_path: Object-name prefix to list (a "folder" such as
            ``'image_data/'`` or a full object name).
        destination_folder: Local directory that receives the files; it is
            created if it does not exist.
        overwrite: When False (default), objects whose local file already
            exists are skipped. When True, they are downloaded again.

    Returns:
        True if at least one object was downloaded, False otherwise.
    """
    # Boolean to control if anything has been downloaded
    changes = False

    # Initialize the Cloud Storage client and get the bucket
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Create the local folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)

    for obj in bucket.list_blobs(prefix=folder_path):
        file_name = os.path.basename(obj.name)
        if not file_name:
            # An object name ending in '/' has no file name component; joining it
            # would point at the destination directory itself, so skip it.
            continue

        local_file_path = os.path.join(destination_folder, file_name)
        if not overwrite and os.path.exists(local_file_path):
            continue

        # Download next to the final location and move into place only once the
        # transfer has finished, so an interrupted run never leaves a truncated
        # file that a later run would mistake for an already downloaded one.
        partial_file_path = local_file_path + '.part'
        try:
            obj.download_to_filename(partial_file_path)
            os.replace(partial_file_path, local_file_path)
        except BaseException:
            # Also covers KeyboardInterrupt: clean up, then let it propagate
            if os.path.exists(partial_file_path):
                os.remove(partial_file_path)
            raise

        print(f"Downloaded '{obj.name}' to '{local_file_path}'.")
        changes = True

    return changes


In [17]:
# Root of the project: the parent of the directory the notebook runs from
project_root = os.path.split(os.getcwd())[0]

# Local directories: annotations sit in raw_data, images one level below
local_annot_folder = os.path.join(project_root, 'raw_data')
local_image_folder = os.path.join(local_annot_folder, 'image_data')

# Names used inside the bucket: the image prefix and the annotations file
bucket_image_folder = 'image_data/'
csv_name = 'bbox_train.csv'


In [37]:
print('Updating local raw data from Google Cloud Storage...')

# Fetch the annotations file first, then the images; each call reports
# whether it downloaded anything
changes_csv = download_images_from_GCP(
    bucket_name=bucket_name,
    folder_path=csv_name,
    destination_folder=local_annot_folder,
)
changes_images = download_images_from_GCP(
    bucket_name=bucket_name,
    folder_path=bucket_image_folder,
    destination_folder=local_image_folder,
)

if not changes_csv and not changes_images:
    print('Local raw data folder is up to date')


Updating local raw data from Google Cloud Storage...
Downloaded 'image_data/10041.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10041.jpg'.
Downloaded 'image_data/10042.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10042.jpg'.
Downloaded 'image_data/10043.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10043.jpg'.
Downloaded 'image_data/10045.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10045.jpg'.
Downloaded 'image_data/10046.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10046.jpg'.
Downloaded 'image_data/10049.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10049.jpg'.


KeyboardInterrupt: 