# Imports

In [1]:
import os
import pandas as pd
from google.cloud import storage
from face_tally.params import *


# Load raw data

In [2]:
# Project path
project_root = LOCAL_DATA_PATH

# Directories path
test_image_folder = os.path.join(LOCAL_DATA_PATH, "image_data")
path_annot = os.path.join(LOCAL_DATA_PATH, 'bbox_train.csv')


In [3]:
df = pd.read_csv(path_annot)
df.head()


Unnamed: 0,Name,width,height,xmin,ymin,xmax,ymax
0,10001.jpg,612,408,192,199,230,235
1,10001.jpg,612,408,247,168,291,211
2,10001.jpg,612,408,321,176,366,222
3,10001.jpg,612,408,355,183,387,214
4,10002.jpg,612,408,339,165,378,202


# Check which images have anotations. Only those ones are pushed to Google Cloud

In [5]:
annotated_images_names = df.Name.unique()
len(annotated_images_names)


5733

# Create bucket

In [6]:
!$BUCKET_NAME


zsh:1: command not found: facetally_data


In [7]:
!gsutil mb -l $GCP_REGION -p $GCP_PROJECT gs://$BUCKET_NAME


Creating gs://facetally_data/...


# Load data to Cloud Storage Bucket

In [8]:
# Create a Storage client
storage_client = storage.Client()

# Get the bucket
bucket = storage_client.bucket(BUCKET_NAME)

# Directories path
local_directory_images = os.path.join(LOCAL_DATA_PATH, "image_data")
local_directory_annotations = os.path.join(LOCAL_DATA_PATH)

# Destination directory in the bucket
destination_directory_images = 'image_data/'  # Destination folder in the bucket

# Filter images with annotations
raw_images_names = os.listdir(local_directory_images)
filtered_images_names = []
for image in raw_images_names:
    for annotation in annotated_images_names:
        if image == annotation:
            filtered_images_names.append(image)


In [9]:
# Upload the images
print(f"Uploading images")
for count, filename in enumerate(filtered_images_names):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        local_image_path = os.path.join(local_directory_images, filename)
        destination_blob_name = os.path.join(destination_directory_images, filename)

        # Create a Blob object and upload the image file to the bucket
        blob = bucket.blob(destination_blob_name)
        if blob.exists():
            print(f"The file '{destination_blob_name}' already exists, won't be pushed. Count: {count + 1}")
        else:
            blob.upload_from_filename(local_image_path)
            print(f"Image '{filename}' uploaded to Google Cloud Storage: {destination_blob_name}. Count: {count + 1}")
print(f"Process finished")


Uploading images
Image '15603.jpg' uploaded to Google Cloud Storage: image_data/15603.jpg. Count: 1
Image '13153.jpg' uploaded to Google Cloud Storage: image_data/13153.jpg. Count: 2
Image '12753.jpg' uploaded to Google Cloud Storage: image_data/12753.jpg. Count: 3
Image '15241.jpg' uploaded to Google Cloud Storage: image_data/15241.jpg. Count: 4
Image '10028.jpg' uploaded to Google Cloud Storage: image_data/10028.jpg. Count: 5
Image '16053.jpg' uploaded to Google Cloud Storage: image_data/16053.jpg. Count: 6
Image '17084.jpg' uploaded to Google Cloud Storage: image_data/17084.jpg. Count: 7
Image '16335.jpg' uploaded to Google Cloud Storage: image_data/16335.jpg. Count: 8
Image '16231.jpg' uploaded to Google Cloud Storage: image_data/16231.jpg. Count: 9
Image '14072.jpg' uploaded to Google Cloud Storage: image_data/14072.jpg. Count: 10
Image '16428.jpg' uploaded to Google Cloud Storage: image_data/16428.jpg. Count: 11
Image '17546.jpg' uploaded to Google Cloud Storage: image_data/17546

In [10]:
 # List objects in the specified folder path
folder_objects = list(bucket.list_blobs(prefix=destination_directory_images))

# Count the number of objects in the folder
num_objects = len(folder_objects)
print(f"Number of objects in '{destination_directory_images}': {num_objects}")


Number of objects in 'image_data/': 5733


In [11]:
# Upload the CSV file
csv_name = 'bbox_train.csv'
local_annotations_path = os.path.join(local_directory_annotations, csv_name)
blob = bucket.blob(csv_name)
blob.upload_from_filename(local_annotations_path)
print(f"File '{local_annotations_path}' uploaded to '{csv_name}'.")


File '/home/klingenm/.lewagon/facetally_data/bbox_train.csv' uploaded to 'bbox_train.csv'.


# Download data from BigQuery if it doesn't exist locally

In [12]:
def download_images_from_GCP(bucket_name, folder_path, destination_folder, overwrite = False):

    # Boolean to control if anything has been downloaded
    changes = False

    # Initialize the Cloud Storage client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # List objects in the specified folder path
    folder_objects = bucket.list_blobs(prefix=folder_path)

    # Create a local folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)

    # Download each image file from the folder
    for obj in folder_objects:
        # Construct local file path for downloading
        local_file_path = os.path.join(destination_folder, os.path.basename(obj.name))
        # Check if the file exists locally
        if overwrite or not os.path.exists(local_file_path):
            # Download the object to the local file
            obj.download_to_filename(local_file_path)
            print(f"Downloaded '{obj.name}' to '{local_file_path}'.")
            changes = True

    return changes


In [17]:
# Lirectories path
local_image_folder = os.path.join(LOCAL_DATA_PATH, 'raw_data', 'image_data')
local_annot_folder = os.path.join(LOCAL_DATA_PATH, 'raw_data')

# Bucket paths
bucket_image_folder = 'image_data/'  # Destination folder in the bucket
csv_name = 'bbox_train.csv'


In [37]:
print('Updating local raw data from Google Cloud Storage...')
changes_csv = download_images_from_GCP(BUCKET_NAME, csv_name, local_annot_folder)
changes_images = download_images_from_GCP(BUCKET_NAME, bucket_image_folder, local_image_folder)
if not(changes_csv or changes_images):
    print('Local raw data folder is up to date')


Updating local raw data from Google Cloud Storage...
Downloaded 'image_data/10041.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10041.jpg'.
Downloaded 'image_data/10042.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10042.jpg'.
Downloaded 'image_data/10043.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10043.jpg'.
Downloaded 'image_data/10045.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10045.jpg'.
Downloaded 'image_data/10046.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10046.jpg'.
Downloaded 'image_data/10049.jpg' to '/home/klingenm/code/KlingenbergMarc/facetally/raw_data/image_data/10049.jpg'.


KeyboardInterrupt: 