# Use a GCP bucket to host images of cats

## Install libraries

In [None]:
# Install the `datasets` package; it provides `load_dataset`, which a later
# cell uses to fetch the cat images.
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m460.8/547.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-a

In [None]:
# Install the google-cloud-storage Python client (quiet mode); it is imported
# later as `from google.cloud import storage`.
!pip install -q google-cloud-storage
# Disabled: the recorded cell output shows apt could not locate this package.
# !apt-get -q install google-cloud-sdk

Reading package lists...
Building dependency tree...
Reading state information...
E: Unable to locate package google-cloud-sdk


## Authentication step

In [None]:
# Authenticate the Colab session with a Google account.
# NOTE(review): presumably this is what lets `storage.Client()` further down
# work without explicit credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Run the interactive gcloud setup. Per the recorded output it prompts for a
# configuration, an account and a cloud project.
# NOTE(review): the saved output of this cell contains the account e-mail and
# project IDs — consider clearing it before sharing the notebook.
!gcloud init


Welcome! This command will take you through the configuration of gcloud.

Settings from your current configuration [default] are:
component_manager:
  disable_update_check: 'True'
core:
  account: Louispaulet13@gmail.com

Pick configuration to use:
 [1] Re-initialize this configuration [default] with new settings 
 [2] Create a new configuration
Please enter your numeric choice:  1

Your current configuration has been set to: [default]

You can skip diagnostics next time by using the following flag:
  gcloud init --skip-diagnostics

Network diagnostic detects and fixes local network connection issues.
Reachability Check passed.
Network diagnostic passed (1/1 checks passed).

Choose the account you would like to use to perform operations for this 
configuration:
 [1] Louispaulet13@gmail.com
 [2] Log in with a new account
Please enter your numeric choice:  1

You are logged in as: [Louispaulet13@gmail.com].

Pick cloud project to use: 
 [1] gen-lang-client-0077006347
 [2] hatvp-backup
 [

## Simple text file upload test

In [None]:
# Create a small text file in the current working directory; it is used as an
# upload smoke test in the next cell.
! echo "this is a test file" > test_file.txt

In [None]:
from google.cloud import storage

def upload_to_bucket(blob_name, file_path, bucket_name, storage_client=None):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        blob_name: Name the object will have inside the bucket.
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        storage_client: Optional, already-constructed client to reuse. When
            omitted, a new ``storage.Client()`` is created for this call; no
            key file is passed, so the client relies on the ambient
            credentials. Passing one client to many calls avoids rebuilding
            it for every file.

    Returns:
        None. A confirmation line is printed once the upload call returns.
    """
    if storage_client is None:
        storage_client = storage.Client()
    # get_bucket looks the bucket up on the server, so a wrong bucket name
    # fails here, before any file data is sent.
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(file_path)
    print(f"File {file_path} uploaded to {blob_name}.")

# Smoke test: upload the text file created in the previous cell to the
# "pretty_cats" bucket.
# NOTE(review): the absolute path assumes the notebook's working directory is
# /content — confirm.
file_path = "/content/test_file.txt"  # Local file to upload
blob_name = "test_file.txt"  # Object name the file gets inside the bucket

upload_to_bucket(blob_name, file_path, "pretty_cats")


File /content/test_file.txt uploaded to test_file.txt.


## Save the cat pictures to a local directory

In [None]:
from datasets import load_dataset

# Load the 'train' split of the cats dataset. Per the recorded output it has
# 'image' and 'label' features and 9936 rows.
cats_ds = load_dataset('the-french-artist/unique_9k_cats', split='train')
cats_ds  # last expression: display the dataset summary

Downloading readme:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/528M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/504M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/431M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9936 [00:00<?, ? examples/s]

Dataset({
    features: ['image', 'label'],
    num_rows: 9936
})

In [None]:
# Convert the dataset to a pandas DataFrame. The export function defined in
# the next cell reads each row's 'image' entry and decodes its 'bytes' field.
cats_df = cats_ds.to_pandas()

In [None]:
import os
import pandas as pd
from PIL import Image
import io
from tqdm.auto import tqdm

def save_images_from_dataframe(df, folder_name):
    """Decode every image stored in ``df`` and write it to ``folder_name`` as a JPEG.

    Args:
        df: DataFrame with an ``'image'`` column whose entries expose the
            encoded image under the ``'bytes'`` key. The DataFrame index is
            used to build each file name (``image_<index>.jpg``).
        folder_name: Output directory; created if it does not exist yet.
    """
    # Modes Pillow's JPEG writer accepts directly. Anything else (e.g. RGBA
    # or palette images) is converted to RGB first, otherwise save() raises.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    # iterrows() is a generator with no length, so the total is passed
    # explicitly; without it tqdm can only show a bare iteration counter.
    for index, row in tqdm(df.iterrows(), total=len(df)):
        image_data = row['image']
        filename = os.path.join(folder_name, f'image_{index}.jpg')

        # The context manager closes the decoded image once it has been written.
        with Image.open(io.BytesIO(image_data['bytes'])) as image:
            to_save = image if image.mode in jpeg_modes else image.convert('RGB')
            to_save.save(filename)

# Export every image of cats_df (built in an earlier cell) into the local
# 'cat_export' folder.
save_images_from_dataframe(cats_df, 'cat_export')


0it [00:00, ?it/s]

## Upload local cat pics to remote bucket

### load list of file paths

In [None]:
from glob import glob

# Collect the paths of the exported JPEG files.
# NOTE(review): glob() returns paths in arbitrary order; the recorded preview
# below is not sorted.
glob_path = glob('cat_export/*.jpg')
glob_path[:5]  # preview the first five paths

['cat_export/image_1260.jpg',
 'cat_export/image_2434.jpg',
 'cat_export/image_8774.jpg',
 'cat_export/image_6774.jpg',
 'cat_export/image_2702.jpg']

### create list of filenames

In [None]:
# Derive the bare file name of every exported image; these names are used as
# the object names in the bucket. os.path.basename (os is imported in an
# earlier cell) handles the platform's path separator, unlike splitting on a
# hard-coded '/'.
filename_list = [os.path.basename(filepath) for filepath in glob_path]
filename_list[:5]  # preview the first five names

['image_1260.jpg',
 'image_2434.jpg',
 'image_8774.jpg',
 'image_6774.jpg',
 'image_2702.jpg']

In [None]:
# Upload every exported image under its bare file name. zip() has no length,
# so the total is passed explicitly to let tqdm render a real progress bar
# instead of a bare iteration counter.
for filepath, filename in tqdm(zip(glob_path, filename_list), total=len(glob_path)):
  upload_to_bucket(filename, filepath, "pretty_cats")

0it [00:00, ?it/s]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
File cat_export/image_4902.jpg uploaded to image_4902.jpg.
File cat_export/image_3663.jpg uploaded to image_3663.jpg.
File cat_export/image_3813.jpg uploaded to image_3813.jpg.
File cat_export/image_699.jpg uploaded to image_699.jpg.
File cat_export/image_416.jpg uploaded to image_416.jpg.
File cat_export/image_5994.jpg uploaded to image_5994.jpg.
File cat_export/image_2436.jpg uploaded to image_2436.jpg.
File cat_export/image_8512.jpg uploaded to image_8512.jpg.
File cat_export/image_5381.jpg uploaded to image_5381.jpg.
File cat_export/image_7402.jpg uploaded to image_7402.jpg.
File cat_export/image_5770.jpg uploaded to image_5770.jpg.
File cat_export/image_7289.jpg uploaded to image_7289.jpg.
File cat_export/image_3690.jpg uploaded to image_3690.jpg.
File cat_export/image_6048.jpg uploaded to image_6048.jpg.
File cat_export/image_5514.jpg uploaded to image_5514.jpg.
File cat_export/image_2971.