# Unzipping Data on GCP

The CTX data (http://murray-lab.caltech.edu/CTX/V01/tiles/) is distributed as .zip archives, so we first ingest the zip files into a Google Cloud Storage bucket on GCP and then unzip them there to extract the .tif files they contain.

In [None]:
from google.cloud import storage
from zipfile import ZipFile
from zipfile import is_zipfile
import io

# 1. Unzipping the CTX data (sample of 2)

Data source: http://murray-lab.caltech.edu/CTX/V01/tiles/

In [None]:
def _split_gs_path(gs_path):
    """Split ``gs://<bucket>/<object path>`` into ``(bucket, object_path)``."""
    # maxsplit=3 keeps the object path intact even if it contains the bucket
    # name again (e.g. gs://data/raw/data/x.zip).
    parts = gs_path.split("/", 3)
    bucket_name = parts[2]
    object_path = parts[3] if len(parts) > 3 else ""
    return bucket_name, object_path


# Adapted from: https://stackoverflow.com/questions/49541026/how-do-i-unzip-a-zip-file-in-google-cloud-storage
def zipextract_ctx(gs_zip_path, gs_unzip_folder):
    """
    Unzips a CTX zip file stored on GCP and uploads the extracted .tif
    file(s) to the folder specified.

    The archive is downloaded fully into memory. Each member whose name ends
    in ".tif" (case-insensitive) is uploaded under ``gs_unzip_folder`` using
    only its base file name; any directory structure inside the archive is
    dropped.

    Please note: this function ignores everything that isn't a .tif file,
    and silently does nothing if the downloaded object is not a zip archive.

    Args:
        gs_zip_path: ``gs://<bucket>/<path to zip object>``.
        gs_unzip_folder: ``gs://<bucket>/<destination folder>``. Must be in
            the same bucket as ``gs_zip_path``. A trailing "/" is tolerated.

    Raises:
        ValueError: if ``gs_zip_path`` has no object path, or the two paths
            are not in the same bucket.
    """
    bucketname, zipfilename_with_path = _split_gs_path(gs_zip_path)
    unzip_bucketname, unzip_prefix = _split_gs_path(gs_unzip_folder)

    if not zipfilename_with_path:
        raise ValueError(f"No object path in gs_zip_path: {gs_zip_path!r}")
    # Both paths must be in same bucket
    if unzip_bucketname != bucketname:
        raise ValueError(
            "gs_zip_path and gs_unzip_folder must be in the same bucket "
            f"(got {bucketname!r} and {unzip_bucketname!r})"
        )

    # Avoid "folder//name.tif" object names when the folder ends with "/".
    unzip_prefix = unzip_prefix.rstrip("/")

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucketname)

    # NOTE(review): newer google-cloud-storage releases document
    # download_as_string as a deprecated alias of download_as_bytes; kept
    # here so older client versions keep working.
    zip_blob = bucket.blob(zipfilename_with_path)
    zipbytes = io.BytesIO(zip_blob.download_as_string())

    if not is_zipfile(zipbytes):
        return

    with ZipFile(zipbytes, 'r') as myzip:
        for contentfilename in myzip.namelist():
            # Taking .tif only
            if not contentfilename.lower().endswith(".tif"):
                continue
            # Keep just the file name, dropping folders inside the archive.
            contentfilename_save = contentfilename.split("/")[-1]
            if unzip_prefix:
                target_name = f"{unzip_prefix}/{contentfilename_save}"
            else:
                target_name = contentfilename_save
            tif_blob = bucket.blob(target_name)
            tif_blob.upload_from_string(myzip.read(contentfilename))

In [None]:
# Collects the gs:// path of every object stored under the sample tiles prefix
# (no filtering by extension is done here).
# https://stackoverflow.com/questions/22398898/google-cloud-storage-python-any-way-to-list-obj-in-certain-folder-in-gcs
gs_zip_paths = []
client = storage.Client()
for blob in client.list_blobs('esg-satelite-data-warehouse', prefix='mars/features/ctx_sample_2/raw/murray-lab.caltech.edu/CTX/V01/tiles'):
    # Use the blob's name attribute rather than parsing str(blob), which
    # breaks for object names containing ", " and depends on the repr format.
    gs_zip_path = f"gs://esg-satelite-data-warehouse/{blob.name}"
    gs_zip_paths.append(gs_zip_path)

In [None]:
# Extract every listed sample archive into one shared destination folder.
sample_unzip_folder = "gs://esg-satelite-data-warehouse/mars/features/ctx_sample_2/raw_unzipped"
for gs_zip_path in gs_zip_paths:
    zipextract_ctx(gs_zip_path=gs_zip_path, gs_unzip_folder=sample_unzip_folder)

# 2. Unzipping the CTX data for Tempe Terra

In [None]:
# Collects the gs:// path of every object stored under the Tempe Terra tiles
# prefix (no filtering by extension is done here).
# https://stackoverflow.com/questions/22398898/google-cloud-storage-python-any-way-to-list-obj-in-certain-folder-in-gcs
gs_zip_paths = []
client = storage.Client()
for blob in client.list_blobs('esg-satelite-data-warehouse', prefix='mars/features/ctx_tempe_terra/raw/murray-lab.caltech.edu/CTX/V01/tiles'):
    # Use the blob's name attribute rather than parsing str(blob), which
    # breaks for object names containing ", " and depends on the repr format.
    gs_zip_path = f"gs://esg-satelite-data-warehouse/{blob.name}"
    gs_zip_paths.append(gs_zip_path)

In [None]:
# Extract every listed Tempe Terra archive, reporting progress as we go.
# The total is taken from the list itself instead of a hard-coded 66, so the
# message stays correct if the number of listed objects changes.
total_files = len(gs_zip_paths)
for i, gs_zip_path in enumerate(gs_zip_paths, start=1):
    zipextract_ctx(
        gs_zip_path = gs_zip_path, 
        gs_unzip_folder = "gs://esg-satelite-data-warehouse/mars/features/ctx_tempe_terra/raw_unzipped_tif"
    )
    print(f"Successfully unzipped {i} of {total_files} files.")

Successfully unzipped 1 of 66 files.
Successfully unzipped 2 of 66 files.
Successfully unzipped 3 of 66 files.
Successfully unzipped 4 of 66 files.
Successfully unzipped 5 of 66 files.
Successfully unzipped 6 of 66 files.
Successfully unzipped 7 of 66 files.
Successfully unzipped 8 of 66 files.
Successfully unzipped 9 of 66 files.
Successfully unzipped 10 of 66 files.
Successfully unzipped 11 of 66 files.
Successfully unzipped 12 of 66 files.
Successfully unzipped 13 of 66 files.
Successfully unzipped 14 of 66 files.
Successfully unzipped 15 of 66 files.
Successfully unzipped 16 of 66 files.
Successfully unzipped 17 of 66 files.
Successfully unzipped 18 of 66 files.
Successfully unzipped 19 of 66 files.
Successfully unzipped 20 of 66 files.
Successfully unzipped 21 of 66 files.
Successfully unzipped 22 of 66 files.
Successfully unzipped 23 of 66 files.
Successfully unzipped 24 of 66 files.
Successfully unzipped 25 of 66 files.
Successfully unzipped 26 of 66 files.
Successfully unzipped