# Unzipping Data on GCP

The CTX data (http://murray-lab.caltech.edu/CTX/V01/tiles/) is distributed as .zip archives, so we first ingest the zip files into a Google Cloud Storage (GCS) bucket on GCP and then unzip them there to extract the .tif files.

In [23]:
from google.cloud import storage
from zipfile import ZipFile
from zipfile import is_zipfile
import io

# 1. Unzipping the CTX data (sample of 2)

Data source: http://murray-lab.caltech.edu/CTX/V01/tiles/

In [24]:
# Adapted from: https://stackoverflow.com/questions/49541026/how-do-i-unzip-a-zip-file-in-google-cloud-storage
def zipextract_ctx(gs_zip_path, gs_unzip_folder):
    """
    Unzips a CTX zip archive stored on GCS and uploads the extracted
    .tif files into the folder specified.

    Please note: this function ignores everything that isn't a .tif file
    (the extension check is case-insensitive), and silently does nothing
    if the object is not a valid zip archive. Directory structure inside
    the archive is flattened: only the base name of each .tif is kept.

    Args:
        gs_zip_path: "gs://<bucket>/<path/to/archive.zip>" of the zip file.
        gs_unzip_folder: "gs://<bucket>/<folder>" to upload the extracted
            files to. Must be in the same bucket as gs_zip_path; a trailing
            "/" is tolerated.

    Raises:
        ValueError: if either path has no bucket component, or if the two
            paths point at different buckets.
    """
    bucketname, zipfilename_with_path = _split_gs_path(gs_zip_path)
    unzip_bucketname, unzip_folder = _split_gs_path(gs_unzip_folder)

    # Both paths must be in same bucket: the extracted files are uploaded
    # through the same bucket handle that the archive is downloaded from.
    if unzip_bucketname != bucketname:
        raise ValueError(
            f"gs_zip_path and gs_unzip_folder must be in the same bucket "
            f"(got {bucketname!r} and {unzip_bucketname!r})"
        )

    # Avoid "folder//file.tif" object names when the folder ends with "/".
    unzip_folder = unzip_folder.rstrip("/")

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucketname)

    # The whole archive is held in memory so ZipFile can seek within it.
    # NOTE(review): newer google-cloud-storage releases deprecate
    # download_as_string in favour of download_as_bytes - switch once the
    # installed library version is confirmed.
    zipbytes = io.BytesIO(bucket.blob(zipfilename_with_path).download_as_string())

    if not is_zipfile(zipbytes):
        return

    with ZipFile(zipbytes, 'r') as myzip:
        for contentfilename in myzip.namelist():
            # Taking .tif only. Match the real extension rather than the
            # last three characters, which would also accept e.g. "motif".
            if not contentfilename.lower().endswith(".tif"):
                continue

            contentfilename_save = contentfilename.split("/")[-1]
            if unzip_folder:
                destination = f"{unzip_folder}/{contentfilename_save}"
            else:
                destination = contentfilename_save

            bucket.blob(destination).upload_from_string(myzip.read(contentfilename))


def _split_gs_path(gs_path):
    """Split "gs://<bucket>/<object path>" into (bucket, object path)."""
    # maxsplit=3 keeps the object path intact even if it contains the
    # bucket name again further down.
    parts = gs_path.split("/", 3)
    if len(parts) < 3 or not parts[2]:
        raise ValueError(f"Not a valid GCS path: {gs_path!r}")
    object_path = parts[3] if len(parts) > 3 else ""
    return parts[2], object_path

In [39]:
# Collects the gs:// path of every object under the CTX tiles prefix.
# Objects that turn out not to be zip archives are skipped at unzip time.
# https://stackoverflow.com/questions/22398898/google-cloud-storage-python-any-way-to-list-obj-in-certain-folder-in-gcs
client = storage.Client()
gs_zip_paths = [
    # blob.name is the object path within the bucket; use it directly
    # instead of parsing it out of the blob's repr string.
    f"gs://esg-satelite-data-warehouse/{blob.name}"
    for blob in client.list_blobs(
        'esg-satelite-data-warehouse',
        prefix='mars/features/ctx_sample_2/raw/murray-lab.caltech.edu/CTX/V01/tiles',
    )
]

In [41]:
# Unzip every archive listed in gs_zip_paths into the "raw_unzipped" folder.
# The extraction function defined in this notebook is zipextract_ctx; the
# previous call to `zipextract` referenced a name that is not defined here.
for gs_zip_path in gs_zip_paths:
    zipextract_ctx(
        gs_zip_path=gs_zip_path,
        gs_unzip_folder="gs://esg-satelite-data-warehouse/mars/features/ctx_sample_2/raw_unzipped",
    )