### AWS Tutorial 2

This tutorial will show you how to download the entire data record for the blended TROPOMI+GOSAT atmospheric methane satellite data product.

Nicholas Balasus\
8 February 2024

In [1]:
import boto3
import multiprocessing
import os

In [2]:
# Set up access to S3 using our credentials.

# Name of the S3 bucket that the rest of the notebook reads from.
bucket_name = "blended-tropomi-gosat-methane"

# Module-level S3 client; starts out empty and is filled in by initialize().
s3 = None


def initialize() -> None:
    """Create a boto3 S3 client and store it in the module-level ``s3``.

    Takes no arguments, so besides being called directly below it can be
    passed as the worker initializer of the multiprocessing pool used later
    in the notebook.
    """
    global s3
    s3 = boto3.client("s3")


# Build the client for the current (notebook) process.
initialize()

In [3]:
# Loop through the folder for each month and collect all S3 paths.
# Months are listed chronologically, from April 2018 through December 2023.
months = ([f"2018-{m:02d}" for m in range(4, 13)] +
          [f"{y}-{m:02d}" for y in range(2019, 2024) for m in range(1, 13)])

# A single list call returns at most 1,000 keys, so page through the results
# rather than reading only the first response for each month.
paginator = s3.get_paginator("list_objects_v2")

s3_paths = []
for month in months:
    prefix = f"data/{month}/"
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # "Contents" is absent from a page when no objects match the prefix.
        for obj in page.get("Contents", []):
            s3_paths.append(obj["Key"])
print(f"Going to download {len(s3_paths)} files.")

Going to download 29258 files.


In [4]:
# Download the files using multiple cores
storage_dir = "/n/holyscratch01/jacob_lab/nbalasus/test"  # change to your own directory
os.makedirs(storage_dir, exist_ok=True)

# Number of worker processes that download in parallel.
num_workers = 112


def download_from_s3(s3_path):
    """Download one object from the bucket into ``storage_dir``.

    Parameters
    ----------
    s3_path : str
        Object key within ``bucket_name`` (one of the entries in ``s3_paths``).

    Notes
    -----
    The local file is named after the last path component of the key, so the
    ``data/<month>/`` folder layout of the bucket is not reproduced locally.
    """
    file_name = os.path.basename(s3_path)
    local_file_path = os.path.join(storage_dir, file_name)
    s3.download_file(bucket_name, s3_path, local_file_path)


# Every worker runs initialize() on start-up, so each process builds its own
# S3 client. pool.map() blocks until all downloads have finished, so no
# explicit close()/join() is needed before the with-block exits.
with multiprocessing.Pool(num_workers, initialize) as pool:
    pool.map(download_from_s3, s3_paths)