In [1]:
import pandas as pd
from tempfile import TemporaryDirectory
from pathlib import Path
import requests
import logging
import boto3
from botocore.exceptions import ClientError
import os

Constants

In [2]:
# Source directory listings. File names are appended directly to these, so each ends with '/'.
gedi_shot_location_url = (
    'https://www.cl.cam.ac.uk/research/eeg/4c/data/'
    'europe_coincident_shots_locations/'
)
gedi_shot_data_url = (
    'https://www.cl.cam.ac.uk/research/eeg/4c/data/'
    'europe_coincident_shots_data/'
)

# Destination bucket and key prefixes for the uploaded copies.
s3_bucket = 'gfw2-data'
gedi_shot_location_s3_dir = (
    'climate/European_height_carbon_model/gedi_coincident_shot/data/shots/'
    'europe_coincident_shots_locations/'
)
gedi_shot_data_s3_dir = (
    'climate/European_height_carbon_model/gedi_coincident_shot/data/shots/'
    'europe_coincident_shots_data/'
)

Functions

In [3]:
def download_file(url, local_filename, timeout=60, chunk_size=8192):
    """Stream the resource at ``url`` into the file ``local_filename``.

    The body is written chunk by chunk, so the whole download is never held
    in memory at once.

    Args:
        url: Address to fetch with ``requests.get``.
        local_filename: Destination path, opened in binary write mode (an
            existing file at that path is overwritten).
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without a
            timeout a stalled server would block the notebook indefinitely.
        chunk_size: Chunk size passed to ``iter_content``.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        Whatever ``raise_for_status`` raises for an error status, plus any
        connection/timeout error from ``requests`` and any ``OSError`` from
        opening or writing the destination file.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the local file if the server returned an error status
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    return local_filename

def upload_file(file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket.

    Environment note:
    This does not work with the newest version of boto3. You need to install conda-forge::boto3=1.34.33. Changes:
    ca-certificates    conda-forge::ca-certificates-2024.2.2~ --> pkgs/main::ca-certificates-2023.12.12-h06a4308_0
    certifi            conda-forge/noarch::certifi-2024.2.2-~ --> pkgs/main/linux-64::certifi-2024.2.2-py38h06a4308_0
    urllib3            conda-forge/noarch::urllib3-2.2.0-pyh~ --> pkgs/main/linux-64::urllib3-1.26.18-py38h06a4308_0
    Documentation for this known issue found here: https://github.com/conda-forge/awscli-feedstock/issues/828

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Destination key. Defaults to the base name of ``file_name``.

    Returns:
        True if the upload completed, False if boto3 reported a failure
        (the error is logged with ``logging.error``).
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    # The managed transfer reports failed uploads as S3UploadFailedError, which is
    # not a ClientError subclass, so both must be caught to honour the False return.
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True

Get Download URLs

In [6]:
# Read the file names for all GEDI shot locations from the directory listing page.
# Note: despite the *_urls names, these are bare file names; the download cells
# prepend the base URL.
location_table = pd.read_html(gedi_shot_location_url)[0]
location_urls = location_table.Name[3:1598].to_list()  # Skipping parent dir, success, and na rows

# Read the file names for the GEDI shot data from its directory listing page.
data_table = pd.read_html(gedi_shot_data_url)[0]
# The full range is Name[3:203] (skipping parent dir, success, and na rows).
# This cell only takes rows 131-202.
# NOTE(review): presumably resuming an interrupted transfer -- confirm, and switch
# back to 3:203 when running from scratch.
data_urls = data_table.Name[131:203].to_list()  # list, consistent with location_urls

# S3 client used by the download/upload cells
s3 = boto3.client('s3')

Download GEDI Shot Locations

In [5]:
#Create temp dir, download gedi shot location files, and upload to s3
with TemporaryDirectory() as location_temp_dir:
    for filename in location_urls:
        file_url = f"{gedi_shot_location_url}{filename}"
        local_path = str(Path(location_temp_dir)/filename)
        # S3 keys always use '/', so join as a string instead of an OS-dependent Path
        # (Path would emit backslashes on Windows).
        s3_path = f"{gedi_shot_location_s3_dir.rstrip('/')}/{filename}"
        download_file(file_url, local_path)
        s3.upload_file(local_path, s3_bucket, s3_path)

    # Sanity check: count the parquet files downloaded, before the temp dir is removed
    print(len(list(Path(location_temp_dir).glob('*.parquet'))))

1595


Download GEDI Shot Data

In [7]:
#Create temp dir, download gedi shot data files, and upload to s3
with TemporaryDirectory() as data_temp_dir:
    for filename in data_urls:
        file_url = f"{gedi_shot_data_url}{filename}"
        local_path = str(Path(data_temp_dir)/filename)
        # S3 keys always use '/', so join as a string instead of an OS-dependent Path
        # (Path would emit backslashes on Windows).
        s3_path = f"{gedi_shot_data_s3_dir.rstrip('/')}/{filename}"
        download_file(file_url, local_path)
        s3.upload_file(local_path, s3_bucket, s3_path)

    # Sanity check: count the parquet files downloaded, before the temp dir is removed
    print(len(list(Path(data_temp_dir).glob('*.parquet'))))

72
