In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from pathlib import Path
import pandas as pd
import requests
from prefect import flow, task
from prefect_gcp.cloud_storage import GcsBucket
from google.cloud import storage

In [None]:
@task(retries=3)
def fetch_folders(base_url: str, timeout: float = 60.0) -> list:
    """Return the names of the ``.zip`` entries listed on the page at ``base_url``.

    The page is downloaded and parsed with ``pandas.read_html``; the last
    table found on the page is used as the listing, and the values of its
    ``Name`` column that end in ``.zip`` are returned.

    Args:
        base_url: URL of the HTML page that carries the listing table.
        timeout: Seconds to wait for the server before giving up. This is
            passed to ``requests.get`` and bounds connect/read waits, so a
            stalled server raises instead of hanging the task forever.

    Returns:
        The matching entry names (not full URLs); empty if none match.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: The server did not answer within ``timeout``.
        ValueError: Propagated from ``pandas.read_html`` when it cannot
            extract a table from the page.
    """
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly on an error status so the task's retries apply to the real
    # cause rather than to a parse failure on an error page.
    response.raise_for_status()

    tables = pd.read_html(response.content)
    listing = tables[-1]

    # Bracket access avoids any clash between a column label and a DataFrame
    # attribute; na=False drops rows whose Name is missing.
    names = listing["Name"]
    return names[names.str.endswith(".zip", na=False)].tolist()

In [None]:
@task()
def write_gcs(
    base_url: str,
    folder: str,
    bucket_name: str = 'de_project_bucket',
    credentials_path: str = '../creds.json',
    timeout: float = 60.0,
) -> None:
    """Download ``folder`` from ``base_url`` and store it in a GCS bucket.

    The file is fetched over HTTP and its bytes are uploaded to the blob
    ``raw/<folder>`` in ``bucket_name``. The whole payload is held in memory
    between download and upload.

    Args:
        base_url: URL of the directory that contains the file; a trailing
            slash is optional.
        folder: Name of the file to fetch, relative to ``base_url``. It is
            also used as the blob name under ``raw/``.
        bucket_name: Name of the destination bucket.
        credentials_path: Path to the service-account JSON key used to build
            the storage client.
            NOTE(review): a key file next to the notebook is easy to commit
            by accident; consider supplying this from configuration.
        timeout: Seconds to wait for the download server before giving up.

    Raises:
        requests.exceptions.HTTPError: The download answered with a 4xx/5xx
            status; nothing is uploaded in that case.
        requests.exceptions.Timeout: The server did not answer within ``timeout``.
    """
    # Strip trailing slashes instead of slicing off the last character, so a
    # base URL without a trailing slash is not truncated.
    folder_url = f"{base_url.rstrip('/')}/{folder}"

    # Download first: a failed download should not cost a bucket lookup, and
    # an error page must never be uploaded in place of the real file.
    response = requests.get(folder_url, timeout=timeout)
    response.raise_for_status()

    storage_client = storage.Client.from_service_account_json(credentials_path)
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(f'raw/{folder}')
    blob.upload_from_string(response.content)

In [None]:
@flow(log_prints=True)
def noaa_to_gcs() -> None:
    """Copy each zip listed on the NOAA MRIP CSV page to GCS.

    The listing is read once with ``fetch_folders``; ``write_gcs`` is then
    called once per returned name, in listing order.
    """
    base_url = 'https://www.st.nmfs.noaa.gov/st1/recreational/MRIP_Survey_Data/CSV/'

    # One upload task run per archive name found on the listing page.
    for zip_name in fetch_folders(base_url):
        write_gcs(base_url, zip_name)

In [None]:
# Run the flow only when this file is executed directly. In a notebook kernel
# __name__ is '__main__', so the cell still runs the flow; importing the file
# (for example to reuse the tasks or the flow object) no longer triggers it.
if __name__ == '__main__':
    noaa_to_gcs()