In [1]:
import pandas as pd
import requests
from prefect import flow, task
from google.cloud import storage


In [2]:
@task(retries=3)
def fetch_folders(base_url: str) -> list:
    """Return the names of the ``.zip`` entries listed on the page at ``base_url``.

    The page is downloaded and parsed with ``pandas.read_html``. The last
    table found on the page is used, and its ``Name`` column is filtered to
    the values that end in ``.zip``.

    Args:
        base_url: URL of the HTML page to download.

    Returns:
        The ``Name`` values ending in ``.zip``; rows whose name is missing
        are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout requests can block forever on a stalled connection.
    # raise_for_status turns HTTP errors into exceptions so the task's
    # retries apply to them instead of an error page being parsed.
    response = requests.get(base_url, timeout=60)
    response.raise_for_status()

    tables = pd.read_html(response.content)
    listing = tables[-1]  # the last table on the page is the one that is used
    is_zip = listing["Name"].str.endswith(".zip", na=False)

    return listing.loc[is_zip, "Name"].tolist()

In [3]:
@task()
def write_gcs(base_url: str, folder: str, bucket) -> None:
    """Download one zip file and upload its bytes to a GCS bucket.

    The file is fetched from ``<base_url>/<folder>`` and stored in the bucket
    under the object name ``zip/<folder>``.

    Args:
        base_url: URL of the directory holding the zip; a trailing slash is
            optional.
        folder: File name of the zip, relative to ``base_url``.
        bucket: Object providing ``blob(name)``; the flow in this file passes
            the result of ``storage.Client.get_bucket``.

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # rstrip rather than base_url[:-1]: slicing would drop a real character
    # from a URL that does not end with "/".
    folder_url = f"{base_url.rstrip('/')}/{folder}"

    response = requests.get(folder_url, timeout=60)
    # Fail loudly instead of storing an HTML error page under a .zip name.
    response.raise_for_status()

    blob = bucket.blob(f"zip/{folder}")
    # upload_from_string defaults to text/plain; label the object as a zip.
    blob.upload_from_string(response.content, content_type="application/zip")

In [4]:
@flow(log_prints=True)
def noaa_to_gcs(
    base_url: str = "https://www.st.nmfs.noaa.gov/st1/recreational/MRIP_Survey_Data/CSV/",
    bucket_name: str = "de_project_bucket",
    creds_path: str = "../creds.json",
) -> None:
    """Copy every zip listed at ``base_url`` into a GCS bucket.

    Calling the flow with no arguments uses the original source URL, bucket
    and credentials file.

    Args:
        base_url: Page listing the zip files; also the prefix they are
            downloaded from.
        bucket_name: Name of the destination bucket.
        creds_path: Path to the service-account JSON key used to build the
            storage client.
    """
    # Build the storage client from the service-account key and open the bucket.
    storage_client = storage.Client.from_service_account_json(creds_path)
    bucket = storage_client.get_bucket(bucket_name)

    lszip = fetch_folders(base_url)

    # One task run per zip file, executed in listing order.
    for folder in lszip:
        write_gcs(base_url, folder, bucket)

In [5]:
# Run the flow when this cell executes. The `if __name__ == '__main__':` guard
# is left commented out, so the call below runs unconditionally.
noaa_to_gcs()


[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `list`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Un