In [1]:
import os
import pandas as pd
import glob
import requests

# Configuration for downloading the Storm Events CSV archives.

# Remote directory the archives are fetched from.
NCEI_URL = 'https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles'

# Local data-lake roots for this dataset.
BRONZE_PATH = '/usr/datalake/bronze/stormevents'
SILVER_PATH = '/usr/datalake/silver/stormevents'

# CSV index (under the silver root) that lists the archive file names.
CSV_FILENAME = '/'.join([SILVER_PATH, 'docs', 'csvfiles.csv'])

In [2]:
# Create the bronze download directory (and any missing parents); a no-op if
# it is already there.
os.makedirs(name=BRONZE_PATH + '/csvfiles', exist_ok=True)

In [3]:
# Read the file index: one row per remote archive, with at least the
# `filename` and `year` columns that the download loop relies on.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its row index -- confirm with whatever produces it.
index = pd.read_csv(filepath_or_buffer=CSV_FILENAME)
index.head(n=5)

Unnamed: 0,filename,year,updated
0,StormEvents_details-ftp_v1.0_d1950_c20250520.c...,1950,2025-05-20
1,StormEvents_details-ftp_v1.0_d1951_c20250520.c...,1951,2025-05-20
2,StormEvents_details-ftp_v1.0_d1952_c20250520.c...,1952,2025-05-20
3,StormEvents_details-ftp_v1.0_d1953_c20250520.c...,1953,2025-05-20
4,StormEvents_details-ftp_v1.0_d1954_c20250520.c...,1954,2025-05-20


In [4]:
# Download every archive listed in `index` (years 1990 and later) from NCEI_URL
# into the bronze `csvfiles` directory. The cell is safe to re-run: archives
# that are already on disk are skipped, and an archive only appears under its
# final name once it has been written completely.
DOWNLOAD_TIMEOUT = (10, 120)  # (connect, read) seconds; requests never times out by default
DOWNLOAD_CHUNK_SIZE = 1 << 20  # stream in 1 MiB pieces rather than buffering the whole body

for _, csvfile in index.iterrows():
    filename = csvfile['filename']
    if csvfile['year'] < 1990:
        print(f'Skipping {filename}. Year earlier than 1990.')
        continue

    filename_local = f'{BRONZE_PATH}/csvfiles/{filename}'
    if os.path.exists(filename_local):
        print(f'Skipping {filename}. File already exists.')
        continue

    # Everything up to and including the last "_c" identifies the dataset and
    # year; what follows is a date stamp (it matches the `updated` column in the
    # rows shown above). Deriving the prefix from that marker instead of a fixed
    # 36-character slice keeps the check correct for names of other lengths,
    # where the slice could end inside the year and match unrelated files.
    stem, marker, _ = filename.rpartition('_c')
    prefix = stem + marker if marker else filename[:36]
    # Escape the literal part so glob metacharacters in it are not interpreted.
    pattern = glob.escape(f'{BRONZE_PATH}/csvfiles/{prefix}') + '*.csv.gz'
    if glob.glob(pattern):
        print(f'WARNING: Found older version of {filename}. Delete the older version.')
        continue

    url = f'{NCEI_URL}/{filename}'
    # Write to a side file first: a failed or interrupted download must not
    # leave a truncated archive that the "already exists" check would accept.
    filename_partial = f'{filename_local}.part'
    try:
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(filename_partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(filename_partial, filename_local)
    finally:
        # After a successful rename the side file is gone; on any error remove
        # the leftover and let the exception propagate.
        if os.path.exists(filename_partial):
            os.remove(filename_partial)

    print(f"Downloaded {filename}.")


Skipping StormEvents_details-ftp_v1.0_d1950_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1951_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1952_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1953_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1954_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1955_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1956_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1957_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1958_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1959_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1960_c20250520.csv.gz. Year earlier than 1990.
Skipping StormEvents_details-ftp_v1.0_d1961_c20250520.