In [1]:
import urllib.request
import pandas as pd
import requests
from bs4 import BeautifulSoup
import gzip
import os
from time import sleep

# CSV COLUMN STRUCTURE
NOAA documents the column layout of the Storm Events bulk CSV files here:
https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/Storm-Data-Bulk-csv-Format.pdf

In [2]:
# First, fetch the directory listing so we know which files are available.

url = 'https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/'
# A timeout keeps the notebook from hanging indefinitely if the server is
# unreachable, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the listing.
html = requests.get(url, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'html.parser')

# The file names are read from the <a href> elements inside the page's table.
listing = soup.find('table')
if listing is None:
    raise RuntimeError(f"No file listing table found at {url}")

# Keep only the "details" files (the ones we actually want).
# href=True skips any anchor that has no href attribute at all.
details_prefix = 'StormEvents_details-ftp_v1.0_'
links = [
    anchor['href']
    for anchor in listing.find_all('a', href=True)
    if anchor['href'].startswith(details_prefix)
]

In [None]:
print("\n\nStarting to fetch files. This can take a couple minutes.")
# Reset up front so a failed run never leaves a stale frame from an earlier execution.
out_df = None
holding = 'data'
# Downloads are stored in a local 'data' folder; create it if it's not already there.
os.makedirs(holding, exist_ok=True)
total_files = len(links)
if total_files == 0:
    # Fail with a clear message instead of an AttributeError further down.
    raise RuntimeError("No StormEvents details files were found to download.")

# Download every file we identified, parse it, and collect one DataFrame per file.
# The frames are concatenated once after the loop; concatenating inside the loop
# would re-copy the accumulated data on every iteration.
frames = []
for idx, file_path in enumerate(links, start=1):
    local_file_path = os.path.join(holding, file_path)
    # Build the target url from the file name extracted earlier and fetch it.
    target = url + file_path
    urllib.request.urlretrieve(target, local_file_path)
    # It's a gzip'd CSV, so decompress it while reading.
    with gzip.open(local_file_path, 'rb') as gz_file:
        frames.append(pd.read_csv(gz_file))
    # Be kind: don't pummel a free data source with back-to-back requests.
    sleep(1)
    # idx is 1-based, so the message reports the number of files actually finished;
    # the final file is always reported as well.
    if idx % 5 == 0 or idx == total_files:
        print(f"{idx}/{total_files} complete")

# reset_index() gives the combined frame a fresh 0..n-1 index; the previous
# per-file row numbers are kept as an 'index' column.
out_df = pd.concat(frames).reset_index()
# Drop the per-file frames so only the combined copy stays in memory.
del frames
print('\n','='*(100),'\nFetching complete! Reducing master file\n','='*(100),'\n')

# Columns carried over into the reduced "working" file.
working_columns = [
    'BEGIN_YEARMONTH',      # event start, YYYYMM
    'BEGIN_DAY',            # event start, DD
    'END_YEARMONTH',        # event end, YYYYMM
    'END_DAY',              # event end, DD
    'EPISODE_ID',           # NWS ID for the storm
    'EVENT_ID',             # NWS ID for the event (distinct from the episode, which is mostly for storm narratives)
    'STATE',                # state name, spelled out in ALL CAPS
    'STATE_FIPS',           # Federal Information Processing Standard ID of the state (handy for joins)
    'EVENT_TYPE',           # other event types may be worth considering, depending on the project
    'CZ_TYPE',              # C: County/Parish, Z: NWS Public Forecast Zone, M: Marine
    'CZ_FIPS',              # FIPS-style ID for the local region (e.g. Adams County)
    'CZ_NAME',              # name of the CZ
    'INJURIES_DIRECT',      # injuries directly attributed to the event
    'INJURIES_INDIRECT',    # the direct/indirect distinction is, annoyingly, undefined
    'DEATHS_DIRECT',        # deaths directly attributed to the event
    'DEATHS_INDIRECT',      # same idea as the indirect injuries
    'DAMAGE_PROPERTY',      # estimated property damage; the suffix units are inconsistent
    'TOR_F_SCALE',          # Enhanced Fujita Scale of the tornado
    'TOR_LENGTH',           # length of the tornado segment on the ground (miles, nearest tenth)
    'TOR_WIDTH',            # width of the tornado on the ground (feet)
    'BEGIN_LAT',            # latitude where the damage path begins
    'BEGIN_LON',            # longitude where the damage path begins
    'EPISODE_NARRATIVE',    # NWS narrative of the storm that produced the tornado
    'EVENT_NARRATIVE',      # NWS narrative of the tornado itself
]

# Rows whose event type is one of the two tornado labels.
is_tornado = out_df['EVENT_TYPE'].isin(['Tornado','TORNADOES'])
# Horizontal rule reused by the status banners below.
rule = '='*(100)

# Working file: tornado rows only, restricted to the columns listed above.
out_df.loc[is_tornado, working_columns].to_csv('StormEvents_details_WORKING.csv')
print('\n',rule,'\nWorking file created! Creating Master copy.\n',rule,'\n')

# Master file: every row and every column that was fetched.
out_df.to_csv('StormEvents_details_MASTER.csv')
print('\n',rule,'\nMaster copy created! Enjoy your data!\n',rule,'\n')

# Drop the reference to the combined frame now that both files are written.
out_df = None
