# Download Storm Events Data
Author: Mark Bauer

In [1]:
import os
import urllib.parse
import urllib.request

import duckdb
import ibis
import requests
from bs4 import BeautifulSoup

In [11]:
# record the environment for reproducibility:
# -v prints the Python and IPython versions, -p prints the version of each listed package
%reload_ext watermark
%watermark -v -p urllib.request,requests,bs4,duckdb,ibis

Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

urllib.request: 3.8
requests      : 2.28.1
bs4           : 4.11.1
duckdb        : 0.10.0
ibis          : 3.2.0



Data: **Storm Events Database**  
Source: https://www.ncdc.noaa.gov/stormevents/

Description:
> The Storm Events Database contains the records used to create the official [NOAA Storm Data publication](https://www.ncdc.noaa.gov/IPS/sd/sd.html), documenting:
>
>a. The occurrence of storms and other significant weather phenomena having sufficient intensity to cause loss of life, injuries, significant property damage, and/or disruption to commerce;
>
>b. Rare, unusual, weather phenomena that generate media attention, such as snow flurries in South Florida or the San Diego coastal area; and
>
>c. Other significant meteorological events, such as record maximum or minimum temperatures or precipitation that occur in connection with another event.
>
>The database currently contains data from **January 1950 to February 2024**, as entered by NOAA's National Weather Service (NWS). Due to changes in the data collection and processing procedures over time, there are unique periods of record available depending on the event type. NCEI has performed data reformatting and standardization of event types but has not changed any data values for locations, fatalities, injuries, damage, narratives and any other event specific information. Please refer to the [Database Details](https://www.ncdc.noaa.gov/stormevents/details.jsp) page for more information.
    

In [3]:
# download data dictionary (PDF) and save it next to this notebook
url = 'https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/Storm-Data-Bulk-csv-Format.pdf'
filepath = 'Storm-Data-Bulk-csv-Format.pdf'

# urlretrieve copies the resource at `url` to the local path `filepath`
urllib.request.urlretrieve(url, filepath)

# list files in the current working directory to confirm the PDF was saved
%ls 

Storm-Data-Bulk-csv-Format.pdf  ibis-basics.ipynb
data/                           storm-events.db
download-data.ipynb             storm-events.db.wal
filter-data.ipynb


In [4]:
# retrieve links on webpage
# timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status stops here on an HTTP error instead of parsing an error page
r = requests.get(
    "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/",
    timeout=30,
)
r.raise_for_status()
data = r.text

# name the parser explicitly so results do not depend on which optional
# parsers happen to be installed (and to avoid GuessedAtParserWarning)
soup = BeautifulSoup(data, 'html.parser')

# href=True skips anchors without an href, so `links` only ever holds strings
links = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# preview links
links[:10]

['?C=N;O=D',
 '?C=M;O=A',
 '?C=S;O=A',
 '?C=D;O=A',
 '/pub/data/swdi/stormevents/',
 'Storm-Data-Bulk-csv-Format.pdf',
 'Storm-Data-Export-Format.pdf',
 'StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1952_c20210803.csv.gz']

In [5]:
# retrieve only StormEvents_details files
details = "StormEvents_details"

# `href and ...` guards against None entries (anchors without an href),
# which would otherwise raise TypeError on the substring test
links_details = [href for href in links if href and details in href]

# preview the first few matching file names
links_details[:5]

['StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1952_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1953_c20210803.csv.gz',
 'StormEvents_details-ftp_v1.0_d1954_c20210803.csv.gz']

In [6]:
# download files
path = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"

# make sure the target folder exists so the cell works on a fresh checkout
os.makedirs('data/raw/', exist_ok=True)

for link in links_details:
    # build the URL with urljoin: os.path.join is for filesystem paths and
    # would insert backslashes on Windows
    path_full = urllib.parse.urljoin(path, link)

    # fail loudly on HTTP errors rather than saving an error page as a .csv.gz
    response = requests.get(path_full, timeout=120)
    response.raise_for_status()

    # write the raw (still gzip-compressed) bytes under the same file name
    file_name = os.path.join('data/raw/', link)
    with open(file_name, 'wb') as file:
        file.write(response.content)

In [7]:
# count of regular files in data/raw/
# (in `ls -l` output, rows starting with "-" are regular files; this drops the "total" row and directories)
!ls -l data/raw/ | grep "^-" | wc -l

      75


In [8]:
# total disk usage of data/raw/ (-s: single summary line, -h: human-readable units)
!du -sh data/raw/

273M	data/raw/


In [9]:
# preview the first five entries in data/raw/
!ls data/raw/ | head -n 5

StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz
StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz
StormEvents_details-ftp_v1.0_d1952_c20210803.csv.gz
StormEvents_details-ftp_v1.0_d1953_c20210803.csv.gz
StormEvents_details-ftp_v1.0_d1954_c20210803.csv.gz
