### Get list of NetCDF files in dataset directory on NOAA web site

In [1]:
import ipywidgets as widgets
import os.path
import pandas as pd
import requests

### Define the URL of the NOAA directory containing the data files to download, and the default local destination directory

In [2]:
# Remote directory whose listing is parsed for the data files to download.
dataset_url = (
    'http://www1.ncdc.noaa.gov/pub/data/paleo/treering/'
    'reconstructions/northamerica/usa/bocinsky2016/'
)

# Initial value shown in the "Download to" text box; it can be changed there
# before the download cell is run.
default_destination_dir = '/projects/skope/original/paleocar/'

### Get the names of files corresponding to each of the two data set variables

In [3]:
# Load the first HTML table found at the dataset URL into a dataframe.
table = pd.read_html(dataset_url)[0]

# Discard table rows with nulls in column 1.
valid_rows = table.dropna(subset=[1])

# Column 1 holds the file names.
file_names = valid_rows[1]

# Select the files belonging to the GDD and PPT variables. regex=False makes
# this a literal substring match; with the default (regex=True) the '.' in
# 'GDD.nc4' would match any character, not just a dot.
gdd_file_names = file_names[file_names.str.contains('GDD.nc4', regex=False)]
ppt_file_names = file_names[file_names.str.contains('PPT.nc4', regex=False)]

gdd_total_file_count = len(gdd_file_names)
ppt_total_file_count = len(ppt_file_names)

# Sanity check: the listing is expected to contain 155 files per variable.
# A mismatch means the remote listing (or its HTML layout) has changed.
expected_file_count = 155
assert gdd_total_file_count == expected_file_count, (
    "expected %d GDD files, found %d" % (expected_file_count, gdd_total_file_count))
assert ppt_total_file_count == expected_file_count, (
    "expected %d PPT files, found %d" % (expected_file_count, ppt_total_file_count))

### Specify the download directory and the number of files to download for each variable

In [4]:
def _file_count_slider(label, total_count):
    """Return an IntSlider running from 1 to total_count, initially set to total_count."""
    return widgets.IntSlider(description=label, min=1, max=total_count, value=total_count)


# Text box holding the directory the files are downloaded into.
download_to_directory = widgets.Text(value=default_destination_dir, description="Download to")

# One slider per variable selecting how many of its files to download.
gdd_selected_file_count = _file_count_slider("GDD files", gdd_total_file_count)
ppt_selected_file_count = _file_count_slider("PPT files", ppt_total_file_count)

# Disabled button used only as a status indicator; the download cell
# rewrites its description as the download progresses.
download_status = widgets.Button(disabled=True, description='Download not started')

display(download_to_directory, gdd_selected_file_count, ppt_selected_file_count, download_status)

Text(value='/projects/skope/original/paleocar/', description='Download to')

IntSlider(value=155, description='GDD files', max=155, min=1)

IntSlider(value=155, description='PPT files', max=155, min=1)

Button(description='Download not started', disabled=True, style=ButtonStyle())

### Download the selected number of files for each variable of the dataset

In [5]:
# Read the destination directory from the "Download to" text widget; the value
# is captured once, at the time this cell is executed.
destination_dir = download_to_directory.value

def download_file(source_url, destination_path):
    """Download source_url and store its body at destination_path.

    The body is streamed to a temporary '<destination_path>.part' file and
    only renamed to destination_path once the transfer has completed, so an
    interrupted download never leaves a truncated file under the final name
    (which a later existence check would mistake for a finished download).

    Raises:
        requests.HTTPError: if the server answers with any status other than 200.
        requests.RequestException: on connection problems or timeouts.
        OSError: if the file cannot be written or renamed.
    """
    partial_path = destination_path + '.part'
    # stream=True avoids holding the whole file in memory; the timeout applies
    # to connecting and to each wait for data, not to the total transfer time.
    with requests.get(source_url, stream=True, timeout=60) as response:
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if response.status_code != 200:
            raise requests.HTTPError(
                "GET %s returned HTTP status %d" % (source_url, response.status_code),
                response=response)
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    # Atomic on the same filesystem: the final name appears only when complete.
    os.replace(partial_path, destination_path)

def download_file_if_needed(base_url, file_name, destination_dir):
    """Download base_url + file_name into destination_dir unless it is already there.

    Args:
        base_url: URL prefix that file_name is appended to (no separator is added).
        file_name: name of the remote file, also used as the local file name.
        destination_dir: local directory for the file; created if missing.

    Prints one status line per call saying whether the file was skipped or
    downloaded. A file counts as already downloaded if a path with that name
    exists in destination_dir; its contents are not checked.
    """
    source_url = base_url + file_name
    # Create the target directory on first use so that opening the output
    # file does not fail with FileNotFoundError.
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    # os.path.join avoids a doubled separator when destination_dir already
    # ends with one.
    destination_path = os.path.join(destination_dir, file_name)
    if os.path.exists(destination_path):
        print(file_name, "already downloaded")
    else:
        download_file(source_url, destination_path)
        print(file_name, "finished downloading")

def download_files(file_list, destination_dir):
    """Fetch every file named in file_list into destination_dir, then print a total.

    Each name is passed to download_file_if_needed together with the
    notebook-level dataset_url. The printed total counts every name that was
    processed, including files that were skipped because they already existed.

    Args:
        file_list: iterable of file names; may be empty.
        destination_dir: local directory the files are stored in.
    """
    # Initialised up front so an empty file_list reports 0 instead of raising
    # UnboundLocalError on the loop variable.
    file_count = 0
    for file_count, file_name in enumerate(file_list, start=1):
        download_file_if_needed(dataset_url, file_name, destination_dir)
    print("\nTotal of %d files downloaded\n" % file_count)

# Run the downloads, mirroring progress in the status widget.
download_status.description = 'Downloading files'
try:
    # .iloc makes the "first N files" selection explicitly positional,
    # independent of the index labels left over from the filtered table.
    download_files(gdd_file_names.iloc[:gdd_selected_file_count.value], destination_dir)
    download_files(ppt_file_names.iloc[:ppt_selected_file_count.value], destination_dir)
except BaseException:
    # Also covers a kernel interrupt; without this the widget would keep
    # showing 'Downloading files' after a failure. The error is re-raised
    # so the cell still fails visibly.
    download_status.description = 'Download failed'
    raise
else:
    download_status.description = 'Downloads complete'


103W31N_GDD.nc4 already downloaded
103W32N_GDD.nc4 already downloaded
103W33N_GDD.nc4 already downloaded
103W34N_GDD.nc4 already downloaded
103W35N_GDD.nc4 already downloaded
103W36N_GDD.nc4 already downloaded
103W37N_GDD.nc4 already downloaded
103W38N_GDD.nc4 already downloaded
103W39N_GDD.nc4 already downloaded
103W40N_GDD.nc4 already downloaded
103W41N_GDD.nc4 already downloaded
103W42N_GDD.nc4 already downloaded
104W31N_GDD.nc4 already downloaded
104W32N_GDD.nc4 already downloaded
104W33N_GDD.nc4 already downloaded
104W34N_GDD.nc4 already downloaded
104W35N_GDD.nc4 already downloaded
104W36N_GDD.nc4 already downloaded
104W37N_GDD.nc4 already downloaded
104W38N_GDD.nc4 already downloaded
104W39N_GDD.nc4 already downloaded
104W40N_GDD.nc4 already downloaded
104W41N_GDD.nc4 already downloaded
104W42N_GDD.nc4 already downloaded
105W31N_GDD.nc4 already downloaded
105W32N_GDD.nc4 already downloaded
105W33N_GDD.nc4 already downloaded
105W34N_GDD.nc4 already downloaded
105W35N_GDD.nc4 alre