# Python code to download the NOAA ERSST version 5 monthly SST dataset, uses `requests`

In [1]:
%matplotlib inline

In [2]:
import sys

In [3]:
print(sys.executable)

/home/nicolasf/anaconda3/envs/climlab/bin/python


In [4]:
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta

In [5]:
import pathlib
import requests

### a small function that, given a URL (http) and an extension, returns the list of URLs of the files linked from that page whose name ends with the extension; uses `requests` and `BeautifulSoup`

In [6]:
def get_url_paths(url, ext='', params=None):
    """
    Return the URLs of all links found on an HTML index page that end with `ext`.

    Parameters
    ----------
    url : str
        Address of the HTML page to scan. Each matching link target is appended
        to this string as-is, so it should end with a '/'.
    ext : str, optional
        Suffix a link target must end with to be kept. The default '' keeps
        every link that has an `href`.
    params : dict, optional
        Query parameters forwarded to `requests.get`.

    Returns
    -------
    list of str
        `url + href` for every `<a>` element whose `href` ends with `ext`,
        in the order they appear on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, params=params)
    # raises for 4xx / 5xx answers, does nothing otherwise
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # anchors without an href attribute yield None and must be skipped,
    # otherwise calling .endswith on them raises AttributeError
    hrefs = (node.get('href') for node in soup.find_all('a'))
    return [url + href for href in hrefs if href is not None and href.endswith(ext)]

### set the local path where the data is / will be downloaded 

In [7]:
local_path = pathlib.Path('./data')

In [8]:
# create the download directory (and any missing parents); exist_ok makes the
# cell safe to re-run and avoids the race between checking and creating
local_path.mkdir(parents=True, exist_ok=True)

### set the URL and the location of the datasets on the NOAA server 

In [9]:
base_url = 'https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/'

### first get the list of netcdf files currently available 

In [10]:
ext = 'nc'

In [11]:
remote_files = get_url_paths(base_url, ext)

In [12]:
remote_files[0]

'https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.185401.nc'

In [13]:
remote_files[-1]

'https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202003.nc'

In [14]:
# keep only the file name, i.e. whatever follows the last '/' of each URL
remote_files = [remote_url.rsplit('/', 1)[-1] for remote_url in remote_files]

### check that the most recent file is not completely outdated

In [15]:
# file names look like 'ersst.v5.YYYYMM.nc': the second-to-last dot-separated
# field is the year-month stamp, which is anchored to the first day of the month
# NOTE(review): relies on the last entry of the listing being the most recent file
last_stamp = remote_files[-1].split('.')[-2]
last_avail = datetime.strptime(f"{last_stamp}01", "%Y%m%d")

In [16]:
# number of whole days between now (UTC) and the first day of the last available month;
# datetime.utcnow() is deprecated since Python 3.12, so take an aware UTC "now" and drop
# its tzinfo so that it can still be subtracted from the naive `last_avail`
lag = (datetime.now(timezone.utc).replace(tzinfo=None) - last_avail).days

In [17]:
# single source of truth for the staleness threshold, so that the test and the
# message printed below cannot drift apart
max_lag_days = 70

if lag >= max_lag_days:
    print(f"Warning, the last available date on the NOAA server ({last_avail:%Y-%m}) is more than {max_lag_days} days old ({lag} days ...)")

### Now get the list of local files 

In [18]:
# paths of the ERSST netcdf files already present in the local directory
local_files = [nc_path for nc_path in local_path.glob("ersst.*.nc")]

In [19]:
# keep only the file names so they can be compared with the remote listing
local_files = [nc_path.name for nc_path in local_files]

### missing files list 

In [20]:
# files listed on the server that are not present locally (unordered at this point)
missing_files = list(set(remote_files).difference(local_files))

In [21]:
missing_files.sort()

In [22]:
missing_files[0]

'ersst.v5.185408.nc'

In [23]:
missing_files[-1]

'ersst.v5.202003.nc'

In [24]:
len(missing_files)

1988

### Now loop over the missing files, and download from the NOAA server 

In [25]:
# Download every file that is available remotely but missing locally.
# Each file is first written under a temporary '.part' name and renamed only once
# it is complete, so an interrupted run never leaves a truncated '.nc' file that
# the "ersst.*.nc" glob would later mistake for a finished download.
for filename in missing_files:
    print(f"now trying to download {filename} from {base_url}")
    # base_url already ends with '/': strip it to avoid a '//' in the request URL
    file_url = f"{base_url.rstrip('/')}/{filename}"
    target = local_path.joinpath(filename)
    partial = target.with_name(f"{target.name}.part")
    try:
        # without a timeout a stalled connection would block the loop forever
        r = requests.get(file_url, timeout=60)
    except requests.RequestException as err:
        # a network problem on one file should not abort the remaining downloads
        print(f"unable to access {filename} at {base_url} ({err})")
        continue
    if r.status_code == 200:
        try:
            with open(partial, 'wb') as f:
                f.write(r.content)
            # rename to the final name only after the whole content is on disk
            partial.replace(target)
        except OSError as err:
            print(f"{filename} was available remotely but could not be saved locally in {str(local_path)} ({err})")
            # do not leave a half-written temporary file behind
            if partial.exists():
                partial.unlink()
        else:
            print(f"{filename} successfully saved in {str(local_path)}")
    else:
        print(f"unable to access {filename} at {base_url}")

now trying to download ersst.v5.185408.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185408.nc successfully saved in data
now trying to download ersst.v5.185409.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185409.nc successfully saved in data
now trying to download ersst.v5.185410.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185410.nc successfully saved in data
now trying to download ersst.v5.185411.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185411.nc successfully saved in data
now trying to download ersst.v5.185412.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185412.nc successfully saved in data
now trying to download ersst.v5.185501.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netcdf/
ersst.v5.185501.nc successfully saved in data
now trying to download ersst.v5.185502.nc from https://www1.ncdc.noaa.gov/pub/data/cmb/ersst/v5/netc