# Common functions and models

This notebook contains functions and models that are used in multiple other sea level rise notebooks. To avoid repeating these functions in all notebooks, they are defined here.

For an example of how to run this notebook, see: extended-data-sources.ipynb.

Currently the functions present in this notebook are:
1. A set of functions that together retrieve tide gauge records of the sea level.
2. The linear statistical model to fit through a measured sea level series.

In [1]:
# this is a list of packages that are used in this notebook
# these come with python
import io
import zipfile
import functools
import bisect
import datetime
import re

# you can install these packages using pip or anaconda
# (requests netCDF4 numpy pandas bokeh pyproj statsmodels)

# for downloading
import requests
import netCDF4

# computation libraries
import numpy as np
import pandas

# statistics
import statsmodels.api as sm

We first define a number of variables (global) with the location of content to download:

In [2]:
# Define the urls for the three PSMSL datasets
# The keys are the dataset names accepted by get_station_data (dataset_name);
# each value is the zip archive that is downloaded for that dataset.
urls = {
    'met_monthly': 'http://www.psmsl.org/data/obtaining/met.monthly.data/met_monthly.zip',
    'rlr_monthly': 'http://www.psmsl.org/data/obtaining/rlr.monthly.data/rlr_monthly.zip',
    'rlr_annual': 'http://www.psmsl.org/data/obtaining/rlr.annual.data/rlr_annual.zip'
}

# each station has a number of files that you can look at.
# here we define a template for each filename
# The templates are filled in with str.format using:
#   {dataset} : the dataset name (a key of `urls`)
#   {id}      : the station id (index of the stations table)
#   {typetag} : the part of the dataset name before the first '_' ('rlr' or 'met')
# All entries except 'url' are paths inside the downloaded zip archive;
# 'url' is a web page on the PSMSL site.
# Note: 'datum' and 'rlr_info' are the same template.
names = {
    'datum': '{dataset}/RLR_info/{id}.txt',
    'diagram': '{dataset}/RLR_info/{id}.png',
    'url': 'http://www.psmsl.org/data/obtaining/rlr.diagrams/{id}.php',
    'data': '{dataset}/data/{id}.{typetag}data',
    'doc': '{dataset}/docu/{id}.txt',
    'contact': '{dataset}/docu/{id}_auth.txt',
    'rlr_info': '{dataset}/RLR_info/{id}.txt',
}


The next function retrieves data from the NOAA Earth System Research Laboratory, from which we create a dataset of the wind at a given latitude and longitude. This data can be used for fitting the model.

In [3]:
def make_wind_df(lat_i=53, lon_i=3):
    """
    Create a dataset for wind, for 1 latitude/longitude

    The u and v wind components are read from two remote netCDF files
    (uwnd.10m.mon.mean.nc and vwnd.10m.mon.mean.nc) at the grid point that
    is closest to the requested location.

    Parameters
    ----------
    lat_i : int
        degree latitude
    lon_i : int
        degree longitude, expressed in the same convention as the 'lon'
        variable of the remote files
        (NOTE(review): presumably 0-360 degrees east -- confirm before
        passing negative longitudes)

    Returns
    -------
    pandas.DataFrame
        Indexed by time 't', with columns:
        u, v       : wind components at the selected grid point
        speed      : sqrt(u**2 + v**2)
        direction  : angle of (u, v) in radians, in the [0, 2*pi) domain
        u2, v2     : signed squares of u and v (u**2 * sign(u), v**2 * sign(v))
    """
    u_file = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface_gauss/uwnd.10m.mon.mean.nc'
    v_file = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface_gauss/vwnd.10m.mon.mean.nc'

    def find_closest(lat, lon, lat_i=lat_i, lon_i=lon_i):
        """lookup the index of the closest lat/lon"""
        Lon, Lat = np.meshgrid(lon, lat)
        # squared distance in degrees is enough to rank the grid points
        idx = np.argmin(((Lat - lat_i)**2 + (Lon - lon_i)**2))
        i, j = np.unravel_index(idx, Lat.shape)
        return i, j

    # open the 2 files; the context managers make sure that both remote
    # datasets are closed again, also when something fails halfway
    with netCDF4.Dataset(u_file) as ds_u, netCDF4.Dataset(v_file) as ds_v:
        # read lat,lon, time from 1 dataset
        lat, lon, time = ds_u.variables['lat'][:], ds_u.variables['lon'][:], ds_u.variables['time'][:]

        # check with the others
        lat_v, lon_v, time_v = ds_v.variables['lat'][:], ds_v.variables['lon'][:], ds_v.variables['time'][:]
        assert (lat == lat_v).all() and (lon == lon_v).all() and (time == time_v).all()

        # convert to datetime
        t = netCDF4.num2date(time, ds_u.variables['time'].units)

        # this is the index where we want our data
        i, j = find_closest(lat, lon)

        # get the u, v variables (read into memory before the files are closed)
        print('found point', lat[i], lon[j])
        u = ds_u.variables['uwnd'][:, i, j]
        v = ds_v.variables['vwnd'][:, i, j]

    # compute derived quantities
    speed = np.sqrt(u ** 2 + v ** 2)

    # compute direction in 0-2pi domain
    direction = np.mod(np.angle(u + v * 1j), 2*np.pi)

    # put everything in a dataframe
    wind_df = pandas.DataFrame(data=dict(u=u, v=v, t=t, speed=speed, direction=direction))
    wind_df = wind_df.set_index('t')

    # square wind, keeping the sign of the original component
    wind_df['u2'] = wind_df['u']**2 * np.sign(wind_df['u'])
    wind_df['v2'] = wind_df['v']**2 * np.sign(wind_df['v'])

    # return it
    return wind_df


To find the Dutch stations in the metric data, we download the overview of the stations, and select all stations with coastline code 150, which indicates a Dutch station. Another coastline_code can also be used by specifying the keyword argument coastline_code.

In [4]:
def get_stations(zf, dataset_name, coastline_code=150, names=None):
    """
    Function to get a dataframe with the tide gauge stations within a dataset.
    The stations are filtered on a certain coastline_code, indicating a country,
    or, when names is given, on the station names.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Downloaded zipfile
    dataset_name : string
        Name of the dataset that is used: met_monthly, rlr_monthly, rlr_annual
    coastline_code : int
        Coastline code indicating the country. Only used when names is None.
    names : list of str, optional
        Station names to select (compared case-insensitively). When given,
        the selection is made on name only, over all stations in the dataset;
        the coastline_code is then ignored.

    Returns
    -------
    pandas.DataFrame
        The selected stations, indexed by station id. This is an independent
        copy, so columns can be added to it safely.
    """
    # this list contains a table of
    # station ID, latitude, longitude, station name, coastline code, station code, and quality flag
    csvtext = zf.read('{}/filelist.txt'.format(dataset_name))

    # Read the stations from the semicolon separated text.
    stations = pandas.read_csv(
        io.BytesIO(csvtext),
        sep=';',
        names=('id', 'lat', 'lon', 'name', 'coastline_code', 'station_code', 'quality'),
        converters={
            'name': str.strip,
            'quality': str.strip
        }
    )
    # Set index on column 'id'
    stations = stations.set_index('id')

    # Select on names (takes precedence over the coastline code)
    if names is not None:
        lower_names = [name.lower() for name in names]
        by_name = stations['name'].str.lower().isin(lower_names)
        # copy, so that callers adding columns do not write into a slice of `stations`
        return stations.loc[by_name].copy()

    # filter on coastline code (Netherlands is 150)
    return stations.where(stations['coastline_code'] == coastline_code).dropna(how='all')

In [5]:
def get_url(station, dataset):
    """
    return the url of the station information (diagram and datum)

    Parameters
    ----------
    station : pandas.Series
        Row of the stations dataframe; its name (station.name) is the station id
    dataset : string
        Name of the dataset, for example rlr_monthly

    Returns
    -------
    string
        The 'url' template from the global `names`, filled in for this station
    """
    # all template fields are offered; str.format ignores the ones that the
    # 'url' template does not use
    info = dict(
        dataset=dataset,
        id=station.name,
        typetag=dataset.split('_')[0]
    )
    return names['url'].format(**info)


In [6]:
def missing2nan(value, missing=-99999):
    """
    Parse value as a float and replace the missing-value marker by nan.

    Parameters
    ----------
    value : anything accepted by float()
        The raw value, for example a text field from a csv file
    missing : number
        The marker that indicates a missing value

    Returns
    -------
    float
        np.nan if the parsed value equals missing, the parsed value otherwise
    """
    number = float(value)
    return np.nan if number == missing else number


In [7]:
def year2date(year_fraction, dtype):
    """
    convert a year fraction to a datetime

    Parameters
    ----------
    year_fraction : array-like of float
        Decimal years, for example 1890.0417
    dtype : numpy dtype
        The (datetime64) dtype of the returned array

    Returns
    -------
    numpy.ndarray
        For each year fraction the first day of the month it falls in.
        Fractions without a fractional part (for example annual data) map to
        the first of January.
    """
    # the fraction of the year at which each of the 12 months starts
    startpoints = np.linspace(0, 1, num=12, endpoint=False)
    remainder = np.mod(year_fraction, 1)
    year = np.floor_divide(year_fraction, 1).astype('int')
    month = np.searchsorted(startpoints, remainder)
    # a remainder of exactly 0 gives month 0 (annual data), set those to january.
    # This is done per element, so a series that mixes whole years with
    # fractional years does not end up calling datetime with month 0.
    month = np.where(month == 0, 1, month)
    dates = [
        datetime.datetime(year_i, month_i, 1)
        for year_i, month_i
        in zip(year, month)
    ]
    datetime64s = np.asarray(dates, dtype=dtype)
    return datetime64s


In [8]:
def get_rlr2nap(zf, station, dataset):
    """
    Read rlr 2 nap correction from zipfile

    The correction is parsed from the last line of the form
    'Add <value> to data <...> onwards' in the RLR info file of the station.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Downloaded zipfile
    station : pandas.Series
        Row of the stations dataframe; its name (station.name) is the station id
    dataset : string
        Name of the dataset

    Returns
    -------
    callable
        Function that subtracts the correction from a value

    Raises
    ------
    IndexError
        If the RLR info file contains no 'Add ... to data ... onwards' line
    """
    info = dict(
        dataset=dataset,
        id=station.name,
    )
    filename = names['rlr_info'].format(**info)

    rlr_info = zf.read(filename).decode()
    matches = re.findall(r'Add (.+) to data .+ onwards', rlr_info)
    if not matches:
        raise IndexError('no RLR correction found in {}'.format(filename))
    # use the last match and the last word of it; strip the unit 'm'
    # (presumably metres, scaled by 1000 to the millimetres of the data -- TODO confirm)
    correction = float(matches[-1].split()[-1].replace('m', '')) * 1000

    return lambda x: x - correction
    

In [9]:
def car2nau(carthesian):
    """
    Convert an angle in degrees by mirroring it and shifting it by 90 degrees.

    The result is (90 - carthesian), wrapped into the [0, 360) domain, so
    0 maps to 90, 90 maps to 0 and 180 maps to 270.
    """
    return (90 - carthesian) % 360


In [1]:
def get_data(zf, wind_df, station, dataset, alpha):
    """
    get data for the station (pandas record) from the dataset (url)

    Parameters
    ----------
    zf : zipfile.ZipFile
        Downloaded zipfile to get the data from
    wind_df : pandas.DataFrame or None
        Dataset with the wind for a given latitude and longitude.
        If None, only the water levels are returned.
    station : pandas.Series
        Row of the selected_stations dataframe with station meta data
    dataset : string
        Name of the data set
    alpha : dictionary
        A dictionary with dominant wind directions for all coast stations.
        Currently not used by this function; it is kept so that existing
        calls keep working. Can be None.

    Returns
    -------
    pandas.DataFrame
        Indexed by time 't', with columns year, height, interpolated, flags
        and station. Missing heights (-99999 in the file) are nan. If wind_df
        is given, its columns are merged in (left join on time); for
        non-monthly datasets the wind is first averaged per calendar year.
        The u2 and v2 columns are filled with their mean where no wind is
        available.
    """
    # rlr or met
    typetag = dataset.split('_')[0]

    info = dict(
        dataset=dataset,
        id=station.name,
        typetag=typetag
    )
    raw = zf.read(names['data'].format(**info))

    if typetag == 'rlr':
        rlr2nap = get_rlr2nap(zf, station, dataset)

        def to_height(x):
            # first mark missing values, then apply the datum correction
            # (nan stays nan)
            return rlr2nap(missing2nan(x))
    else:
        # no datum correction, but the missing values still have to be
        # marked, otherwise -99999 ends up in the data as a water level
        to_height = missing2nan

    converters = {
        "interpolated": str.strip,
        "height": to_height,
    }

    df = pandas.read_csv(
        io.BytesIO(raw),
        sep=';',
        names=('year', 'height', 'interpolated', 'flags'),
        converters=converters,
    )
    df['station'] = station.name
    df['t'] = year2date(df.year, dtype=wind_df.index.dtype if wind_df is not None else np.datetime64)
    df = df.set_index('t')

    if wind_df is None:
        # Return only the water levels
        return df

    # merge the wind and water levels
    if 'monthly' in dataset:
        wind = wind_df
    else:
        # annual mean wind, labelled with the first of January of each year,
        # which is how year2date labels the annual water levels.
        # ('YS' replaces resample('A', label='left', loffset=1 day): the
        # loffset argument is no longer available in recent pandas versions)
        wind = wind_df.resample('YS').mean()
    merged = pandas.merge(df, wind, how='left', left_index=True, right_index=True)

    # where no wind is available, use the mean of the squared wind
    merged['u2'] = np.where(np.isnan(merged['u']), np.nanmean(merged['u2']), merged['u2'])
    merged['v2'] = np.where(np.isnan(merged['v']), np.nanmean(merged['v2']), merged['v2'])

    return merged

The next function uses all functions defined above to create a dataset with the tide gauge station data.

In [11]:
def get_station_data(dataset_name, coastline_code=150, names=None, include_wind=True, alphas=None):
    """
    MAIN FUNCTION. Method to get the station data for a certain dataset

    Parameters
    ----------
    dataset_name : string
        Name of the dataset, a key of the global `urls`:
        met_monthly, rlr_monthly or rlr_annual
    coastline_code : int
        Coastline code indicating the country (passed on to get_stations)
    names : list of str, optional
        Station names to select (passed on to get_stations)
    include_wind : bool
        If True, the wind is downloaded with make_wind_df (default location)
        and passed on to get_data for every station
    alphas : dictionary, optional
        Passed on to get_data as alpha

    Returns
    -------
    pandas.DataFrame
        The selected stations, with two extra columns:
        url  : the url of the station information
        data : the result of get_data for that station

    Raises
    ------
    requests.HTTPError
        If the download of the dataset fails with an HTTP error status
    """
    # download the zipfile, and fail here instead of on an unreadable zipfile
    resp = requests.get(urls[dataset_name])
    resp.raise_for_status()

    if include_wind:
        wind_df = make_wind_df()
    else:
        wind_df = None

    # we can read the zipfile from memory; it is closed when we are done
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        selected_stations = get_stations(zf, dataset_name=dataset_name, coastline_code=coastline_code, names=names)

        # compute the url for each station
        # (a list, like the data below, so that an empty selection also works)
        selected_stations['url'] = [get_url(station, dataset=dataset_name)
                                    for _, station in selected_stations.iterrows()]

        selected_stations['data'] = [get_data(zf, wind_df, station, dataset=dataset_name, alpha=alphas)
                                     for _, station in selected_stations.iterrows()]

    return selected_stations


The linear regression model for sea levels within the sea level monitor is defined in the next code block. The model is fitted on the given dataset. Wind and seasonal variability can both be taken into account when fitting.

In [12]:
def linear_model_HAC(df, with_wind=True, with_season=True):
    """
    Return the fit from the linear model on the given dataset df.
    Wind and season can be enabled and disabled

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with the columns 'height' and 'year' (decimal year), and
        'u2' and 'v2' if with_wind is True
    with_wind : bool
        Add the columns u2 and v2 as explanatory variables
    with_season : bool
        Add a dummy variable for each of the first 11 months of the year

    Returns
    -------
    fit : statsmodels regression results
        Ordinary least squares fit with HAC standard errors (1 lag)
    names : list of str
        A name for each fitted parameter, in the order of the parameters
    """
    y = df['height']
    # trend, and a cosine/sine pair with a period of 18.613 years (the 'Nodal' terms);
    # years are taken relative to 1970
    X = np.c_[
        df['year']-1970,
        np.cos(2*np.pi*(df['year']-1970)/18.613),
        np.sin(2*np.pi*(df['year']-1970)/18.613)
    ]
    # position within the year in months, in the [0, 12) domain
    month = np.mod(df['year'], 1) * 12.0
    names = ['Constant', 'Trend', 'Nodal U', 'Nodal V']
    if with_wind:
        X = np.c_[
            X,
            df['u2'],
            df['v2']
        ]
        names.extend(['Wind U^2', 'Wind V^2'])
    if with_season:
        # 11 dummies; the 12th month has no dummy of its own and is
        # absorbed by the constant
        for i in range(11):
            X = np.c_[
                X,
                np.logical_and(month >= i, month < i+1)
            ]
            names.append('month_%s' % (i+1, ))
    X = sm.add_constant(X)
    # the covariance type is an argument of fit, not of the model
    # (the model used to get an unknown 'covtype' keyword)
    model = sm.OLS(y, X, missing='drop')
    fit = model.fit(cov_type='HAC', cov_kwds={'maxlags':1}) # 1 lag (see ACF)
    return fit, names

In [None]:
# define the statistical model
def linear_model(df, with_wind=True, with_ar=True, with_season=True):
    """
    Return the fit from the linear model on the given dataset df.
    Wind and season can be enabled and disabled

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with the columns 'height' and 'year' (decimal year), and
        'u2' and 'v2' if with_wind is True
    with_wind : bool
        Add the columns u2 and v2 as explanatory variables
    with_ar : bool
        If True, fit a GLSAR model with an autoregressive error term of
        order 1 and HC0 standard errors. If False, fit ordinary least squares
        with HAC standard errors (1 lag).
    with_season : bool
        Add a dummy variable for each of the first 11 months of the year

    Returns
    -------
    fit : statsmodels regression results
        The fitted model
    names : list of str
        A name for each fitted parameter, in the order of the parameters
    """
    y = df['height']
    # trend, and a cosine/sine pair with a period of 18.613 years (the 'Nodal' terms);
    # years are taken relative to 1970
    X = np.c_[
        df['year']-1970,
        np.cos(2*np.pi*(df['year']-1970)/18.613),
        np.sin(2*np.pi*(df['year']-1970)/18.613)
    ]
    # position within the year in months, in the [0, 12) domain
    month = np.mod(df['year'], 1) * 12.0
    names = ['Constant', 'Trend', 'Nodal U', 'Nodal V']
    if with_wind:
        X = np.c_[
            X,
            df['u2'],
            df['v2']
        ]
        names.extend(['Wind $u^2$', 'Wind $v^2$'])

    if with_season:
        # 11 dummies; the 12th month has no dummy of its own and is
        # absorbed by the constant
        for i in range(11):
            X = np.c_[
                X,
                np.logical_and(month >= i, month < i+1)
            ]
            names.append('month_%s' % (i+1, ))

    X = sm.add_constant(X)

    if with_ar:
        model = sm.GLSAR(y, X, missing='drop', rho=1) # autocorrelation of order 1
        # NOTE(review): a plain fit() may leave rho at its initial value;
        # check whether iterative_fit() is what is intended here
        fit = model.fit(cov_type='HC0') # Heteroskedasticity robust standard errors
    else:
        model = sm.OLS(y, X, missing='drop') # no autocorrelation
        fit = model.fit(cov_type='HAC', cov_kwds={'maxlags':1})
        # Heteroskedasticity and autocorrelation (# 1 lag, see ACF) robust standard errors

    return fit, names