In [1]:
# this is a list of packages that are used in this notebook
# these come with python
import io
import zipfile
import functools
import bisect
import datetime
import pathlib
import uuid
import json

# you can install these packages using pip or anaconda
# (requests netCDF4 numpy pandas geopandas simplejson pyproj shapely)

# for downloading
import requests
import netCDF4

# computation libraries
import numpy as np
import pandas as pd
import geopandas
import simplejson
import pandas as pd

# coordinate systems
import pyproj 
import shapely.geometry


# Tide gauges
This notebook converts tide gauge data from the Permanent Service for Mean Sea Level (PSMSL) and makes it available on Google Cloud Storage for the sea-level rise viewer.

In [2]:
# download url of the zip archive of each PSMSL dataset
psmsl_urls = {
    'met_monthly': 'http://www.psmsl.org/data/obtaining/met.monthly.data/met_monthly.zip',
    'rlr_monthly': 'http://www.psmsl.org/data/obtaining/rlr.monthly.data/rlr_monthly.zip',
    'rlr_annual': 'http://www.psmsl.org/data/obtaining/rlr.annual.data/rlr_annual.zip'
}
# single root of the local data, so the location only has to be changed in one place
psmsl_dir = pathlib.Path('~/src/sealevel/data/psmsl').expanduser()
# local copy of each archive, derived from the url keys so that both dicts
# always have the same keys in the same order
psmsl_files = {
    dataset_name: psmsl_dir / '{}.zip'.format(dataset_name)
    for dataset_name in psmsl_urls
}
# name of the dataset that is treated as the default
default_dataset_name = 'rlr_annual'
# directory that receives the generated (geo)json files
data_dir = psmsl_dir / 'gcs'
quantity = "sea_surface_height"


# Convert to DataFrame

In [3]:
# open every archive once and keep it open:
# the files of the individual stations are read from these handles later on
zipfiles = {}
for dataset_name, archive_path in psmsl_files.items():
    zipfiles[dataset_name] = zipfile.ZipFile(archive_path)

# The station list is taken from the default dataset. It is named explicitly
# instead of relying on the value the loop variable happened to end with.
# The list is a table of
# station ID, latitude, longitude, station name, coastline code, station code, and quality flag
csvtext = zipfiles[default_dataset_name].read('{}/filelist.txt'.format(default_dataset_name))
# read all the data
stations = pd.read_csv(
    io.BytesIO(csvtext),
    sep=';',
    names=('id', 'lat', 'lon', 'name', 'coastline_code', 'station_code', 'quality'),
    converters={
        # the text columns are padded with whitespace
        'name': str.strip,
        'quality': str.strip
    }
)
# the station id identifies a record from here on
stations = stations.set_index('id')


In [4]:
# show the resolved location of each archive as a quick check
psmsl_files

{'met_monthly': PosixPath('/Users/baart_f/src/sealevel/data/psmsl/met_monthly.zip'),
 'rlr_monthly': PosixPath('/Users/baart_f/src/sealevel/data/psmsl/rlr_monthly.zip'),
 'rlr_annual': PosixPath('/Users/baart_f/src/sealevel/data/psmsl/rlr_annual.zip')}

In [5]:
# templates for everything that belongs to a station: the entries that start with
# {dataset_name} are paths, the entries that start with http are web pages
# fill them in with .format(dataset_name=..., id=...)
names = dict(
    datum='{dataset_name}/RLR_info/{id}.txt',
    diagram='{dataset_name}/RLR_info/{id}.png',
    diagram_url='http://www.psmsl.org/data/obtaining/rlr.diagrams/{id}.php',
    url='http://www.psmsl.org/data/obtaining/stations/{id}.php',
    rlr_monthly='{dataset_name}/data/{id}.rlrdata',
    rlr_annual='{dataset_name}/data/{id}.rlrdata',
    met_monthly='{dataset_name}/data/{id}.metdata',
    doc='{dataset_name}/docu/{id}.txt',
    contact='{dataset_name}/docu/{id}_auth.txt',
)

In [6]:
# add urls to the dataset
def get_url(station):
    """Fill the station id (the name of the record) into the 'url' template and return the result."""
    return names['url'].format(id=station.name)

# compute the url for each station
stations['url'] = stations.apply(get_url, axis=1)

# File name of the timeseries of each station, one column per dataset.
# The name is a name-based uuid of "<station id>_<dataset>_<quantity>" plus '.json'.
# Each column is built in one assignment rather than cell by cell.
for dataset_name in psmsl_urls:
    stations[dataset_name + '_url'] = [
        str(uuid.uuid3(uuid.NAMESPACE_OID, str(station_id) + "_" + dataset_name + '_' + quantity)) + '.json'
        for station_id in stations.index
    ]
stations.head()

Unnamed: 0_level_0,lat,lon,name,coastline_code,station_code,quality,url,met_monthly_url,rlr_monthly_url,rlr_annual_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,48.38285,-4.494838,BREST,190,91,N,http://www.psmsl.org/data/obtaining/stations/1...,68cde77a-e39f-3234-bc03-6c1823ff5b3f.json,225d66c2-4e14-38ad-ac37-d1c293eb55f4.json,44ee6bf2-aa96-3176-a666-538618f9a8c1.json
2,53.916667,14.233333,SWINOUJSCIE,110,92,N,http://www.psmsl.org/data/obtaining/stations/2...,aa6b9e00-1026-3ba1-b175-d8b564c19da2.json,0a8386ac-9c5c-35ae-a02e-957f4805ad93.json,b68173a3-841d-32ee-97c7-c391b2ff8c67.json
3,51.445639,0.743444,SHEERNESS,170,101,N,http://www.psmsl.org/data/obtaining/stations/3...,2ff0fcda-06a3-3b5e-92c5-f10a3960ef21.json,c67b3c75-3b30-33ad-adb2-62e37df65017.json,12974b4a-c97e-3037-b177-ace03f4098e2.json
5,53.313944,-4.620444,HOLYHEAD,170,191,Y,http://www.psmsl.org/data/obtaining/stations/5...,60a74295-9514-3c91-ba09-2d0dae722775.json,35c694f2-9132-38b7-81c6-4c227ada2298.json,53db2d23-26b2-3145-b492-9c6f5ac7880e.json
7,53.866667,8.716667,CUXHAVEN 2,140,12,N,http://www.psmsl.org/data/obtaining/stations/7...,fbc469e8-8396-3dbb-96c2-ef3da7f7c208.json,bf66ea18-18a8-3ae7-bf8c-f09ce4a24a1d.json,f4912296-02d6-3200-b334-0e35f5041d69.json


In [7]:
# turn the station table into a geodataframe with one point per station
# coordinates are stored as (lon, lat) pairs, i.e. in x, y order
lon_lat_pairs = zip(stations.lon, stations.lat)
stations['coordinate'] = list(lon_lat_pairs)
station_points = stations['coordinate'].apply(shapely.geometry.Point)
stations['geometry'] = station_points
stations = geopandas.GeoDataFrame(stations, geometry='geometry')

# GeoJSON
We convert the station table to a GeoJSON file, so that the data can be read easily by a web server.

In [8]:
# columns that are included in the GeoJSON output
columns = [
    'lat', 'lon', 'name', 'coastline_code', 'station_code', 'quality', 'coordinate',
    'met_monthly_url', 'rlr_monthly_url', 'rlr_annual_url', 'geometry'
]
text = stations[columns].to_json()

path_output = data_dir / 'locations.geojson'
# the output directory does not exist yet on a fresh machine, create it before writing
path_output.parent.mkdir(parents=True, exist_ok=True)

with path_output.open('w') as f:
    f.write(text)

In [9]:
def station2location(station):
    """Describe a station record as a location dictionary.

    The dictionary holds a name-based uuid of the station id, the url, id (code),
    name and geometry of the station, and the node that hosts the data.
    """
    base_url = "https://s3-eu-west-1.amazonaws.com/deltares-opendata"
    hosting_node = {
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_URL, base_url)),
        "name": "Deltares",
        "description": "PSMSL data location hosted by Deltares",
        "baseUrl": base_url
    }
    # the name of the record is the station id, the 'name' entry is the station name
    station_id = station.name
    return {
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_OID, str(station_id))),
        "url": station.url,
        "code": station_id,
        "name": station['name'],
        "geometry": station.geometry.__geo_interface__,
        "node": hosting_node
    }

In [10]:
# The locations are wrapped the way the response to a web query would be
# (results, count and paging fields), which is the layout that fits the web viewer.
results = stations.apply(station2location, axis=1)
locations_response = dict(
    results=list(results),
    count=len(stations.index),
    # everything fits on one page, so the paging fields stay empty
    maxPageSize=None,
    previous=None,
    next=None
)

In [11]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts numpy scalars and arrays.

    numpy integers, floats and booleans are converted to the matching python
    type and arrays to (nested) lists. Any other object is passed on to the
    base class, which raises a TypeError.
    """
    def default(self, obj):
        # The abstract scalar types cover every concrete width. They also keep
        # working on numpy 2, where aliases such as np.float_ no longer exist.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
    
# store the locations next to the other generated files
locations_path = data_dir / 'locations.json'
with locations_path.open('w') as f:
    f.write(json.dumps(locations_response, cls=NumpyEncoder))

# Timeseries

In [12]:
# use nans for missing, a bit more performant than masked arrays
def missing2nan(value, missing=-99999):
    """Parse value as a float and return nan when it equals the missing value marker."""
    number = float(value)
    return np.nan if number == missing else number

def year2date(year_fraction, dtype='datetime64[s]'):
    """Convert decimal years to the first day of the month they fall in.

    For example 1993.12 -> 1993-02-01. A value without a fraction
    (annual data) becomes the first of january of that year.

    year_fraction: array-like of years with an optional fraction
    dtype: a valid numpy datetime dtype, such as datetime64[s]
    returns: numpy array of dates of the requested dtype
    """
    year_fraction = np.asarray(year_fraction, dtype=float)
    # start of each month as a fraction of the year: 0, 1/12, ..., 11/12
    startpoints = np.linspace(0, 1, num=12, endpoint=False)
    remainder = np.mod(year_fraction, 1)
    year = np.floor_divide(year_fraction, 1).astype('int')
    # gives the month number (1 to 12) for every remainder above 0
    month = np.searchsorted(startpoints, remainder)
    # A remainder of exactly 0 gives month 0, which is not a valid month. It is set
    # to january element by element, so that whole years mixed with fractional
    # years are converted as well instead of raising an error.
    month = np.maximum(month, 1)
    dates = [
        datetime.datetime(int(year_i), int(month_i), 1)
        for year_i, month_i
        in zip(year, month)
    ]
    datetime64s = np.asarray(dates, dtype=dtype)
    return datetime64s

def get_data(station, dataset_name):
    """Read the timeseries of one station from the opened archive of a dataset.

    station: pandas record of the station, the name of the record is the station id
    dataset_name: key of the dataset, used to look up both the archive and the file name template
    returns: DataFrame indexed by date (t) with the columns
        year, height, interpolated, flags and station
    """
    member = names[dataset_name].format(dataset_name=dataset_name, id=station.name)
    # called content and not bytes, which would hide the builtin type of that name
    content = zipfiles[dataset_name].read(member)
    df = pd.read_csv(
        io.BytesIO(content),
        sep=';',
        names=('year', 'height', 'interpolated', 'flags'),
        converters={
            # parses the height and replaces the missing value marker by nan
            "height": missing2nan,
            "interpolated": str.strip,
        }
    )
    df['station'] = station.name
    # the year column holds the year plus the fraction of the year
    df['t'] = year2date(df.year)
    df = df.set_index('t')
    return df

In [13]:
# attach the timeseries of every station to the table, one column per dataset
for dataset_name in psmsl_urls:
    # every cell of the new column holds the DataFrame of that station
    stations[dataset_name] = [
        get_data(station, dataset_name=dataset_name)
        for _, station in stations.iterrows()
    ]

In [14]:
def station2events(station,dataset_name):
    """Convert the timeseries of a station to a list of events.

    station: record that holds the timeseries (a DataFrame with a height column) under dataset_name
    dataset_name: key of the timeseries in the station record
    returns: a list with one dictionary per row, holding the index value as text
        (timeStamp) and the height (value)
    """
    heights = station[dataset_name]["height"]
    # walk over the index and the values directly, this avoids the Series that iterrows builds for every row
    return [
        {
            "timeStamp": str(timestamp),
            "value": height
        }
        for timestamp, height
        in zip(heights.index, heights.values)
    ]

In [15]:
def station2timeseries(station, dataset_name, quantity="sea_surface_height"):
    """Assemble the timeseries dictionary of one station for one dataset.

    The dictionary identifies the series (uuid and qualifier), describes the location,
    observation type, hosting node and data source, and holds the events themselves.
    """
    # one text identifies the series, the uuid is derived from that text
    series_qualifier = str(station.name) + "_" + dataset_name + '_' + quantity
    deltares_url = "https://s3-eu-west-1.amazonaws.com/deltares-opendata"

    observation_type = {
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_OID, quantity)),
        "quantity": quantity,
        "unit": "mm",
        "parameterCode": None,
        "compartment": None,
        "qualifier": None,
        "extra": [],
    }
    hosting_node = {
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_URL, deltares_url)),
        "name": "Deltares",
        "description": "PSMSL data location hosted by Deltares",
        "baseUrl": deltares_url
    }
    data_source = {
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_OID, "PSMSL")),
        "name": 'PSMSL',
        "node": {
            "uuid": str(uuid.uuid3(uuid.NAMESPACE_URL, "https://www.psmsl.org")),
            "name": 'PSMSL',
            "description": 'Permanent Service for Mean Sea Level',
            "baseUrl": 'http://www.psmsl.org/'
        }
    }

    return {
        "url": None,
        "uuid": str(uuid.uuid3(uuid.NAMESPACE_OID, series_qualifier)),
        "qualifier": series_qualifier,
        "location": station2location(station),
        "observationType": observation_type,
        "node": hosting_node,
        "datasource": data_source,
        "timeseriesType": {
            "code": None,
            "name": "Measurements"
        },
        "interval": None,
        "valueType": "float",
        "start": None,
        "end": None,
        "events": station2events(station, dataset_name)
    }

In [16]:
# write one json file per station and dataset
# the file name is the uuid of the timeseries followed by '.json'
for _, station in stations.iterrows():
    for dataset_name in psmsl_urls:
        result = station2timeseries(station, dataset_name)
        path = (data_dir / result['uuid']).with_suffix('.json')
        with path.open('w') as f:
            # ignore_nan writes nan as null, so that missing heights still give valid json
            simplejson.dump(result, f, ignore_nan=True)
