# Structural breaks

This document contains some methods for analysing trend breaks in the regression model with modelled storm surge.

To be more specific, the notebook considers the following three models:

- the linear model with trend, 

- the linear model with trend break at a fixed time and 

- the quadratic model 

The output of this notebook is saved as an R dataframe.

#### Remark

The OLS-based CUSUM test lives in a separate R notebook because it requires the strucchange package, which seems more appropriate and better documented than the equivalents in Python's statsmodels module and in user-contributed modules.

In [1]:
# Standard Python packages
import io
import os

# Python packages that need to be installed using pip or anaconda:
# For computations
import pandas
import numpy as np

# For plotting
import matplotlib.pyplot as plt
import bokeh.palettes
import bokeh.plotting
from bokeh.models import HoverTool

# Initialize modules for the jupyter notebook format
from nbformat import v4
%matplotlib inline
bokeh.io.output_notebook()

from IPython.display import display
import shutil

import statsmodels.graphics.regressionplots as plots

# Disable pandas warnings
pandas.options.mode.chained_assignment = None


- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


In [2]:
def execute_notebook(nbfile):
    """Run the code cells of another notebook inside the current IPython session.

    The file at `nbfile` is read as UTF-8 and parsed with the nbformat v4 reader.
    Cells whose type is not 'code' are skipped; every code cell is executed, in
    order, through the active IPython shell.
    """
    with io.open(nbfile, encoding="utf8") as handle:
        notebook = v4.reads_json(handle.read())

    shell = get_ipython()

    # Only code cells are executable; everything else is ignored
    code_cells = (cell for cell in notebook.cells if cell.cell_type == 'code')
    for cell in code_cells:
        shell.run_cell(cell.source)

In [3]:
# Load notebook with basic io functionality (wind, PSMSL) and standard linear model
execute_notebook('../satellite/get-data.ipynb')

In [4]:
def linear_model_with_surge(df):
    """
    Fit the linear sea-level model with nodal cycle and surge to the dataset df.

    The regressors are the years since 1970, a cosine and a sine term with a
    period of 18.613 years, and the 'surge' column of df; an intercept is added
    through sm.add_constant. Rows with missing values are dropped and the fit
    uses the HC0 covariance estimator.

    Returns a tuple (fit, names): the fitted OLS results and the labels of the
    coefficients, starting with the constant.
    """
    years_since_1970 = df['year'] - 1970
    nodal_phase = 2 * np.pi * years_since_1970 / 18.613

    regressors = np.c_[
        years_since_1970,
        np.cos(nodal_phase),
        np.sin(nodal_phase),
        df['surge']
    ]
    names = ['Constant', 'Trend', 'Nodal U', 'Nodal V', 'Surge']

    design = sm.add_constant(regressors)
    fit = sm.OLS(df['height'], design, missing='drop').fit(cov_type='HC0')
    return fit, names

In [5]:
# define the statistical model
def quadratic_model(df):
    """
    Fit the quadratic (parabolic) sea-level model to the dataset df.

    This corresponds to the hypothesis that sea-level rise is accelerating.
    The regressors are the years since 1970, their square, a cosine and a sine
    term with a period of 18.613 years, and the 'surge' column of df; an
    intercept is added through sm.add_constant. The fit uses the HC0
    covariance estimator.

    Rows with missing values are dropped, exactly as in linear_model_with_surge,
    so that the models that are compared are fitted in the same way.

    Returns a tuple (fit, names): the fitted OLS results and the labels of the
    coefficients, starting with the constant.
    """
    y = df['height']
    years_since_1970 = df['year'] - 1970
    nodal_phase = 2 * np.pi * years_since_1970 / 18.613

    X = np.c_[
        years_since_1970,
        years_since_1970 * years_since_1970,
        np.cos(nodal_phase),
        np.sin(nodal_phase),
        df['surge']
    ]
    names = ['Constant', 'Trend', 'Trend^2', 'Nodal U', 'Nodal V', 'Surge']
    X = sm.add_constant(X)
    # missing='drop' keeps this model consistent with the linear model
    model_quadratic = sm.OLS(y, X, missing='drop')
    fit = model_quadratic.fit(cov_type='HC0')

    return fit, names


In [6]:
# define the statistical model
def broken_linear_model(df, break_year=1993):
    """
    Fit the linear sea-level model with a trend break at a fixed year to df.

    This corresponds to the hypothesis that the sea level started to rise
    faster after `break_year` (1993 by default). Besides the overall trend
    (years since 1970) the model has an additional trend term that is zero up
    to and including break_year and equal to (year - break_year) afterwards.
    The remaining regressors are a cosine and a sine term with a period of
    18.613 years and the 'surge' column of df; an intercept is added through
    sm.add_constant. The fit uses the HC0 covariance estimator.

    Rows with missing values are dropped, exactly as in linear_model_with_surge,
    so that the models that are compared are fitted in the same way.

    Returns a tuple (fit, names): the fitted OLS results and the labels of the
    coefficients, starting with the constant.
    """
    y = df['height']
    years_since_1970 = df['year'] - 1970
    nodal_phase = 2 * np.pi * years_since_1970 / 18.613

    X = np.c_[
        years_since_1970,
        (df['year'] > break_year) * (df['year'] - break_year),
        np.cos(nodal_phase),
        np.sin(nodal_phase),
        df['surge']
    ]
    names = ['Constant', 'Trend', 'Add. trend after', 'Nodal U', 'Nodal V', 'Surge']
    X = sm.add_constant(X)
    # missing='drop' keeps this model consistent with the linear model
    model_broken_linear = sm.OLS(y, X, missing='drop')
    fit = model_broken_linear.fit(cov_type='HC0')
    return fit, names


In [7]:
# First load the model with the summary
station_names = ['Vlissingen', 'Hoek van Holland', 'Den Helder', 'Delfzijl', 'Harlingen', 'IJmuiden']

# The wind data location is 50 km off the coast from IJmuiden
rlr_data = get_station_data(
    dataset_name='rlr_annual',
    coastline_code=150,
    names=station_names,
    include_wind=False
)

rlr_annual 20 rlr
rlr_annual 22 rlr
rlr_annual 23 rlr
rlr_annual 24 rlr
rlr_annual 25 rlr
rlr_annual 32 rlr


In [8]:
stations = [20, 22, 23, 24, 25, 32]

# Stack the records of the selected stations and group the heights per year
grouped = (
    pandas.concat(rlr_data.loc[stations, 'data'].tolist())
    [['year', 'height']]
    .groupby(['year'])
)
# Mean height per year over the stations, with 'year' as a regular column again
mean_df = grouped.mean().reset_index()
# filter out non-trusted part (before NAP)
mean_df = mean_df.loc[mean_df['year'] >= 1890]

# From here on station_names holds the capitalized names taken from rlr_data
station_names = [name.capitalize() for name in rlr_data.loc[stations, 'name'].tolist()]

print(f'The sea water level data of the following stations are analyzed: {", ".join(station_names)}')

The sea water level data of the following stations are analyzed: Vlissingen, Hoek van holland, Den helder, Delfzijl, Harlingen, Ijmuiden


In [9]:
import getpass

username = getpass.getuser()
print(username)

# Directory that contains surge.pkl. Two users have a known location; any other
# user has to provide the directory through the SURGE_PATH environment variable.
if username == 'rongen':
    surgepath = 'D:/Documents/2695.50 Zeespiegelstijging 2018/Data/'
elif username == 'nicolai':
    surgepath = 'D:/Users/' + username + '/Documents/2695.50 Zeespiegelstijging 2018/Data/'
    print(surgepath)
else:
    # Without this branch surgepath stays undefined for other users and the
    # read below fails with an uninformative NameError.
    surgepath = os.environ.get('SURGE_PATH')
    if not surgepath:
        raise FileNotFoundError(
            f"No surge data directory is configured for user '{username}'. "
            "Set the SURGE_PATH environment variable to the directory that contains surge.pkl."
        )
    print(surgepath)

# Add surge
# Load surge and convert from meters to mm
# NOTE: unpickling can execute arbitrary code; only load surge.pkl from a trusted source.
surge = pandas.read_pickle(os.path.join(surgepath, 'surge.pkl')) * 1000
display(surge.head(5))

display(surge.tail(5))

nicolai
D:/Users/nicolai/Documents/2695.50 Zeespiegelstijging 2018/Data/


Unnamed: 0,Vlissingen,Hoek van holland,Den helder,Delfzijl,Harlingen,Ijmuiden
1979-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0
1979-01-01 00:10:00,-0.249468,0.590325,0.806396,-3.677294,-2.944402,-0.753033
1979-01-01 00:20:00,-0.89001,1.760389,3.596331,-11.441492,-5.254048,-2.288197
1979-01-01 00:30:00,-1.303061,2.307753,7.541155,-16.544901,-4.505036,-3.55656
1979-01-01 00:40:00,-1.140522,1.127529,10.611323,-14.176514,-5.693127,-4.844942


Unnamed: 0,Vlissingen,Hoek van holland,Den helder,Delfzijl,Harlingen,Ijmuiden
2014-12-31 23:10:00,-177.668601,-153.670296,-95.220782,15.352673,-68.900131,-110.704631
2014-12-31 23:20:00,-181.090802,-151.775345,-74.545339,10.494649,-63.965112,-108.435169
2014-12-31 23:30:00,-181.896344,-149.892956,-74.131303,6.929162,-56.788363,-105.52229
2014-12-31 23:40:00,-184.564605,-149.479315,-80.773443,2.179859,-46.634056,-100.702301
2014-12-31 23:50:00,-190.054506,-150.883034,-74.545391,-7.403774,-41.788533,-96.721426


In [10]:
# Yearly surge: mean per calendar year for every station, then the mean over the stations.
# If monthly averages are needed, group by "surge.index.month" instead.
average = (
    surge[station_names]
    .groupby(surge.index.year)
    .mean()
    .mean(axis=1)
)

# One row per year of mean_df, initialised with the mean of the yearly averages ...
surge_per_year = pandas.DataFrame(
    data=np.full(len(mean_df), average.mean()),
    index=mean_df['year'],
    columns=['surge']
)
# ... and overwritten with the actual yearly average where surge is available
surge_per_year.loc[average.index, 'surge'] = average.values
surge_per_year.index.name = 'year'

# Only merge once, so that re-running this cell does not add the column again
if 'surge' not in mean_df.columns:
    mean_df = mean_df.merge(surge_per_year.reset_index(), on='year')

# Corrected copy of the data: the surge is subtracted from the height
mean_df_corrected = mean_df.assign(height=lambda frame: frame['height'] - frame['surge'])


In [11]:
# Model name -> function that fits the model and returns (fit, coefficient names)
models = {
    'linear': linear_model_with_surge,
    'quadratic': quadratic_model,
    'broken_linear': broken_linear_model
}
fits = {}
tables = {}

# Fit every model on the yearly means and show its summary table
for model, fit_function in models.items():
    fits[model], names = fit_function(mean_df)
    tables[model] = fits[model].summary(
        yname='Sea-surface height',
        xname=names,
        title=f'{model.capitalize()} model'
    )
    display(tables[model])

0,1,2,3
Dep. Variable:,Sea-surface height,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.889
Method:,Least Squares,F-statistic:,410.7
Date:,"Tue, 28 Aug 2018",Prob (F-statistic):,4.0500000000000005e-70
Time:,17:27:44,Log-Likelihood:,-593.96
No. Observations:,128,AIC:,1198.0
Df Residuals:,123,BIC:,1212.0
Df Model:,4,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Constant,-23.1142,2.235,-10.341,0.000,-27.495,-18.733
Trend,1.9267,0.055,34.898,0.000,1.819,2.035
Nodal U,4.5272,3.113,1.454,0.146,-1.574,10.629
Nodal V,-10.8684,3.193,-3.403,0.001,-17.127,-4.610
Surge,0.9018,0.069,13.071,0.000,0.767,1.037

0,1,2,3
Omnibus:,3.042,Durbin-Watson:,1.505
Prob(Omnibus):,0.219,Jarque-Bera (JB):,2.517
Skew:,-0.258,Prob(JB):,0.284
Kurtosis:,3.454,Cond. No.,57.8


0,1,2,3
Dep. Variable:,Sea-surface height,R-squared:,0.894
Model:,OLS,Adj. R-squared:,0.89
Method:,Least Squares,F-statistic:,338.6
Date:,"Tue, 28 Aug 2018",Prob (F-statistic):,1e-69
Time:,17:27:44,Log-Likelihood:,-592.92
No. Observations:,128,AIC:,1198.0
Df Residuals:,122,BIC:,1215.0
Df Model:,5,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Constant,-25.9492,3.326,-7.801,0.000,-32.469,-19.429
Trend,2.0126,0.073,27.660,0.000,1.870,2.155
Trend^2,0.0026,0.002,1.420,0.156,-0.001,0.006
Nodal U,4.3966,3.089,1.423,0.155,-1.657,10.451
Nodal V,-10.9662,3.189,-3.439,0.001,-17.216,-4.716
Surge,0.9087,0.069,13.238,0.000,0.774,1.043

0,1,2,3
Omnibus:,1.84,Durbin-Watson:,1.528
Prob(Omnibus):,0.398,Jarque-Bera (JB):,1.344
Skew:,-0.191,Prob(JB):,0.511
Kurtosis:,3.325,Cond. No.,3420.0


0,1,2,3
Dep. Variable:,Sea-surface height,R-squared:,0.899
Model:,OLS,Adj. R-squared:,0.894
Method:,Least Squares,F-statistic:,362.4
Date:,"Tue, 28 Aug 2018",Prob (F-statistic):,2.1e-71
Time:,17:27:44,Log-Likelihood:,-590.25
No. Observations:,128,AIC:,1192.0
Df Residuals:,122,BIC:,1210.0
Df Model:,5,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Constant,-28.2819,2.864,-9.875,0.000,-33.895,-22.668
Trend,1.8008,0.071,25.462,0.000,1.662,1.939
Add. trend after,1.3277,0.385,3.446,0.001,0.573,2.083
Nodal U,4.7971,3.003,1.597,0.110,-1.089,10.683
Nodal V,-11.4149,3.157,-3.615,0.000,-17.603,-5.227
Surge,0.9221,0.060,15.244,0.000,0.804,1.041

0,1,2,3
Omnibus:,2.489,Durbin-Watson:,1.592
Prob(Omnibus):,0.288,Jarque-Bera (JB):,1.973
Skew:,-0.22,Prob(JB):,0.373
Kurtosis:,3.421,Cond. No.,57.9


In [12]:
# Print the AIC of each fitted model
for model in models:
    print('{} model: AIC = {:.2f}'.format(model.capitalize(), fits[model].aic))

Linear model: AIC = 1197.92
Quadratic model: AIC = 1197.85
Broken_linear model: AIC = 1192.50
