### Pre-process the JMA hindcasts, step 2: adopt the same file structure and conventions as the CDS

In [21]:
%matplotlib inline
import matplotlib.pyplot as plt

In [22]:
from datetime import datetime, timedelta

In [23]:
import pygrib
import numpy as np
import pandas as pd

In [24]:
import xarray as xr; print(xr.__version__)

0.14.0


In [25]:
import dask

In [26]:
import os

In [27]:
import pathlib

In [107]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [108]:
# Directory holding the JMA z850 hindcast NetCDF files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
dpath = pathlib.Path('/home/nicolasf/drives/auck_projects/END19101/Working/data/hindcasts/JMA/z850/')

In [109]:
# Collect every NetCDF file found in `dpath`, sorted by path so that the list
# order is deterministic as soon as it is created (glob yields entries in
# arbitrary order).
lfiles = sorted(dpath.glob("*.nc"))

In [110]:
# sanity check: number of NetCDF files found
len(lfiles)

864

In [111]:
# in-place sort by path, so the files are handed to xarray in a stable order
lfiles.sort()

In [112]:
# Open all the files as one dataset (dask-backed, see the repr below);
# `combine='by_coords'` lets xarray order and combine the files from their
# coordinate values.
dset = xr.open_mfdataset(lfiles, combine='by_coords')

In [113]:
# inspect the combined dataset (all initialisation dates)
dset

<xarray.Dataset>
Dimensions:    (latitude: 73, longitude: 144, number: 5, step: 7, time: 864)
Coordinates:
  * step       (step) int64 1 2 3 4 5 6 7
  * latitude   (latitude) float64 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0
  * number     (number) int64 0 1 2 3 4
  * longitude  (longitude) float64 0.0 2.5 5.0 7.5 ... 350.0 352.5 355.0 357.5
  * time       (time) datetime64[ns] 1979-01-16 1979-01-31 ... 2014-12-27
Data variables:
    gh         (time, step, number, latitude, longitude) float64 dask.array<chunksize=(1, 7, 5, 73, 144), meta=np.ndarray>

In [114]:
# initialisation dates as a pandas Series, used below to filter on the day of the month
index = dset.time.to_series()

In [115]:
# keep only the initialisation dates whose day of the month lies between
# the 10th and the 16th (both inclusive)
index = index.loc[lambda s: (s.index.day >= 10) & (s.index.day <= 16)]

In [116]:
# restrict the dataset to the initialisation dates retained in `index`
dset = dset.sel({'time': index.index})

In [94]:
# size of the dataset in GB (1e9 bytes)
dset.nbytes / 1e9

1.271536808

In [117]:
# inspect the dataset after the selection of the initialisation dates
dset

<xarray.Dataset>
Dimensions:    (latitude: 73, longitude: 144, number: 5, step: 7, time: 432)
Coordinates:
  * step       (step) int64 1 2 3 4 5 6 7
  * latitude   (latitude) float64 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0
  * number     (number) int64 0 1 2 3 4
  * longitude  (longitude) float64 0.0 2.5 5.0 7.5 ... 350.0 352.5 355.0 357.5
  * time       (time) datetime64[ns] 1979-01-16 1979-02-10 ... 2014-12-12
Data variables:
    gh         (time, step, number, latitude, longitude) float64 dask.array<chunksize=(1, 7, 5, 73, 144), meta=np.ndarray>

### normalise the time axis so that each initialisation date is the first of its month; the original ('real') initialisation dates are kept in a separate variable (`init_dates`)

In [118]:
# original ('real') initialisation dates, taken from the time coordinate
init_dates = dset.time

In [119]:
# convert the DataArray of initialisation dates to a pandas index
init_dates = init_dates.to_index()

In [120]:
# nominal initialisation dates: same year and month as the real ones, day set to 1
init_dates_fl = [datetime(year=d.year, month=d.month, day=1) for d in init_dates]

In [122]:
# overwrite the time coordinate with the first-of-month dates
dset['time'] = ('time', init_dates_fl)

### calculate the rolling averages (3-step window) over the `step` dimension

In [124]:
# 3-step running mean along `step`; with min_periods=3 only complete windows
# produce a value, so the first two steps come out as NaN
dset_rm = dset.rolling(step=3, min_periods=3).mean()

In [125]:
# inspect the rolling-mean dataset (all 7 steps still present at this point)
dset_rm

<xarray.Dataset>
Dimensions:    (latitude: 73, longitude: 144, number: 5, step: 7, time: 432)
Coordinates:
  * step       (step) int64 1 2 3 4 5 6 7
  * latitude   (latitude) float64 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0
  * number     (number) int64 0 1 2 3 4
  * longitude  (longitude) float64 0.0 2.5 5.0 7.5 ... 350.0 352.5 355.0 357.5
  * time       (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2014-12-01
Data variables:
    gh         (time, step, number, latitude, longitude) float64 dask.array<chunksize=(1, 1, 5, 73, 144), meta=np.ndarray>

In [128]:
# keep steps 3 and above, i.e. drop the steps without a complete 3-step window
dset_rm = dset_rm.sel({'step': slice(3, None)})

In [129]:
# inspect the rolling-mean dataset restricted to steps 3 to 7
dset_rm

<xarray.Dataset>
Dimensions:    (latitude: 73, longitude: 144, number: 5, step: 5, time: 432)
Coordinates:
  * step       (step) int64 3 4 5 6 7
  * latitude   (latitude) float64 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0
  * number     (number) int64 0 1 2 3 4
  * longitude  (longitude) float64 0.0 2.5 5.0 7.5 ... 350.0 352.5 355.0 357.5
  * time       (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2014-12-01
Data variables:
    gh         (time, step, number, latitude, longitude) float64 dask.array<chunksize=(1, 5, 5, 73, 144), meta=np.ndarray>

### create a variable `forecast_valid_time`, which varies along the `time` and `step` dimensions and contains the forecast validity dates

In [130]:
# Build `forecast_valid_time`, dimensions (time, step): for every initialisation
# date, the validity date of each step is that date shifted by `step` calendar
# months. The offsets are read from the `step` coordinate rather than being
# hard-coded, so they always match the steps actually present in `dset_rm`.
# NOTE(review): this labels each 3-step mean with its last step and assumes
# that step k is valid k months after the nominal initialisation month — confirm.
month_offsets = [relativedelta(months=int(s)) for s in dset_rm.step.data]
lt_arr = []
for t in dset_rm.time.data:
    init = pd.to_datetime(t)  # converted once per initialisation date
    lt = np.array([init + offset for offset in month_offsets])
    lt_arr.append(lt)
lt_arr = np.array(lt_arr)
dset_rm['forecast_valid_time'] = (('time','step'), lt_arr)

In [131]:
# inspect the dataset, now including `forecast_valid_time`
dset_rm

<xarray.Dataset>
Dimensions:              (latitude: 73, longitude: 144, number: 5, step: 5, time: 432)
Coordinates:
  * step                 (step) int64 3 4 5 6 7
  * latitude             (latitude) float64 90.0 87.5 85.0 ... -85.0 -87.5 -90.0
  * number               (number) int64 0 1 2 3 4
  * longitude            (longitude) float64 0.0 2.5 5.0 ... 352.5 355.0 357.5
  * time                 (time) datetime64[ns] 1979-01-01 ... 2014-12-01
Data variables:
    gh                   (time, step, number, latitude, longitude) float64 dask.array<chunksize=(1, 5, 5, 73, 144), meta=np.ndarray>
    forecast_valid_time  (time, step) datetime64[ns] 1979-04-01 ... 2015-07-01