### pre-process the JMA hindcasts, step 1: read the GRIB files, rename the variables and dimensions, and save to netCDF

### note: we need to use pygrib, as xarray does not read all the dimensions of these files

In [112]:
# Parameters
# stage_in / stage_out: directory names of the input (raw GRIB) and output (netCDF) stages
stage_in = "downloads"
stage_out = "pre_processed"
# period, provider and GCM are further path components; GCM also prefixes the output file names
period = "hindcasts"
provider = "JMA"
GCM = "JMA"
# var_name is the last path component, selects the GRIB file-name prefix
# ('T2M', 'PRECIP' or 'Z500') and, lower-cased, names the variable in the output
var_name = "PRECIP"

In [113]:
%matplotlib inline
import matplotlib.pyplot as plt

In [114]:
from datetime import datetime, timedelta

In [115]:
import pygrib
import numpy as np
import pandas as pd

In [116]:
import xarray as xr; print(xr.__version__)

0.15.0


In [117]:
import dask 

In [118]:
import os

In [119]:
import pathlib

----- 

### paths definitions 

In [120]:
# all data paths below are built relative to the current user's home directory
HOME = pathlib.Path.home()

In [121]:
# display the resolved home directory
HOME

PosixPath('/home/nicolasf')

### input path 

In [122]:
# directory holding the raw files for this stage / period / provider / model / variable
ipath = HOME.joinpath(
    'drives', 'auck_projects', 'END19101', 'Working', 'data', 'GCMs',
    stage_in, period, provider, GCM, var_name,
)

### final output path 

In [123]:
# directory receiving the netCDF files written by this notebook
opath = HOME.joinpath(
    'drives', 'auck_projects', 'END19101', 'Working', 'data', 'GCMs',
    stage_out, period, provider, GCM, var_name,
)

### create the output path if it doesn't already exist

In [124]:
# exist_ok=True makes this a no-op when the directory is already there,
# without a separate (race-prone) existence check
opath.mkdir(parents=True, exist_ok=True)

### list ALL the files 

In [125]:
# every entry found directly under the input directory
lfiles = [entry for entry in ipath.glob('*')]

In [126]:
# order the paths so that the first / last entries shown below are deterministic
lfiles = sorted(lfiles)

In [127]:
# first path in sorted order
lfiles[0]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201501')

In [128]:
# last path in sorted order
lfiles[-1]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141227.grb2')

### need to separate the `*_em` (ensemble mean) files from the `*_mon` (all members) files; the prefix is `Ptt` for T2M, `Prr` for PRECIP and `Phh` for Z500

In [129]:
# file-name prefix used for each supported variable
file_prefixes = {'T2M': 'Ptt', 'PRECIP': 'Prr', 'Z500': 'Phh'}

# fail loudly on an unsupported variable instead of leaving lfiles_em / lfiles_mon
# undefined (or stale from a previous run)
if var_name not in file_prefixes:
    raise ValueError(f"unsupported var_name {var_name!r}, expected one of {sorted(file_prefixes)}")

file_prefix = file_prefixes[var_name]

# '<prefix>_em' files hold the ensemble mean, '<prefix>_mon' files hold all the members
lfiles_em = [x for x in lfiles if f"{file_prefix}_em" in str(x)]
lfiles_mon = [x for x in lfiles if f"{file_prefix}_mon" in str(x)]

In [130]:
# keep the ensemble-mean files in sorted order
lfiles_em = sorted(lfiles_em)

In [131]:
# keep the all-member files in sorted order (the positional indexing below relies on it)
lfiles_mon = sorted(lfiles_mon)

In [132]:
# list the ensemble-mean files
lfiles_em

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201501'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201502'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201503'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201504'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201505'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201506'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_em.201507'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/d

In [133]:
# sixth all-member file, to check where the record starts
lfiles_mon[5]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790327.grb2')

In [134]:
# last five all-member files
lfiles_mon[-5:]

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141028.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141112.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141127.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141212.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.20141227.grb2')]

### we only keep the initialisations from 1979 onwards

In [135]:
# Keep only the initialisations from 1979 onwards. The year is read from the
# YYYYMMDD token of each file name rather than testing for the substring '1974'
# in the full path and dropping a fixed number of files; this also works on an
# empty list and is safe to re-run.
first_year = 1979

lfiles_mon = [
    fname for fname in lfiles_mon
    if datetime.strptime(fname.name.split('.')[-2], "%Y%m%d").year >= first_year
]

In [136]:
# first five all-member files after the filtering above
lfiles_mon[:5]

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790116.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790131.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790210.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790225.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790312.grb2')]

### now select only the first initialisation of each month (the one issued between the 10th and the 16th)

In [137]:
# parse the YYYYMMDD token sitting just before the file extension of each file name
dates = []
for grib_path in lfiles_mon:
    date_token = grib_path.name.rsplit('.', 2)[-2]
    dates.append(datetime.strptime(date_token, "%Y%m%d"))

In [138]:
# map each initialisation date to the position of its file in lfiles_mon
file_positions = np.arange(len(dates))
index = pd.Series(data=file_positions, index=dates)

In [139]:
# keep the initialisations whose day of month falls between the 10th and the 16th (inclusive)
init_day = index.index.day
on_or_after_10th = init_day >= 10
on_or_before_16th = init_day <= 16
index = index[on_or_after_10th & on_or_before_16th]

In [140]:
# retained initialisation dates and the positions of their files
index

1979-01-16      0
1979-02-10      2
1979-03-12      4
1979-04-11      6
1979-05-16      8
             ... 
2014-08-14    854
2014-09-13    856
2014-10-13    858
2014-11-12    860
2014-12-12    862
Length: 432, dtype: int64

In [141]:
# pick the retained files by position; an array is needed for integer-array indexing
all_mon_files = np.array(lfiles_mon)
lfiles_mon = all_mon_files[index.to_numpy()]

In [142]:
# back to a plain Python list of paths
lfiles_mon = list(lfiles_mon)

### loop over the files in `lfiles_mon` and process them

In [None]:
for fname in lfiles_mon:

    print(f"processing {fname}")

    # the initialisation date is the YYYYMMDD token of the file name
    date_orig = datetime.strptime(fname.name.split('.')[-2], "%Y%m%d")

    # the output is time-stamped with the first day of the initialisation month
    date_init = datetime(date_orig.year, date_orig.month, 1)

    out_fname = opath / f"{GCM}_{var_name}_{date_init:%Y_%m}.nc"

    validDates = []
    data = []
    last_msg = None

    # read every GRIB message; the try/finally guarantees that the file handle
    # is released even if decoding fails part-way through
    grbs = pygrib.open(str(fname))
    try:
        for g in grbs:
            data.append(g.values)
            validDates.append(g.validDate)
            last_msg = g

        # without this check an empty file would silently reuse the grid and the
        # member count of the previously processed file
        if last_msg is None:
            raise ValueError(f"no GRIB message found in {fname}")

        # the grid and the ensemble size are taken from the last message read
        lats, lons = last_msg.latlons()
        nmembers = last_msg.numberOfForecastsInEnsemble
    finally:
        grbs.close()

    data = np.array(data)

    # number of distinct valid dates, used as the number of forecast steps
    nsteps = len(np.unique(validDates))

    # (message, lat, lon) -> (step, member, lat, lon)
    # NOTE(review): assumes the messages are stored step by step, with all the
    # members of one step contiguous — confirm against the GRIB layout
    data_reshaped = np.reshape(data, (nsteps, data.shape[0] // nsteps, data.shape[-2], data.shape[-1]))

    # add a leading singleton 'time' dimension
    data_reshaped = data_reshaped[np.newaxis, ...]

    # latlons() returns 2D grids: keep the first column / first row as 1D coordinates
    lats = lats[:, 0]
    lons = lons[0, :]

    if nmembers != data_reshaped.shape[2]:
        print('oups, data shape doesnt match the number of ensemble members')
    if nsteps != 7:
        print('oups, issue with the number of steps')

    vname = var_name.lower()

    d = {}

    d['time'] = (('time'), np.array(date_init).reshape(1,))
    d['step'] = (('step'), [1,2,3,4,5,6,7])
    d['lat'] = (('lat'), lats)
    d['lon'] = (('lon'), lons)
    d['member'] = (('member'), list(range(0, nmembers)))
    d[vname] = (('time', 'step', 'member', 'lat','lon'), data_reshaped)

    dset = xr.Dataset(d)

    # keep the steps labelled 1 to 3 only (label-based slice, both ends included)
    dset = dset.sel(step=slice(1,3))

    # calculate the ensemble mean, as the ensemble-mean files processed
    # separately do not contain the individual members
    dset = dset.mean('member')

    dset = dset.sortby('lat')

    ### mask the land values (np.nan)
    # `where` KEEPS the values for which the condition is True, so the condition
    # must select the valid data: the previous `< -9999` test kept only the
    # flagged values and turned every valid value into NaN
    dset[vname] = dset[vname].where(dset[vname] > -9999, np.nan)

    dset.to_netcdf(out_fname, unlimited_dims='time')

    dset.close()


processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790116.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790210.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/PRECIP/surf_Prr_mon.19790312.grb2


### now process the ensemble-mean files (`lfiles_em`), which extend the record to 2016

In [None]:
for fname in lfiles_em:

    print(f"processing {fname}")

    # the initialisation month is the YYYYMM token after the last dot of the file name
    date_orig = datetime.strptime(fname.name.split('.')[-1], "%Y%m")

    date_init = datetime(date_orig.year, date_orig.month, 1)

    out_fname = opath / f"{GCM}_{var_name}_{date_init:%Y_%m}.nc"

    data = []
    last_msg = None

    # read every GRIB message; the try/finally guarantees that the file handle
    # is released even if decoding fails part-way through
    grbs = pygrib.open(str(fname))
    try:
        for g in grbs:
            data.append(g.values)
            last_msg = g

        # without this check an empty file would silently reuse the grid of the
        # previously processed file
        if last_msg is None:
            raise ValueError(f"no GRIB message found in {fname}")

        # the grid is taken from the last message read
        lats, lons = last_msg.latlons()
    finally:
        grbs.close()

    data = np.array(data)

    # drop the first message; the remaining ones are labelled as steps 1 to 3 below
    # NOTE(review): presumably the first message is a lead time that is not needed — confirm
    data = data[1:,...]

    # add a leading singleton 'time' dimension
    data_reshaped = data[np.newaxis, ...]

    # latlons() returns 2D grids: keep the first column / first row as 1D coordinates
    lats = lats[:, 0]
    lons = lons[0, :]

    d = {}

    d['time'] = (('time'), np.array(date_init).reshape(1,))
    d['step'] = (('step'), [1,2,3])
    d['lat'] = (('lat'), lats)
    d['lon'] = (('lon'), lons)
    d[var_name.lower()] = (('time', 'step', 'lat','lon'), data_reshaped)

    dset = xr.Dataset(d)

    dset = dset.sortby('lat')

    # NOTE(review): no land masking is applied to these files (the masking
    # statement was commented out here) — confirm that this is intended

    dset.to_netcdf(out_fname, unlimited_dims='time')

    dset.close()
