### pre-process the JMA hindcasts, step 1: read the GRIB files, rename the variables and dimensions, and save to netCDF

### note: need to use pygrib as xarray does not read all the dimensions 

In [1]:
# Parameters
# NOTE(review): this looks like a papermill-style parameters cell (values overridden at run time) -- confirm
stage_in = "downloads"  # path component of the input directory (raw downloaded files)
stage_out = "pre_processed"  # path component of the output directory (netCDF files written by this notebook)
period = "hindcasts"  # path component shared by the input and output directories
provider = "JMA"  # path component shared by the input and output directories
GCM = "JMA"  # path component shared by the input and output directories
var_name = "Z500"  # selects the files to process ('T2M', 'PRECIP' or 'Z500'); lower-cased, it names the output variable

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
from datetime import datetime, timedelta

In [4]:
import pygrib
import numpy as np

In [5]:
import xarray as xr; print(xr.__version__)

0.15.0


In [6]:
import dask 

In [7]:
import os

In [8]:
import pathlib

----- 

### paths definitions 

In [9]:
HOME = pathlib.Path.home()

In [10]:
HOME

PosixPath('/home/nicolasf')

### input path 

In [11]:
# directory holding the downloaded GRIB files for this period / provider / GCM / variable
ipath = HOME.joinpath('drives', 'auck_projects', 'END19101', 'Working', 'data', 'GCMs', stage_in, period, provider, GCM, var_name)

### final output path 

In [12]:
# directory receiving the pre-processed netCDF files for this period / provider / GCM / variable
opath = HOME.joinpath('drives', 'auck_projects', 'END19101', 'Working', 'data', 'GCMs', stage_out, period, provider, GCM, var_name)

### create the output path if it doesn't already exist

In [59]:
# create the output directory (and any missing parents); exist_ok=True makes
# this a no-op when the directory is already there, with no check-then-create race
opath.mkdir(parents=True, exist_ok=True)

### list ALL the files 

In [23]:
# list every entry of the input directory, sorted at creation so the order is
# deterministic regardless of what the filesystem returns
lfiles = sorted(ipath.glob('*'))

In [24]:
lfiles.sort()

In [25]:
lfiles[0]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201501')

In [26]:
lfiles[-1]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141227.grb2')

### need to separate the `*_em` (ensemble mean) files from the `*_mon` (all members) files, e.g. `Ptt_em` / `Ptt_mon` for T2M

In [27]:
# token identifying each supported variable in the JMA file names
# (e.g. the Z500 files are named p500_Phh_em.* and p500_Phh_mon.*)
file_tokens = {'T2M': 'Ptt', 'PRECIP': 'Prr', 'Z500': 'Phh'}

# fail early and explicitly: previously an unsupported var_name left
# lfiles_em / lfiles_mon undefined (or stale from an earlier run of the cell)
if var_name not in file_tokens:
    raise ValueError(f"unsupported var_name {var_name!r}, expected one of {sorted(file_tokens)}")

# ensemble-mean files (<token>_em) and all-members files (<token>_mon)
lfiles_em = [x for x in lfiles if f"{file_tokens[var_name]}_em" in str(x)]
lfiles_mon = [x for x in lfiles if f"{file_tokens[var_name]}_mon" in str(x)]

In [28]:
lfiles_em.sort()

In [29]:
lfiles_mon.sort()

In [30]:
lfiles_em

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201501'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201502'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201503'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201504'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201505'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201506'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_em.201507'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindc

In [45]:
lfiles_mon[5]

PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790116.grb2')

In [46]:
lfiles_mon[-5:]

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141028.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141112.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141127.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141212.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.20141227.grb2')]

### we only keep the hindcasts initialised from 1979 onwards

In [47]:
# keep only the files whose initialisation year (first 4 digits of the date
# field in <prefix>.<YYYYMMDD>.grb2) is 1979 or later; filtering on the date
# instead of slicing by position keeps this cell idempotent when re-run
lfiles_mon = [x for x in lfiles_mon if int(x.name.split('.')[-2][:4]) >= 1979]

In [48]:
lfiles_mon[:5]

[PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790116.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790131.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790210.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790225.grb2'),
 PosixPath('/home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790312.grb2')]

### loop over the files in `lfiles_mon` and process them

In [60]:
for fname in lfiles_mon: 
    # Convert one all-members GRIB2 hindcast file into a netCDF file holding
    # the ensemble mean with dimensions (time, step, lat, lon).

    print(f"processing {fname}")

    # output file: same name as the input, with the .grb2 suffix swapped for .nc
    out_fname = opath / fname.name.replace('.grb2','.nc')

    # the initialisation date is the second-to-last dot-separated field of the file name
    init_date = datetime.strptime(fname.name.split('.')[-2], "%Y%m%d")

    validDates = []
    data = []

    grbs = pygrib.open(str(fname))

    try:
        for g in grbs: 
            data.append(g.values)
            validDates.append(g.validDate)

        if not data:
            raise ValueError(f"no GRIB messages found in {fname}")

        # grid and ensemble size are read from the last message of the file
        lats, lons = g.latlons()
        nmembers = g.numberOfForecastsInEnsemble
    finally:
        # always release the GRIB file handle, even if reading fails
        grbs.close()

    data = np.array(data)

    # number of distinct valid dates, i.e. number of forecast steps in the file
    nsteps = len(np.unique(validDates))

    # split the leading (message) axis into (step, member)
    # NOTE(review): assumes all the members of a given step are stored contiguously -- confirm
    data_reshaped = np.reshape(data, (nsteps, data.shape[0] // nsteps, data.shape[-2], data.shape[-1]))

    # add a leading, length-1 axis for the initialisation time
    data_reshaped = data_reshaped[np.newaxis, ...]

    # latlons() returns 2D arrays; reduce them to 1D coordinate vectors
    lats = lats[:,0]
    lons = lons[0,:]

    if nmembers != data_reshaped.shape[2]:
        print('oups, data shape doesnt match the number of ensemble members')
    if nsteps != 7: 
        print('oups, issue with the number of steps')

    vname = var_name.lower()

    d = {}

    d['time'] = (('time'), np.array(init_date).reshape(1,))
    d['step'] = (('step'), [1,2,3,4,5,6,7])
    d['lat'] = (('lat'), lats)
    d['lon'] = (('lon'), lons)
    d['member'] = (('member'), list(range(0, nmembers)))
    d[vname] = (('time', 'step', 'member', 'lat','lon'), data_reshaped)

    dset = xr.Dataset(d)

    # calculate the ensemble mean, as the files in the ensemble-mean list do not have the members 

    dset = dset.mean('member')

    # mask the missing values: `where` KEEPS the values for which the condition
    # is True, so the condition must select the valid data (the previous
    # `< -9999` condition replaced every physical value with NaN)
    dset[vname] = dset[vname].where(dset[vname] > -9999)

    dset.to_netcdf(out_fname, unlimited_dims=['time'])

    dset.close()


processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790116.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790131.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790210.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790225.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790312.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790327.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Working/data/GCMs/downloads/hindcasts/JMA/JMA/Z500/p500_Phh_mon.19790411.grb2
processing /home/nicolasf/drives/auck_projects/END19101/Workin

### now process the other (ensemble mean) files in `lfiles_em`, to 2016

In [None]:
for fname in lfiles_em: 
    # Convert one ensemble-mean GRIB file into a netCDF file with dimensions
    # (time, step, lat, lon).

    print(f"processing {fname}")

    # the initialisation month is the last dot-separated field of the file name
    init_date = datetime.strptime(fname.name.split('.')[-1], "%Y%m")

    # output name follows the all-members naming: <prefix>_mon.<YYYYMMDD>.nc
    # (this expression was previously evaluated but never assigned, so a stale
    # out_fname was reused and every file was written to the same output)
    out_fname = fname.name.replace('em','mon').split('.')[0] + f".{init_date:%Y%m%d}.nc"

    out_fname = opath / out_fname

    data = []

    grbs = pygrib.open(str(fname))

    try:
        for g in grbs: 
            data.append(g.values)

        if not data:
            raise ValueError(f"no GRIB messages found in {fname}")

        # the grid is read from the last message of the file
        lats, lons = g.latlons()
    finally:
        # always release the GRIB file handle, even if reading fails
        grbs.close()

    data = np.array(data)

    # drop the first message so that the remaining ones match the 3 steps below
    # NOTE(review): presumably the first message is the initial (lead 0) month -- confirm
    data = data[1:,...]

    # add a leading, length-1 axis for the initialisation time
    data_reshaped = data[np.newaxis, ...]

    # latlons() returns 2D arrays; reduce them to 1D coordinate vectors
    lats = lats[:,0]
    lons = lons[0,:]

    d = {}

    d['time'] = (('time'), np.array(init_date).reshape(1,))
    d['step'] = (('step'), [1,2,3])
    d['lat'] = (('lat'), lats)
    d['lon'] = (('lon'), lons)
    d[var_name.lower()] = (('time', 'step', 'lat','lon'), data_reshaped)

    dset = xr.Dataset(d)

    # NOTE(review): no missing-value masking is applied to these files (it was
    # disabled in the original cell) -- confirm that none is needed

    dset.to_netcdf(out_fname, unlimited_dims=['time'])

    dset.close()
