In [1]:
import pandas as pd
import xarray as xr
from dateutil.relativedelta import relativedelta

%load_ext autoreload
%autoreload 2

### 1. Read data

In [5]:
# Read the hindcast GRIB file of every model into one long table.
# Each file is expected at data/<MODEL>/<model>.grib. Rows containing any NaN
# are dropped and every row is tagged with the model it was read from.
# All rows of each file are kept as-is (no aggregation over `number`).
list_hindcasts = []
for model in ["CMCC", "DWD", "ECMWF", "JMA", "METFR", "UKMO", "ECCC", "ECCC_operational"]:
    grib_path = f"data/{model}/{model.lower()}.grib"
    # The context manager closes the file handle once the data has been
    # materialised as a DataFrame, instead of leaving one open dataset per model.
    with xr.open_dataset(grib_path) as ds:
        df = ds.to_dataframe()
    df = df.dropna().assign(model=model).reset_index()
    list_hindcasts.append(df)
# ignore_index=True gives the combined frame a unique RangeIndex; keeping the
# per-model indexes would leave duplicate labels across models, which makes
# label-based .loc assignments on the combined frame fragile.
hindcasts = pd.concat(list_hindcasts, axis=0, ignore_index=True)

In [6]:
# Inspect the combined raw table (pandas renders a truncated head/tail view).
hindcasts

Unnamed: 0,number,time,step,latitude,longitude,surface,valid_time,t2m,tprate,model
0,0,1993-04-01,30 days,-25.0,-69.0,0.0,1993-05-01,283.522827,2.750266e-10,CMCC
1,0,1993-04-01,30 days,-25.0,-68.0,0.0,1993-05-01,279.689941,3.969913e-10,CMCC
2,0,1993-04-01,30 days,-25.0,-67.0,0.0,1993-05-01,279.902527,2.366786e-09,CMCC
3,0,1993-04-01,30 days,-25.0,-66.0,0.0,1993-05-01,283.908844,1.698170e-08,CMCC
4,0,1993-04-01,30 days,-25.0,-65.0,0.0,1993-05-01,287.464691,3.244487e-08,CMCC
...,...,...,...,...,...,...,...,...,...,...
326035,9,2022-12-01,182 days,-43.0,-61.0,0.0,2023-06-01,283.104095,1.536934e-08,ECCC_operational
326036,9,2022-12-01,182 days,-43.0,-60.0,0.0,2023-06-01,282.422485,1.436804e-08,ECCC_operational
326037,9,2022-12-01,182 days,-43.0,-59.0,0.0,2023-06-01,281.866699,1.289770e-08,ECCC_operational
326038,9,2022-12-01,182 days,-43.0,-58.0,0.0,2023-06-01,281.725250,1.407527e-08,ECCC_operational


### 2. Basic Preprocessing

In [7]:
# Snapshot of the raw combined frame, taken before any preprocessing
# (presumably so preprocessing can be re-run without re-reading the GRIB files).
hindcasts_copy = hindcasts.copy()

In [12]:
# Restore the raw frame from the snapshot. The preprocessing cell below
# overwrites `hindcasts`, so run this cell first whenever that cell is re-run.
hindcasts = hindcasts_copy.copy()

In [13]:
# Basic preprocessing of the raw hindcast table:
# unit conversion, init-date harmonisation, derivation of init/forecast
# month and year, filtering to the months of interest, and column clean-up.
# NOTE: this cell overwrites `hindcasts`; it expects the raw columns
# (time, valid_time, t2m, tprate, ...) and cannot be run twice in a row.

# Inclusive (first, last) month bounds used by the filter below.
INIT_MONTHS = (6, 11)
FORECASTED_MONTHS = (8, 11)

# Uniform names for ECCC forecasts
hindcasts.loc[hindcasts["model"] == "ECCC_operational", "model"] = "ECCC"
# Per-second rate -> per-day rate (86400 seconds per day)
hindcasts["tprate"] = hindcasts["tprate"] * 86400
# Kelvin to Celsius, vectorised. The float64 cast keeps the values the same
# as the previous element-wise version whatever the stored float precision.
hindcasts["t2m"] = hindcasts["t2m"].astype("float64") - 273.15
# Convert to datetime object
hindcasts["time"] = pd.to_datetime(hindcasts["time"])
hindcasts["valid_time"] = pd.to_datetime(hindcasts["valid_time"])
# JMA hindcasts initialized at the end of a month (day 26-31) are moved to
# the first day of the following month. The mask is computed once, and
# MonthBegin(1) rolls each date forward to the next month start.
jma_late_init = (hindcasts["model"] == "JMA") & hindcasts["time"].dt.day.between(26, 31)
hindcasts.loc[jma_late_init, "time"] = hindcasts.loc[jma_late_init, "time"] + pd.offsets.MonthBegin(1)
# Keep only runs whose init date is the first day of a month
hindcasts = hindcasts.loc[hindcasts["time"].dt.day == 1].reset_index(drop=True)
# Derive init year and month
hindcasts["init_month"] = hindcasts["time"].dt.month
hindcasts["year"] = hindcasts["time"].dt.year
# The forecasted month is the calendar month before valid_time (e.g. init
# 1993-04-01 with valid_time 1993-05-01 forecasts April). Period arithmetic
# handles the year boundary: a January valid_time maps to December of the
# previous year instead of month 0 of the wrong year.
forecasted_period = hindcasts["valid_time"].dt.to_period("M") - 1
hindcasts["forecasted_month"] = forecasted_period.dt.month
hindcasts["forecasted_year"] = forecasted_period.dt.year
# Filter: forecast within the init year, and months inside the chosen windows
same_year = hindcasts["year"] == hindcasts["forecasted_year"]
init_in_window = hindcasts["init_month"].between(*INIT_MONTHS)
forecast_in_window = hindcasts["forecasted_month"].between(*FORECASTED_MONTHS)
hindcasts = hindcasts.loc[same_year & init_in_window & forecast_in_window].reset_index(drop=True)
# Rename and reorder
hindcasts = hindcasts.rename(columns={"latitude":"lat", "longitude":"lon", "t2m":"tmean", "tprate":"rain"})
hindcasts = hindcasts[["model", "year", "init_month", "number", "forecasted_month", "rain", "tmean", "lat", "lon"]]

In [14]:
# (rows, columns) of the preprocessed table.
hindcasts.shape

(14255956, 9)

In [16]:
# Count of distinct `number` values per model (presumably ensemble members).
hindcasts.groupby("model")["number"].nunique()

model
CMCC     40
DWD      30
ECCC     10
ECMWF    25
JMA       5
METFR    25
UKMO      7
Name: number, dtype: int64

In [10]:
# Sanity check: which forecasted months are present for each (model, init_month) pair.
hindcasts.groupby(["model", "init_month"])["forecasted_month"].apply(set)

model  init_month
CMCC   6             {8, 9, 10, 11}
       7             {8, 9, 10, 11}
       8             {8, 9, 10, 11}
       9                {9, 10, 11}
       10                  {10, 11}
       11                      {11}
DWD    6             {8, 9, 10, 11}
       7             {8, 9, 10, 11}
       8             {8, 9, 10, 11}
       9                {9, 10, 11}
       10                  {10, 11}
       11                      {11}
ECCC   6             {8, 9, 10, 11}
       7             {8, 9, 10, 11}
       8             {8, 9, 10, 11}
       9                {9, 10, 11}
       10                  {10, 11}
       11                      {11}
ECMWF  6             {8, 9, 10, 11}
       7             {8, 9, 10, 11}
       8             {8, 9, 10, 11}
       9                {9, 10, 11}
       10                  {10, 11}
       11                      {11}
JMA    6             {8, 9, 10, 11}
       7             {8, 9, 10, 11}
       8             {8, 9, 10, 11}
       9  

In [26]:
# Random spot check of five rows. No random_state is set, so the rows differ on every run.
# NOTE(review): the recorded output lacks the `number` column that the
# preprocessing cell selects, so it appears to come from an earlier run -- re-run before sharing.
hindcasts.sample(5)

Unnamed: 0,model,year,init_month,forecasted_month,rain,tmean,lat,lon
72099,CMCC,2013,8,11,0.001384,11.400018,-27.0,-68.0
97644,ECMWF,1996,10,10,0.000591,16.902246,-30.0,-68.0
119072,ECMWF,2002,10,11,0.001585,19.008936,-40.0,-64.0
156822,ECMWF,2013,9,10,0.00307,14.394434,-28.0,-66.0
13739,CMCC,1996,10,11,0.001625,17.496545,-37.0,-58.0


### 3. Export

In [17]:
# Write the preprocessed table to CSV, without the row index.
hindcasts.to_csv("data/c3s_hindcasts_ensembles.csv", index=False)