In [2]:
import pandas as pd
import xarray as xr
from dateutil.relativedelta import relativedelta

%load_ext autoreload
%autoreload 2

### 1. Read data

In [20]:
# Read the hindcasts of every model and stack them into one long DataFrame.
# Each model is expected at data/<MODEL>/<model>.grib.
list_hindcasts = []
for model in ["CMCC", "DWD", "ECMWF", "JMA", "METFR", "UKMO", "ECCC", "ECCC_operational"]:
    grib_path = "data/" + model + "/" + model.lower() + ".grib"
    # The context manager releases the file handle as soon as the data has been
    # materialised as a DataFrame, instead of leaving one open handle per model.
    with xr.open_dataset(grib_path) as dataset:
        df = dataset.to_dataframe()
    # Average t2m and tprate over every index level that is not a grouping key,
    # drop rows with missing values and tag the rows with their model.
    df = (
        df.groupby(["time", "step", "latitude", "longitude", "valid_time"])
        .agg({"t2m": "mean", "tprate": "mean"})
        .dropna()
        .assign(model=model)
        .reset_index()
    )
    list_hindcasts.append(df)
# ignore_index=True gives the stacked frame a unique 0..n-1 index; keeping the
# per-model indices would leave duplicate row labels, which make label-based
# selection and assignment ambiguous.
hindcasts = pd.concat(list_hindcasts, axis=0, ignore_index=True)

In [21]:
# Display the concatenated hindcasts (pandas abbreviates the repr of large frames).
hindcasts

Unnamed: 0,time,step,latitude,longitude,valid_time,t2m,tprate,model
0,1993-04-01,30 days,-25.0,-69.0,1993-05-01,283.522827,2.750266e-10,CMCC
1,1993-04-01,30 days,-25.0,-68.0,1993-05-01,279.689941,3.969913e-10,CMCC
2,1993-04-01,30 days,-25.0,-67.0,1993-05-01,279.902527,2.366786e-09,CMCC
3,1993-04-01,30 days,-25.0,-66.0,1993-05-01,283.908844,1.698170e-08,CMCC
4,1993-04-01,30 days,-25.0,-65.0,1993-05-01,287.464691,3.244487e-08,CMCC
...,...,...,...,...,...,...,...,...
539131,2016-11-01,181 days,-42.0,-61.0,2017-05-01,289.146973,1.972936e-08,ECMWF
539132,2016-11-01,181 days,-42.0,-60.0,2017-05-01,288.385254,1.584399e-08,ECMWF
539133,2016-11-01,181 days,-42.0,-59.0,2017-05-01,287.322754,1.307185e-08,ECMWF
539134,2016-11-01,181 days,-42.0,-58.0,2017-05-01,285.217285,9.535738e-09,ECMWF


### 2. Basic Preprocessing

In [22]:
# Snapshot of the loaded hindcasts taken before preprocessing; `.copy()` makes it
# independent of any later change to `hindcasts`.
hindcasts_copy = hindcasts.copy()

In [23]:
# Unit-conversion constants
SECONDS_PER_DAY = 86400
KELVIN_OFFSET = 273.15


def preprocess_hindcasts(raw):
    """Turn the concatenated raw hindcasts into the tidy table used downstream.

    The input frame is never modified: every step works on a fresh copy, so the
    function can be called repeatedly on the same raw data.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with the columns ``model``, ``time``, ``valid_time``, ``latitude``,
        ``longitude``, ``t2m`` and ``tprate``.

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``year``, ``init_month``, ``forecasted_month``,
        ``rain``, ``tmean``, ``lat``, ``lon`` with a fresh 0..n-1 index, limited
        to forecasts initialised in months 6-11 that target months 9-11 of the
        same year.
    """
    # reset_index returns a new frame (leaving `raw` untouched) and guarantees
    # unique row labels for the masked assignments below.
    df = raw.reset_index(drop=True)

    # Uniform names for ECCC forecasts
    df.loc[df["model"] == "ECCC_operational", "model"] = "ECCC"
    # Per-second rate to per-day rate
    df["tprate"] = df["tprate"] * SECONDS_PER_DAY
    # Kelvin to Celsius, computed in float64 so precision does not depend on the
    # stored dtype
    df["t2m"] = df["t2m"].astype("float64") - KELVIN_OFFSET
    # Convert to datetime objects
    df["time"] = pd.to_datetime(df["time"])
    df["valid_time"] = pd.to_datetime(df["valid_time"])

    # JMA hindcasts initialised at the end of a month (day 26-31) are moved to
    # the first day of the following month. MonthBegin(1) rolls a date that is
    # not on the 1st forward to the next month start.
    late_jma = (df["model"] == "JMA") & df["time"].dt.day.between(26, 31)
    df.loc[late_jma, "time"] = df.loc[late_jma, "time"] + pd.offsets.MonthBegin(1)

    # Keep only initialisations on the first day of a month
    df = df.loc[df["time"].dt.day == 1].reset_index(drop=True)

    # Derive year and month of the initialisation
    df["init_month"] = df["time"].dt.month
    df["year"] = df["time"].dt.year
    # The forecasted month is the calendar month before valid_time
    # (NOTE(review): presumably valid_time marks the end of the averaged month -
    # confirm against the data source). Monthly periods make the step back wrap
    # correctly over the year boundary: January -> December of the previous
    # year, rather than month 0 of the same year.
    target_month = df["valid_time"].dt.to_period("M") - 1
    df["forecasted_month"] = target_month.dt.month
    df["forecasted_year"] = target_month.dt.year

    # Filter: same-year forecasts, initialised in months 6-11, targeting 9-11
    keep = (
        (df["year"] == df["forecasted_year"])
        & df["init_month"].between(6, 11)
        & df["forecasted_month"].between(9, 11)
    )
    df = df.loc[keep].reset_index(drop=True)

    # Rename and reorder
    df = df.rename(columns={"latitude": "lat", "longitude": "lon", "t2m": "tmean", "tprate": "rain"})
    return df[["model", "year", "init_month", "forecasted_month", "rain", "tmean", "lat", "lon"]]


# Always start from the raw snapshot so this cell can be re-run safely: working
# on `hindcasts` itself would hit the already renamed/converted columns on a
# second run.
hindcasts = preprocess_hindcasts(hindcasts_copy)

In [24]:
# (rows, columns) of the preprocessed hindcasts.
hindcasts.shape

(168480, 8)

In [25]:
# Coverage check: for every (model, init_month) pair, collect the distinct
# forecasted months that are present after preprocessing.
hindcasts.groupby(["model", "init_month"])["forecasted_month"].apply(set)

model  init_month
CMCC   6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
ECMWF  6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
Name: forecasted_month, dtype: object

In [26]:
# Preview five randomly drawn rows; no random_state is passed, so the selection
# differs between runs.
hindcasts.sample(5)

Unnamed: 0,model,year,init_month,forecasted_month,rain,tmean,lat,lon
72099,CMCC,2013,8,11,0.001384,11.400018,-27.0,-68.0
97644,ECMWF,1996,10,10,0.000591,16.902246,-30.0,-68.0
119072,ECMWF,2002,10,11,0.001585,19.008936,-40.0,-64.0
156822,ECMWF,2013,9,10,0.00307,14.394434,-28.0,-66.0
13739,CMCC,1996,10,11,0.001625,17.496545,-37.0,-58.0


## EXPORT

In [27]:
# Write the preprocessed hindcasts to data/c3s_hindcasts.csv; index=False keeps
# the row index out of the file.
hindcasts.to_csv("data/c3s_hindcasts.csv", index=False)