In [1]:
import pandas as pd
import xarray as xr
from dateutil.relativedelta import relativedelta

%load_ext autoreload
%autoreload 2

### 1. Read data

In [10]:
# Read the hindcast GRIB file of every forecasting system and stack them into
# one long-format DataFrame with a `model` column identifying the source.
list_hindcasts = []
for model in ["CMCC", "DWD", "ECMWF", "JMA", "METFR", "UKMO", "ECCC", "ECCC_operational"]:
    # Files are laid out as data/<MODEL>/<model>.grib
    grib_path = f"data/{model}/{model.lower()}.grib"
    # The context manager releases the GRIB file handle as soon as the data
    # has been materialised into a DataFrame.
    with xr.open_dataset(grib_path) as ds:
        df = ds.to_dataframe()
    # Average t2m / tprate over every dimension that is not a grouping key
    # (presumably the ensemble members -- TODO confirm), drop incomplete rows
    # and tag each row with the originating model.
    df = (
        df.groupby(["time", "step", "latitude", "longitude", "valid_time"])
        .agg({"t2m": "mean", "tprate": "mean"})
        .dropna()
        .assign(model=model)
        .reset_index()
    )
    list_hindcasts.append(df)
# ignore_index=True gives the stacked frame a unique RangeIndex; keeping the
# per-model indices would repeat every row label once per model.
hindcasts = pd.concat(list_hindcasts, axis=0, ignore_index=True)

### 2. Basic Preprocessing

In [11]:
# Snapshot of the freshly loaded hindcasts, taken before `hindcasts` is
# transformed in the next cell.
hindcasts_copy = hindcasts.copy(deep=True)

In [12]:
# Start from the untouched snapshot taken in the previous cell so that this
# cell can be re-run without applying the unit conversions twice. Resetting
# the index also guarantees unique row labels for the .loc assignments below.
hindcasts = hindcasts_copy.reset_index(drop=True)

# Uniform names for ECCC forecasts
hindcasts.loc[hindcasts["model"] == "ECCC_operational", "model"] = "ECCC"
# Rate per second -> rate per day (86400 seconds per day)
hindcasts["tprate"] = hindcasts["tprate"] * 86400
# Kelvin to Celsius (vectorised instead of a per-element lambda)
hindcasts["t2m"] = hindcasts["t2m"] - 273.15
# Convert to datetime objects
hindcasts["time"] = pd.to_datetime(hindcasts["time"])
hindcasts["valid_time"] = pd.to_datetime(hindcasts["valid_time"])

# JMA hindcasts that are initialized at the end of the month (day 26-31) are
# moved to the first day of the next month. MonthBegin(1) rolls a date that is
# not already a month start forward to the next one and keeps the time of day.
jma_late_init = hindcasts["time"].dt.day.between(26, 31) & (hindcasts["model"] == "JMA")
hindcasts.loc[jma_late_init, "time"] = hindcasts.loc[jma_late_init, "time"] + pd.offsets.MonthBegin(1)

# Keep only initializations on the first day of a month
hindcasts = hindcasts.loc[hindcasts["time"].dt.day == 1].reset_index(drop=True)

# Derive year and month of the initialization
hindcasts["init_month"] = hindcasts["time"].dt.month
hindcasts["year"] = hindcasts["time"].dt.year
# The forecasted month is the month before valid_time (presumably valid_time
# marks the end of the averaged month -- TODO confirm). A January valid_time
# wraps to December of the previous year instead of producing month 0.
valid_month = hindcasts["valid_time"].dt.month
hindcasts["forecasted_month"] = (valid_month - 2) % 12 + 1
hindcasts["forecasted_year"] = hindcasts["valid_time"].dt.year - (valid_month == 1).astype(int)

# Filter: forecasts for September-November issued between June and November
# of the same year
hindcasts = hindcasts.loc[
    (hindcasts["year"] == hindcasts["forecasted_year"])
    & hindcasts["init_month"].between(6, 11)
    & hindcasts["forecasted_month"].between(9, 11)
].reset_index(drop=True)

# Rename and reorder
hindcasts = hindcasts.rename(columns={"latitude": "lat", "longitude": "lon", "t2m": "tmean", "tprate": "rain"})
hindcasts = hindcasts[["model", "year", "init_month", "forecasted_month", "rain", "tmean", "lat", "lon"]]

In [13]:
# Size of the preprocessed frame as (rows, columns)
n_rows, n_cols = hindcasts.shape
(n_rows, n_cols)

(565245, 8)

In [17]:
# For every (model, init_month) pair, collect the distinct forecasted months
months_by_init = hindcasts.groupby(["model", "init_month"])["forecasted_month"]
months_by_init.apply(set)

model  init_month
CMCC   6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
DWD    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
ECCC   6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
ECMWF  6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
JMA    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                

In [18]:
# Preview five random rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
hindcasts.sample(n=5, random_state=0)

Unnamed: 0,model,year,init_month,forecasted_month,rain,tmean,lat,lon
275560,JMA,2003,10,11,0.000429,16.215387,-39.5,-69.0
52585,CMCC,2007,11,11,0.000486,13.835321,-29.0,-69.0
333343,METFR,2001,9,9,0.00219,9.544641,-40.0,-58.0
110196,DWD,2000,7,11,0.004868,23.161401,-26.0,-61.0
206422,ECMWF,2003,10,10,0.001456,12.639337,-40.0,-61.0


### 3. Export

In [19]:
# Write the preprocessed hindcasts to disk without the row index
hindcasts.to_csv(path_or_buf="data/c3s_hindcasts.csv", index=False)