In [1]:
import pandas as pd
import xarray as xr
from dateutil.relativedelta import relativedelta

%load_ext autoreload
%autoreload 2

### 1. Read data

In [7]:
# Read the hindcasts of every model from data/<MODEL>/<model>.grib and stack
# them into one long table with a `model` column identifying the source.
list_hindcasts = []
for model in ["CMCC", "DWD", "ECMWF", "JMA", "METFR", "UKMO", "ECCC", "ECCC_operational"]:
    grib_path = "data/" + model + "/" + model.lower() + ".grib"
    # Open inside a context manager so the GRIB file handle is released as soon
    # as the data has been materialised as a DataFrame.
    with xr.open_dataset(grib_path) as ds:
        df = ds.to_dataframe()
    # Average t2m and tprate over all rows that share the same
    # (time, step, latitude, longitude, valid_time) key
    # (presumably this collapses the ensemble members - TODO confirm),
    # then drop incomplete rows and tag the rows with the model name.
    df = (
        df.groupby(["time", "step", "latitude", "longitude", "valid_time"])
        .agg({"t2m": "mean", "tprate": "mean"})
        .dropna()
        .assign(model=model)
        .reset_index()
    )
    list_hindcasts.append(df)
# ignore_index=True gives the stacked table a unique 0..n-1 index instead of
# repeating each per-model index, so row labels are unambiguous downstream.
hindcasts = pd.concat(list_hindcasts, axis=0, ignore_index=True)

### 2. Basic Preprocessing

In [9]:
# Keep an independent snapshot of the raw hindcasts; the preprocessing cell
# that follows overwrites `hindcasts`.
hindcasts_copy = hindcasts.copy(deep=True)

In [10]:
# Basic preprocessing of the stacked hindcasts: harmonise model names and units,
# align JMA start dates to the first of the month, derive calendar keys and keep
# only the September-November window.
# NOTE: this cell overwrites `hindcasts` (units are rescaled and columns renamed),
# so it must not be re-run on its own without re-running the cells above.

# Uniform names for ECCC forecasts
hindcasts.loc[hindcasts["model"] == "ECCC_operational", "model"] = "ECCC"
# Scale the precipitation rate by the number of seconds per day
# (presumably a per-second rate becoming a per-day amount - TODO confirm units)
hindcasts["tprate"] = hindcasts["tprate"] * 86400
# Kelvin to Celsius (vectorised; no per-row Python call needed)
hindcasts["t2m"] = hindcasts["t2m"] - 273.15
# Convert to datetime objects
hindcasts["time"] = pd.to_datetime(hindcasts["time"])
hindcasts["valid_time"] = pd.to_datetime(hindcasts["valid_time"])
# JMA hindcasts initialised at the end of a month (day 26-31) are moved to the
# first day of the following month. The mask is computed once and reused.
jma_late_start = hindcasts["time"].dt.day.between(26, 31) & (hindcasts["model"] == "JMA")
# MonthBegin(1) rolls a date that is not a month start forward to the next month
# start and keeps the time of day, which matches replace(day=1) + one month for
# the selected days.
hindcasts.loc[jma_late_start, "time"] = hindcasts.loc[jma_late_start, "time"] + pd.offsets.MonthBegin(1)
# Keep only initialisations that fall on the first day of a month
hindcasts = hindcasts.loc[hindcasts["time"].dt.day == 1].reset_index(drop=True)
# Derive year and month keys
hindcasts["init_month"] = hindcasts["time"].dt.month
# NOTE(review): the forecasted month is taken as the valid_time month minus one
# (presumably valid_time marks the end of the averaged month - confirm). A
# January valid_time yields 0 here; such rows are discarded by the filter below.
hindcasts["forecasted_month"] = hindcasts["valid_time"].dt.month - 1
hindcasts["year"] = hindcasts["time"].dt.year
hindcasts["forecasted_year"] = hindcasts["valid_time"].dt.year
# Keep forecasts issued and valid within the same year, initialised in
# months 9-11 and targeting months 9-11
same_year = hindcasts["year"] == hindcasts["forecasted_year"]
init_in_window = hindcasts["init_month"].between(9, 11)
target_in_window = hindcasts["forecasted_month"].between(9, 11)
hindcasts = hindcasts.loc[same_year & init_in_window & target_in_window].reset_index(drop=True)
# Rename and reorder
hindcasts = hindcasts.rename(columns={"latitude":"lat", "longitude":"lon", "t2m":"tmean", "tprate":"rain"})
hindcasts = hindcasts[["model", "year", "init_month", "forecasted_month", "rain", "tmean", "lat", "lon"]]

In [11]:
# Sanity check: (rows, columns) of the preprocessed hindcasts table
hindcasts.shape

(228606, 8)

In [13]:
# Eyeball five randomly drawn rows (no random_state is set, so the rows differ between runs)
hindcasts.sample(5)

Unnamed: 0,model,year,init_month,forecasted_month,rain,tmean,lat,lon
105652,JMA,1998,9,11,0.002804,21.355554,-32.0,-64.0
65012,DWD,2015,9,10,0.00735,21.106897,-28.0,-57.0
176528,UKMO,2008,9,11,0.003474,19.9255,-38.0,-65.0
11736,CMCC,2001,9,11,0.001847,13.888116,-40.0,-59.0
59046,DWD,2011,9,9,0.001452,3.903375,-36.0,-69.0


### 3. Export

In [15]:
# Persist the preprocessed hindcasts as CSV, without writing the row index
hindcasts.to_csv(path_or_buf="data/c3s_hindcasts.csv", index=False)