In [1]:
import os
import pandas as pd
import xarray as xr

# Lookup table: three-letter upper-case month abbreviation -> calendar month number (1-12).
month_name_to_number = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"],
        start=1,
    )
}

%load_ext autoreload
%autoreload 2

### 1. Read data

### APCC CLIK

The [Climate Information Toolkit (CLIK) from the APEC Climate Center (APCC)](https://cliks.apcc21.org/) consists of fifteen seasonal climate models whose hindcasts are available to download [here](https://cliks.apcc21.org/dataset/model). Alternatively, CLIK offers an API, which we used in our research; more information is available [here](https://cliks.apcc21.org/contents/openapi). Out of the fifteen models that are part of the APCC MME, we collected data from five. The remaining models were left out partly because some data was not consistently available for all years and months of initialization, and partly because the data had already been collected from the NMME or the Copernicus Climate Data Store (more in *../c3s/* or *../nmme/*).

- BoM ACCESS-S2 from Australia - **BOM**
- PNU-RDA CGCMv2.0 from Korea - **PNU**
- CWB TCWB1Tv1.1 from Chinese Taipei - **CWB**
- HMC SL-AV from Russia - **HMC**
- KMA GloSea6GC3.2 from Korea - **KMA**
- NCEP CFSv2 from the USA - **collected from NMME**
- METFR SYS8 from France - **collected from C3S**
- CMCC SOS3.5 from Italy - **collected from C3S**
- UKMO GloSea6 from the UK - **collected from C3S**
- ECCC CANSIPSv2.1 from Canada - **collected from C3S**
- APCC SCoPS from Korea - **not considered, because of missing data**
- BCC CSM1.1m from China - **not considered, because of missing data**
- MGO MGOAM-2 from Russia - **not considered, because of missing data**


In [2]:
def read_data(model_name_as_string):
    """Read all NetCDF hindcast files of one model and combine them into one dataframe.

    Every file in ``data/<model_name_as_string>/`` whose name ends with ``prec.nc``
    or ``t2m.nc`` is opened, flattened to a dataframe and averaged per
    (model, model_year, init_year, init_month, forecasted_year, forecasted_month,
    time, lat, lon); any other column of the file is dropped afterwards. Other
    files in the directory are counted for the progress output but not read.

    Hindcast metadata is taken from the file name, split on ``"_"``:
    the third-to-last token is the initialisation month, the second-to-last token
    the initialisation year, and the last four characters of the first token the
    model year.

    Params:
     - model_name_as_string: string, the name of the model (also the name of the
       sub-directory of ``data/`` that holds its files)
    Returns:
     - hindcasts: dataframe with the grouping keys listed above as columns plus
       one ``prec`` and one ``t2m`` column
    Raises:
     - ValueError: if the directory contains no ``prec.nc`` or no ``t2m.nc`` file
     - FileNotFoundError: if the model directory does not exist (from os.listdir)
    """
    climate_vars = ["prec", "t2m"]
    group_keys = ["model", "model_year", "init_year", "init_month",
                  "forecasted_year", "forecasted_month", "time", "lat", "lon"]
    directory_as_str = "data/" + model_name_as_string + "/"

    # Sorted, because os.listdir returns entries in arbitrary order and the
    # processing order should be reproducible between runs.
    filenames = sorted(
        entry for entry in os.listdir(directory_as_str)
        if os.path.isfile(os.path.join(directory_as_str, entry))
    )
    # to print progress of reading files
    total_number_of_files = len(filenames)
    print("job started, there are {} files to iterate".format(total_number_of_files))

    frames_by_variable = {variable: [] for variable in climate_vars}
    for counter, filename in enumerate(filenames, start=1):
        # Decide from the name alone whether the file is needed, so that unrelated
        # files are never opened.
        variable = next((v for v in climate_vars if filename.endswith(v + ".nc")), None)
        if variable is not None:
            # info about hindcasts are part of the filename
            splitted_filename = filename.split("_")
            init_month = splitted_filename[-3]
            year = int(splitted_filename[-2])
            model_year = splitted_filename[0][-4:]
            # The context manager closes the file handle once the data has been
            # copied into pandas; otherwise one handle per file stays open.
            with xr.open_dataset(os.path.join(directory_as_str, filename), engine="netcdf4") as dataset:
                raw = dataset.to_dataframe()
            df = (raw
                  .reset_index()
                  .assign(model=model_name_as_string, model_year=model_year, init_year=year,
                          init_month=init_month,
                          forecasted_year=lambda x: pd.to_datetime(x["time"]).dt.year,
                          forecasted_month=lambda x: pd.to_datetime(x["time"]).dt.month)
                  .groupby(group_keys)
                  .mean())
            # keep only the climate variables, dropping the averaged leftovers
            df = df.loc[:, [c for c in df.columns if c in climate_vars]]
            frames_by_variable[variable].append(df)
        if counter % 100 == 0:
            print(str(counter) + "/" + str(total_number_of_files))

    # pd.concat would fail with an unspecific "No objects to concatenate";
    # name the missing variable and the directory instead.
    missing = [variable for variable, frames in frames_by_variable.items() if not frames]
    if missing:
        raise ValueError(
            "no NetCDF files for variable(s) {} found in {}".format(missing, directory_as_str)
        )

    print("concatenating temp and rain..")
    hindcasts_prec = pd.concat(frames_by_variable["prec"], axis=0, ignore_index=False)
    hindcasts_temp = pd.concat(frames_by_variable["t2m"], axis=0, ignore_index=False)
    # Both frames share the same index levels; combine_first aligns them on that
    # index so each row ends up with its prec and its t2m value.
    hindcasts = hindcasts_prec.combine_first(hindcasts_temp).reset_index()
    print("job finished")
    return hindcasts
    

In [3]:
# Read each APCC model in turn (its name is echoed before loading) and stack the
# per-model frames into one table with a fresh running index.
li = []
for model in ("PNU", "KMA", "CWB", "HMC", "BOM"):
    print(model)
    li += [read_data(model)]
hindcasts = pd.concat(li, ignore_index=True)

PNU
job started, there are 577 files to iterate
        time  level   lat   lon  prec model model_year  init_year init_month  \
0 1993-11-01      1 -90.0   0.0   0.0   PNU       2021       1993        NOV   
1 1993-11-01      1 -90.0   2.5   0.0   PNU       2021       1993        NOV   
2 1993-11-01      1 -90.0   5.0   0.0   PNU       2021       1993        NOV   
3 1993-11-01      1 -90.0   7.5   0.0   PNU       2021       1993        NOV   
4 1993-11-01      1 -90.0  10.0   0.0   PNU       2021       1993        NOV   

   forecasted_year  forecasted_month  
0             1993                11  
1             1993                11  
2             1993                11  
3             1993                11  
4             1993                11  
        time  level   lat   lon         t2m model model_year  init_year  \
0 1993-11-01      1 -90.0   0.0  238.333328   PNU       2021       1993   
1 1993-11-01      1 -90.0   2.5  238.333328   PNU       2021       1993   
2 1993-11-01

KeyboardInterrupt: 

In [4]:
# Spot-check: display five randomly drawn rows of the combined hindcast table.
hindcasts.sample(5)

Unnamed: 0,model,model_year,init_year,init_month,forecasted_year,forecasted_month,time,lat,lon,prec,t2m
78244244,BOM,2022,2002,MAY,2002,8,2002-08-01,-32.5,290.0,2.687351,273.738892
16084713,PNU,2022,2008,MAY,2008,5,2008-05-01,-67.5,142.5,2.101794,246.68544
62377681,HMC,2022,2005,OCT,2005,12,2005-12-01,82.5,122.5,0.23364,241.539185
11805917,PNU,2022,2012,JUL,2012,8,2012-08-01,-75.0,192.5,1.434064,249.221481
37004903,CWB,2021,2003,JUL,2003,11,2003-11-01,-45.0,177.5,2.382907,283.722626


In [5]:
# Keep an independent copy of the loaded table under a second name.
hindcasts_copy = hindcasts.copy()

In [18]:
# Reset `hindcasts` to a fresh copy of the `hindcasts_copy` snapshot.
hindcasts = hindcasts_copy.copy()

### 2. Basic Preprocessing

In [19]:
def preprocess_hindcasts(raw_hindcasts, month_lookup):
    """Return the spatially and temporally filtered, unit-converted hindcasts.

    The input frame is not modified; all steps work on intermediate objects, so a
    failure part-way through cannot leave the caller's table half-converted.

    Params:
     - raw_hindcasts: dataframe with the columns model, init_year, init_month,
       forecasted_year, forecasted_month, lat, lon, prec and t2m
     - month_lookup: dict mapping the init_month labels to month numbers
    Returns:
     - dataframe with the columns model, year, init_month, forecasted_month,
       rain, tmean, lat, lon and a fresh running index
    Raises:
     - ValueError: if init_month holds a label that is not a key of month_lookup
    """
    # change coordinate reference units: longitudes above 180 are shifted by -360.
    # The float64 cast keeps the numbers equal to plain Python-float arithmetic
    # even if the column is stored as float32.
    lon_wrapped = raw_hindcasts["lon"].astype("float64")
    lon_wrapped = lon_wrapped.where(lon_wrapped <= 180, lon_wrapped - 360)

    # filter roughly by relevant area of Argentina (all bounds inclusive)
    in_area = raw_hindcasts["lat"].between(-45, -28) & lon_wrapped.between(-68, -55)
    area = raw_hindcasts.loc[in_area].assign(lon=lon_wrapped[in_area])

    # replace month names by integers; an unknown label is reported explicitly
    # instead of silently turning into NaN
    month_numbers = area["init_month"].map(month_lookup)
    unknown_labels = area.loc[month_numbers.isna() & area["init_month"].notna(), "init_month"].unique()
    if len(unknown_labels) > 0:
        raise ValueError("unexpected init_month value(s): {}".format(list(unknown_labels)))

    area = area.assign(
        init_month=month_numbers,
        # convert temperature from Kelvin to degree Celsius
        t2m=area["t2m"].astype("float64") - 273.15,
        # scale precipitation by 1/1000 to match the ERA reanalysis.
        # NOTE(review): a division by 1000 corresponds to e.g. mm -> m, not
        # "to mm" as previously stated here - confirm source and target units.
        prec=area["prec"].astype("float64") / 1000,
    )

    # filter by init month and forecasted month; HMC is not available for
    # init_month < 9, but we need 8 as feature, will be supplemented with ERA and
    # filtered later
    keep_rows = (
        area["init_month"].between(6, 11)
        & area["forecasted_month"].between(9, 11)
        & (area["forecasted_year"] == area["init_year"])
    )
    return (
        area
        .loc[keep_rows, ["model", "init_year", "init_month", "forecasted_month", "prec", "t2m", "lat", "lon"]]
        .rename(columns={"init_year": "year", "prec": "rain", "t2m": "tmean"})
        .reset_index(drop=True)
    )


# Single reassignment: `hindcasts` changes only if every preprocessing step succeeded.
hindcasts = preprocess_hindcasts(hindcasts, month_name_to_number)

In [20]:
# Availability overview: for every (model, init_month) pair, the set of
# forecasted months that are present after preprocessing.
hindcasts.groupby(["model", "init_month"])["forecasted_month"].apply(set)

model  init_month
BOM    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
CWB    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
HMC    7                     {9}
       8                 {9, 10}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
KMA    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
PNU    6             {9, 10, 11}
       7             {9, 10, 11}
       8             {9, 10, 11}
       9             {9, 10, 11}
       10               {10, 11}
       11                   {11}
Name: forecasted_month, d

## EXPORT

In [21]:
# Write the preprocessed hindcasts to CSV; the running row index is not exported.
hindcasts.to_csv("data/apcc_hindcasts.csv", index=False)