In [2]:
import os
import pandas as pd
import xarray as xr

%load_ext autoreload
%autoreload 2

# Three-letter month abbreviation (as it appears in hindcast filenames) -> calendar month number.
month_name_to_number = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"),
        start=1,
    )
}

In [3]:
# Grid cells to keep. (lat, lon) become the index so that hindcast frames can be
# filtered with an index-on-index inner join in basic_preprocessing.
hindcast_coordinates = pd.read_csv(
    "data/hindcast_coordinate_pairs_filtered.csv",
    index_col=["lat", "lon"],
)

In [16]:
def read_data(model_name_as_string):
    """Read a model's NetCDF hindcast files and combine them into one dataframe.

    Only regular files in ``data/<model_name_as_string>/`` whose name ends in
    ``prec.nc`` or ``t2m.nc`` are read. Hindcast metadata is parsed from each
    filename after splitting it on "_": the last four characters of the first
    token are used as model year, the third-from-last token as initialisation
    month and the second-from-last token (converted to int) as initialisation
    year.

    Params:
     - model_name_as_string: string, the name of the model; also the name of
       the sub-directory of ``data/`` that is scanned

    Returns:
     - hindcasts: dataframe with the columns model, model_year, init_year,
       init_month, forecasted_year, forecasted_month, time, lat, lon plus
       whichever of the data columns "prec" / "t2m" the files provide

    Raises:
     - ValueError: if the directory holds no ``prec.nc`` file or no ``t2m.nc``
       file (previously this surfaced as pandas' "No objects to concatenate")
    """
    climate_vars = ["prec", "t2m"]
    index_columns = ["model", "model_year", "init_year", "init_month",
                     "forecasted_year", "forecasted_month", "time", "lat", "lon"]
    directory_as_str = os.path.join("data", model_name_as_string)
    # One list of per-file frames for each recognised filename suffix.
    frames_by_suffix = {"prec.nc": [], "t2m.nc": []}

    # Select the relevant files up front so that the progress counter and its
    # total refer to the same set, and unrelated files are never opened.
    filenames = sorted(
        entry for entry in os.listdir(directory_as_str)
        if entry.endswith(tuple(frames_by_suffix))
        and os.path.isfile(os.path.join(directory_as_str, entry))
    )
    total_number_of_files = len(filenames)

    for counter, filename in enumerate(filenames, start=1):
        # info about hindcasts are part of the filename
        splitted_filename = filename.split("_")
        init_month = splitted_filename[-3]
        year = int(splitted_filename[-2])
        model_year = splitted_filename[0][-4:]

        # The context manager closes the file handle once the data has been
        # copied into a dataframe; otherwise one handle per file stays open.
        with xr.open_dataset(os.path.join(directory_as_str, filename), engine="netcdf4") as dataset:
            df = dataset.to_dataframe().reset_index()

        forecast_time = pd.to_datetime(df["time"])
        df = (df
              .assign(model=model_name_as_string, model_year=model_year, init_year=year,
                      init_month=init_month, forecasted_year=forecast_time.dt.year,
                      forecasted_month=forecast_time.dt.month)
              # averages every remaining column over the rows sharing these keys
              .groupby(index_columns)
              .mean())
        df = df.loc[:, [c for c in df.columns if c in climate_vars]]

        for suffix, frames in frames_by_suffix.items():
            if filename.endswith(suffix):
                frames.append(df)
        # to print progress of reading files
        print(str(counter) + "/" + str(total_number_of_files))

    missing = [suffix for suffix, frames in frames_by_suffix.items() if not frames]
    if missing:
        raise ValueError("no files ending in " + " or ".join(missing)
                         + " found in " + directory_as_str)

    print("concatenating temp and rain..")
    hindcasts_prec = pd.concat(frames_by_suffix["prec.nc"], axis=0, ignore_index=False)
    hindcasts_temp = pd.concat(frames_by_suffix["t2m.nc"], axis=0, ignore_index=False)
    # Both frames share the same index levels; combine_first aligns them so
    # each row carries the values of both variables where available.
    hindcasts = hindcasts_prec.combine_first(hindcasts_temp).reset_index()
    print("job finished")
    return hindcasts
    

In [17]:
def basic_preprocessing(dataframe, coordinates=None):
    """Perform basic preprocessing steps to raw hindcast data.

    Steps, in order: map init_month labels to month numbers, shift longitudes
    above 180 by -360, subtract 273.15 from t2m, keep only the row with the
    highest model_year per hindcast/grid cell, inner-join on (lat, lon) with
    the coordinate table, and keep rows whose init and forecasted month lie in
    4..11, whose forecasted year lies in 1993..2016 and equals the init year.
    Numeric values are rounded to 2 decimals.

    Params:
     - dataframe: hindcast as a dataframe (as returned by read_data); the
       input is copied and not modified
     - coordinates: optional dataframe indexed by (lat, lon) listing the grid
       cells to keep; defaults to the notebook-level hindcast_coordinates

    Returns:
     - hindcasts: dataframe with the columns model, init_year, init_month,
       forecasted_year, forecasted_month, lat, lon, prec, t2m

    Raises:
     - ValueError: if init_month holds a label missing from
       month_name_to_number (previously such labels passed through and failed
       later with an unrelated TypeError)
    """
    first_month, last_month = 4, 11
    first_year, last_year = 1993, 2016
    hindcast_keys = ["model", "init_year", "init_month", "forecasted_year", "forecasted_month", "lat", "lon"]

    if coordinates is None:
        coordinates = hindcast_coordinates

    dataframe = dataframe.copy()

    # init_month as integer instead of string; already-numeric input is kept as is
    init_month = dataframe["init_month"]
    if not pd.api.types.is_numeric_dtype(init_month):
        month_numbers = init_month.map(month_name_to_number)
        unknown = init_month[month_numbers.isna()].unique()
        if len(unknown) > 0:
            raise ValueError("unknown init_month label(s): " + ", ".join(map(str, unknown)))
        dataframe["init_month"] = month_numbers.astype(int)

    # Change to spatial encoding: longitudes above 180 are shifted by -360
    lon = dataframe["lon"]
    dataframe["lon"] = lon.mask(lon > 180, lon - 360)
    # Kelvin to Celsius
    dataframe["t2m"] = dataframe["t2m"] - 273.15

    # Only keep dataframe of most recent model version: model_year is the last
    # sort key, so keep="last" retains the highest model_year per hindcast.
    dataframe = (dataframe
                 .sort_values(by=hindcast_keys + ["model_year"])
                 .drop_duplicates(subset=hindcast_keys, keep="last")
                 .set_index(["lat", "lon"]))

    # Filter for the grid cells listed in the coordinate table
    dataframe = dataframe.merge(coordinates, left_index=True, right_index=True, how="inner")

    # Filter for crop calendar and reorder columns
    dataframe = dataframe.reset_index()[hindcast_keys + ["prec", "t2m"]]
    in_crop_calendar = (
        dataframe["init_month"].between(first_month, last_month)
        & dataframe["forecasted_month"].between(first_month, last_month)
        & dataframe["forecasted_year"].between(first_year, last_year)
        & (dataframe["forecasted_year"] == dataframe["init_year"])
    )
    dataframe = (dataframe
                 .loc[in_crop_calendar]
                 .reset_index(drop=True)
                 .round(2))

    return dataframe

In [18]:
# Load the hindcast files under data/PNU/ into one dataframe (progress is printed while reading).
df = read_data("PNU")

1/1488
2/1488
3/1488
4/1488
5/1488
6/1488
7/1488
8/1488
9/1488
10/1488
11/1488
12/1488
13/1488
14/1488
15/1488
16/1488


In [10]:
# Convert month labels, longitudes and temperature units, then filter the raw hindcasts.
df_preprocessed = basic_preprocessing(df)

In [12]:
# Display the preprocessed frame for a visual check.
df_preprocessed

Unnamed: 0,model,init_year,init_month,forecasted_year,forecasted_month,lat,lon,prec,t2m
0,KMA,1993,4,1993,4,-40.0,-62.5,1.71,14.96
1,KMA,1993,4,1993,5,-40.0,-62.5,0.99,11.20
2,KMA,1993,4,1993,6,-40.0,-62.5,0.92,8.71
3,KMA,1993,4,1993,7,-40.0,-62.5,0.78,8.42
4,KMA,1993,4,1993,8,-40.0,-62.5,1.30,9.12
...,...,...,...,...,...,...,...,...,...
11875,KMA,2016,9,2016,10,-27.5,-62.5,4.38,22.05
11876,KMA,2016,9,2016,11,-27.5,-62.5,4.77,24.15
11877,KMA,2016,10,2016,10,-27.5,-62.5,3.12,22.34
11878,KMA,2016,10,2016,11,-27.5,-62.5,5.11,25.37


In [14]:
# Persist the preprocessed hindcasts without the positional index.
# NOTE(review): the saved output of the display cell lists model "KMA" and the
# execution counts are out of order, while this writes to data/PNU/ — restart
# the kernel and run all cells before relying on this file.
df_preprocessed.to_csv("data/PNU/pnu.csv", index=False)