In [1]:
# Libraries
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import helperfunctions as hf
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2 
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold
from IPython.core.interactiveshell import InteractiveShell

# Notebook Settings 
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("error")
pd.set_option('display.max_columns', 500)
InteractiveShell.ast_node_interactivity = "all"
%load_ext autoreload
%autoreload 2

# Variables
# Inclusive ranges, spelled with half-open range() bounds.
crop_seasons = [*range(1993, 2017)]        # years 1993 .. 2016
months_of_crop_season = [*range(4, 12)]    # calendar months 4 .. 11
homogeneous_groups = [*range(1, 5)]        # group ids 1 .. 4

## Content
* [1. Read Data](#read_data)
* [2. Bias-Adjustment](#bias_adjustment)
* [3. Dataset Completion](#dataset_completion)
* [4. Feature Preparation](#feature_preparation)
* [5. Include Yield Data](#yield_data)
* [6. K-Fold Cross Validation](#cross_validation)
* [7. Visualization](#visualization)

## Abstract

The amount of wheat imported to Brazil depends on its national production. Wheat production failures and price increases can be anticipated before harvest with wheat yield forecast systems. The goal of this study was to analyze how seasonal climate models can be useful to forecast national trend-corrected wheat yield in Brazil. We used monthly aggregated temperature and precipitation data during the last three months of the wheat growing season from August to October to train a ridge regression model. The studied climate models were from the European Center for Medium-Range Weather Forecasts (ECMWF), the National Centers for Environmental Prediction (NCEP), and the UK-based Met Office (UKMO). We further tested a multi-model ensemble (MME) approach of the three climate models as well as a climatological approach, whose forecasts for each site and month are based on historical climate. For every season from 1993-2016, we initialized forecasts at the beginning of each month during the crop season from April to October. From August on, we supplemented climate features from past months with observations. Wheat yield anomaly forecasts improved towards harvest. For some models and months of forecast initialization, the climatological approach performed better than single seasonal climate models. However, the multi-model ensemble as well as the ECMWF-based wheat yield forecasts consistently outperformed climatology, and two months before harvest wheat yield could be forecasted with 9%, 9.1%, and 10.1% RMSE, respectively. Our results demonstrated that seasonal climate models and their aggregation to multi-model ensembles provide valuable information to anticipate possible wheat production shortcomings in Brazil.

## 1. Read Data <a name="read_data"></a>

Our approach requires three sources of climate data: seasonal climate models (hindcasts), observations, and climatology.
- **hindcasts**: There are three seasonal climate models that we requested data from: ECMWF, UKMO, NCEP. We also computed an unweighted average of the outputs of the three climate models to have a multi-model ensemble output (MME). We requested retrospective seasonal climate forecasts, called hindcasts, from 1993 to 2016 for four locations (zones) in Brazil. The locations were selected based on the findings from Nóia Júnior et al. ([2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3)). For each model, year, and location, we requested seven hindcasts, initialized at the beginning of each month during the wheat growing season from April to October and forecasting precipitation and temperature data until the end of the season.
- **observations**: We also need climate observations from the same four locations ([Nóia Júnior et al., 2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3)) from 1993 to 2016 along the wheat growing season from April to October. This data is used for bias-adjustment of the hindcasts but also for the wheat yield forecast model. When a forecast is provided in month *m*, climate features from past months are supplemented with climate observations, while future months are based on forecasted climate features. Additionally, we need climate observations to calculate expected, *normal*, climate conditions (climatology) to benchmark our approach against.
- **climatology**: For each location, month, climate variable, and year *y*, we compute the average from observations from the same location, month, climate variable, and all other years except year *y* from 1993 to 2016.

In [2]:
# Load the three climate inputs (all cover 1993-2016).
hindcasts = hf.read_raw_model_data()
observations = hf.read_observed_weather()
# Leave-one-out climatology; built from the observations as loaded, i.e. before
# the row selection below.
climatology = hf.create_climatology_data(observations)

# Keep the observed records (model "WS", init_month 11) for the study zones and
# seasons. The selection reuses the notebook-level constants instead of repeating
# their values, and names the column axis explicitly so that the tuple can only
# be interpreted as a row (MultiIndex) indexer.
observations = observations.loc[("WS", 11, homogeneous_groups, crop_seasons), :]

hindcasts.head(1)
observations.head(1)
climatology.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,tmean,tmax,tmin,rain
model,init_month,zone,year,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ECMWF,4,1,1993,8,1993-08-01,10.479694,16.949674,6.975128,2.893066


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,tmean,tmax,tmin,rain
model,init_month,zone,year,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WS,11,1,1993,8,1993-08-01,5.9,9.8,2.0,0.0


Unnamed: 0,zone,year,month,tmean,tmax,tmin,rain
0,1,1993,8,15.432539,20.681767,10.18331,156.791304


## 2. Bias-Adjustment <a name="bias_adjustment"></a>

Biases are systematic errors between forecasts and observations that come from inaccuracies in the model design and the sensitivity of climate models to initial conditions (see, e.g. [ECMWF-Wiki](https://confluence.ecmwf.int/display/CKB/Seasonal+forecasts+and+the+Copernicus+Climate+Change+Service)). We use [scaled (normal) distribution mapping](https://hess.copernicus.org/articles/21/2649/2017/) to adjust biases in forecasted daily mean, maximum, and minimum air temperature. We do not apply any bias adjustment to rain forecasts as it did not lead to improvements in mean absolute error. We adjust temperature values by *model*, *init_month*, *zone*, and *month* for each year *y* using observations and hindcasts from all other years (Leave-One-Out) to avoid overfitting.

In [3]:
# Bias-adjust the hindcasted temperatures against the observations.
# NOTE(review): the helper is not visible here; per the section text it works
# leave-one-out by year and leaves rain unadjusted - confirm in helperfunctions.
hindcasts_temp_adjusted = hf.adjust_mean_temperature_bias(observations, hindcasts)

## 3. Dataset Completion <a name="dataset_completion"></a>

We need monthly climate features for August, September, and October. Hindcasts that are initialized between April and July provide forecasts over the entire relevant period from August to October. Hindcasts that are initialized later, e.g. in September, need to be supplemented with climate observations for days in the relevant period that are in the past, e.g. August.

In [4]:
# Complete each hindcast with observed values for the part of Aug-Oct that lies
# before its initialization (as described in the section text; helper not shown here).
hindcast_complete = hf.fill_missing_dates_with_observations(observations, hindcasts_temp_adjusted) 

Validation that for each *model*, *init_month*, *zone*, and *year* we have the same number of observations: 31 days for Aug + 30 days for Sept + 31 days for Oct.

In [5]:
# Number of daily records per (model, init_month, zone, year) combination.
days_per_forecast = (hindcast_complete
                     .reset_index()
                     .groupby(["model", "init_month", "zone", "year"])
                     .size())

# Aug (31) + Sep (30) + Oct (31)
expected_days = 31 + 30 + 31
# Enforce the invariant instead of only displaying it, so an incomplete
# forecast period stops the run here rather than silently skewing the features.
assert (days_per_forecast == expected_days).all(), (
    f"expected {expected_days} daily records per forecast, "
    f"found group sizes {sorted(days_per_forecast.unique())}"
)
days_per_forecast.unique()

array([92], dtype=int64)

We concatenate the hindcasted daily values with the fully observed daily values.

In [6]:
# Stack completed hindcasts and pure observations, then order the rows by index.
daily_sources = [hindcast_complete, observations]
climate_records_complete = pd.concat(daily_sources).sort_index()

## 4. Feature Preparation <a name="feature_preparation"></a>

### 4.1 Feature Computation 

From the daily values we calculate monthly climate indices.

In [7]:
# Collapse the daily records into monthly climate indices; the preview below shows
# one row per (model, init_month, zone, year) with Tmean/Tmax/Tmin/Rain columns
# for Aug, Sep and Oct.
features_hindcasts_observations = hf.aggregate_data(climate_records_complete)
features_hindcasts_observations.head(2)

Unnamed: 0,model,init_month,zone,year,Tmean_Aug,Tmean_Oct,Tmean_Sep,Tmax_Aug,Tmax_Oct,Tmax_Sep,Tmin_Aug,Tmin_Oct,Tmin_Sep,Rain_Aug,Rain_Oct,Rain_Sep
0,ECMWF,4,1,1993,11.848188,15.432356,13.875417,18.380468,22.14906,20.741017,8.416167,11.92098,10.31628,165.319824,223.339844,177.917481
1,ECMWF,4,1,1994,11.832097,15.305648,13.435821,18.531746,22.15192,20.155681,8.522847,11.634992,9.9352,168.4375,221.113281,203.830566


We include climatology features.

In [8]:
features_climatology = hf.create_climatology_features(features_hindcasts_observations, climatology)

# Append the climatology rows, order by the identifying columns, and drop exact
# duplicate rows before renumbering the index.
id_columns = ["model", "init_month", "zone", "year"]
features_stacked = pd.concat([features_hindcasts_observations, features_climatology])
features_complete = features_stacked.sort_values(id_columns).drop_duplicates().reset_index(drop=True)

### 4.2 Remove Correlated Features

We will fit our model on observed monthly climate features from August to October. Features need to be uncorrelated, which is why we will drop columns that have a correlation coefficient higher than 0.9.

In [9]:
relevant_columns = ['Tmean_Aug', 'Tmean_Oct', 'Tmean_Sep', 'Tmax_Aug', 'Tmax_Oct', 'Tmax_Sep', 'Tmin_Aug', 'Tmin_Oct', 'Tmin_Sep', 'Rain_Aug', 'Rain_Oct', 'Rain_Sep']
# Drop a feature when its absolute correlation with an earlier-listed feature exceeds this value.
correlation_threshold = 0.9

# Correlations are measured on the observed records only (model == "WS").
values = features_complete.loc[(features_complete["model"] == "WS"), relevant_columns]
# Compare the unrounded coefficients: rounding to two decimals first would turn
# e.g. 0.904 into 0.90 and let it pass a "> 0.9" test, i.e. shift the threshold to ~0.905.
cor_matrix = values.corr().abs()
# Strict upper triangle: every feature pair once, without the diagonal.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > correlation_threshold).any()]
print(to_drop)

features_complete = features_complete.drop(columns=to_drop)

['Tmax_Aug', 'Tmax_Oct', 'Tmax_Sep', 'Tmin_Aug', 'Tmin_Oct', 'Tmin_Sep']


In [10]:
# Fixed seed so the preview rows are identical on every Restart & Run All.
features_complete.sample(5, random_state=0)

Unnamed: 0,model,init_month,zone,year,Tmean_Aug,Tmean_Oct,Tmean_Sep,Rain_Aug,Rain_Oct,Rain_Sep
1850,MME,7,2,1995,19.485332,22.277185,21.009623,42.82035,159.12597,88.610565
2518,NCEP,6,1,2015,13.48743,17.349505,15.295251,146.103668,194.333557,214.945923
682,CLIMATE,11,1,2003,13.309677,19.114516,16.705,57.5,237.1,64.0
2195,MME,10,4,2004,16.171546,17.540893,18.783163,16.7,154.267842,81.4
2408,NCEP,5,1,2001,14.245824,16.7335,15.753814,114.722015,161.222526,149.536979


It is not surprising that Tmax and Tmin features highly correlate with Tmean and are therefore dropped.

### 4.3 Unstack Features by Zone

Previously, in the approach of [Nóia Júnior et al., 2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3), separate models were trained for each location (agro-climatic homogeneous groups) and their estimates were extrapolated to the national level using harvested area estimates for each group. We now choose a different approach, where we directly estimate national wheat yield and the model can decide which location and climate feature it can assign more importance to. We simply need to unstack the *zone* column. The feature names will now hold an additional suffix *_n*, where *n* ranges from 1 to 4, indicating the location that the climate feature belongs to.

In [11]:
# Move the zone level into the columns: one row per (model, init_month, year).
features_complete_unstacked = (features_complete
                               .set_index(["zone", "model", "init_month", "year"])
                               .unstack("zone"))
# Flatten the (feature, zone) column pairs into "<feature>_<zone>" names.
features_complete_unstacked.columns = [f"{feature}_{zone}" for feature, zone in features_complete_unstacked.columns]
features_complete_unstacked = features_complete_unstacked.reset_index()
# Fixed seed so the preview rows are identical on every Restart & Run All.
features_complete_unstacked.sample(5, random_state=0)

Unnamed: 0,model,init_month,year,Tmean_Aug_1,Tmean_Aug_2,Tmean_Aug_3,Tmean_Aug_4,Tmean_Oct_1,Tmean_Oct_2,Tmean_Oct_3,Tmean_Oct_4,Tmean_Sep_1,Tmean_Sep_2,Tmean_Sep_3,Tmean_Sep_4,Rain_Aug_1,Rain_Aug_2,Rain_Aug_3,Rain_Aug_4,Rain_Oct_1,Rain_Oct_2,Rain_Oct_3,Rain_Oct_4,Rain_Sep_1,Rain_Sep_2,Rain_Sep_3,Rain_Sep_4
156,CLIMATE,10,2005,16.017742,20.809677,18.708065,17.596046,18.857013,23.556821,22.197439,19.201995,13.113333,20.008333,17.315,15.364715,135.4,37.3,35.5,100.1,236.726087,150.086957,177.56087,176.743478,152.7,123.8,146.1,261.6
376,ECMWF,11,2009,15.733871,19.304839,18.072581,16.624581,18.714516,22.880645,22.187097,18.703367,15.351667,22.415,20.671667,17.003372,268.8,90.6,88.1,99.6,133.9,364.1,334.4,171.3,489.7,237.9,197.5,269.1
924,UKMO,10,2005,16.017742,20.809677,18.708065,17.596046,18.431897,24.028139,22.991742,18.961076,13.113333,20.008333,17.315,15.364715,135.4,37.3,35.5,100.1,283.105397,197.166809,239.043831,256.706256,152.7,123.8,146.1,261.6
450,MME,6,2011,13.532881,18.157383,17.155087,13.941469,16.414636,21.527497,20.595006,16.822139,14.395448,19.573224,18.622655,14.916209,115.594608,69.290643,94.624727,88.225899,219.711371,162.460547,197.809613,196.128569,160.789391,119.26653,166.437265,158.051086
911,UKMO,9,2016,14.635484,20.280645,18.133871,14.746774,19.491008,24.16387,23.382287,19.407633,15.751139,21.629253,20.471574,17.095241,190.2,123.0,199.7,161.8,267.218387,166.763167,193.377145,164.835844,173.992098,111.976671,154.37993,172.605661


## 5. Include Yield Data <a name="yield_data"></a>

We will now read the national detrended wheat yield data to be merged with our feature dataset. The wheat yield data was obtained from the [Brazilian Institute of Geography and Statistics](https://sidra.ibge.gov.br/tabela/1612). For more information on the data, see the other notebook *prepare_wheat_data*.

In [12]:
# National detrended wheat yield (per the section text); merged on "year" in the next step.
yield_national = hf.read_national_wheat_yield()

Our final dataset:

In [13]:
# Attach the yield of the matching year to every feature row (left join keeps all
# feature rows), then discard rows containing any missing value and renumber.
features_with_yield = features_complete_unstacked.merge(yield_national, how="left", on="year")
kfold_cv_dataset = features_with_yield.dropna().reset_index(drop=True)

In [14]:
# Fixed seed so the preview rows are identical on every Restart & Run All.
kfold_cv_dataset.sample(3, random_state=0)

Unnamed: 0,model,init_month,year,Tmean_Aug_1,Tmean_Aug_2,Tmean_Aug_3,Tmean_Aug_4,Tmean_Oct_1,Tmean_Oct_2,Tmean_Oct_3,Tmean_Oct_4,Tmean_Sep_1,Tmean_Sep_2,Tmean_Sep_3,Tmean_Sep_4,Rain_Aug_1,Rain_Aug_2,Rain_Aug_3,Rain_Aug_4,Rain_Oct_1,Rain_Oct_2,Rain_Oct_3,Rain_Oct_4,Rain_Sep_1,Rain_Sep_2,Rain_Sep_3,Rain_Sep_4,yield
642,NCEP,6,2011,13.533481,17.180708,16.031083,13.889941,16.502232,21.157502,19.938416,17.123404,14.390471,18.848024,17.791263,14.876716,104.285882,63.484211,88.747475,90.329994,252.688614,160.153617,190.962006,240.562424,124.04934,142.731308,201.381919,210.774971,2914.522173
438,MME,6,1999,12.524866,18.026112,16.988745,13.592803,16.248713,21.761146,20.946351,16.77001,14.662951,20.833218,19.76349,15.694904,153.237974,68.262855,88.283095,104.068462,198.43538,166.676371,186.343173,177.241547,203.354622,99.142542,140.43465,159.879062,2881.805551
372,ECMWF,11,2005,16.017742,20.809677,18.708065,17.596046,18.898387,24.012903,22.332258,19.154852,13.113333,20.008333,17.315,15.364715,135.4,37.3,35.5,100.1,384.8,258.1,374.1,244.2,152.7,123.8,146.1,261.6,2531.334443


### Summary of our dataset

We are now finished with the preprocessing. Let's quickly summarize the data that we will train our model on. 
- There are 24 years, from 1993 to 2016
- For each year, we have 6 different model sources: ECMWF, NCEP, UKMO, MME, CLIMATE, WS (observations)
- WS has one data point per year, the other models have eight data points, one for each month of initialization from April to November
- This gives us 24 * (5 * 8 + 1) = 984 data points

In [15]:
# Expected rows: 24 years * (5 sources * 8 init months + 1 WS record) = 984.
# Columns as in the preview above: model, init_month, year, 24 climate features, yield.
kfold_cv_dataset.shape

(984, 28)

In [16]:
# Disabled on purpose: uncomment and run once to (re)write the CSV snapshot that
# the following cell reads back. NOTE(review): while this stays commented out, a
# fresh checkout has no snapshot unless the file is shipped with the notebook.
#kfold_cv_dataset.to_csv("kfold_cv_dataset.csv", index=False)

In [20]:
# Continue from the CSV snapshot of the preprocessed dataset. If the snapshot does
# not exist yet (fresh checkout), create it from the dataset built above so that
# Restart & Run All works without manual steps.
# NOTE(review): an existing snapshot takes precedence over the in-memory dataset;
# delete the file after changing the preprocessing to avoid training on stale data.
try:
    kfold_cv_dataset = pd.read_csv("kfold_cv_dataset.csv")
except FileNotFoundError:
    kfold_cv_dataset.to_csv("kfold_cv_dataset.csv", index=False)
    kfold_cv_dataset = pd.read_csv("kfold_cv_dataset.csv")

## 6. K-Fold Cross Validation <a name="cross_validation"></a>

In [21]:
models = ["ECMWF", "NCEP", "UKMO", "MME", "CLIMATE", "WS"]
# Score table: one row per climate source, one column per month of forecast
# initialization. Created as float (NaN = not yet computed) because the scores
# are fractional; writing floats into an int-typed frame is deprecated in pandas.
results = pd.DataFrame(np.nan, index=models, columns=months_of_crop_season)
for im in months_of_crop_season:
    for model in models:
        res = hf.kfold_cross_validation(kfold_cv_dataset, model=model, init=im, no_of_features=8)
        # RMSE as a percentage of the mean yield. The square root is taken
        # explicitly because the `squared=False` argument of mean_squared_error
        # is deprecated and removed in recent scikit-learn releases.
        rmse = np.sqrt(mse(res["yield"], res["predicted"]))
        metric = 100 * rmse / res["yield"].mean()
        # Alternative score (coefficient of determination):
        # metric = r2(res["yield"], res["predicted"])
        results.loc[model, im] = np.round(metric, 2)

In [19]:
# NOTE(review): the execution count of this cell (19) predates the CV cell (21),
# so the table below comes from an earlier run - presumably a different
# configuration; confirm or remove.
results # normalized RMSE (% of mean yield) with 8 features - not the coefficient of determination

Unnamed: 0,4,5,6,7,8,9,10,11
ECMWF,28.47,28.63,29.41,29.38,28.33,17.26,13.15,6.01
NCEP,27.62,27.45,24.13,25.42,20.32,12.95,9.5,6.01
UKMO,12.4,12.74,11.56,12.37,10.67,10.59,9.19,6.01
MME,21.53,21.55,20.7,21.11,17.84,11.88,8.2,6.01
CLIMATE,11.75,11.75,11.75,11.75,11.75,10.27,9.13,6.01
WS,6.01,6.01,6.01,6.01,6.01,6.01,6.01,6.01


In [22]:
results # normalized RMSE (% of mean yield) with 8 features - not the coefficient of determination

Unnamed: 0,4,5,6,7,8,9,10,11
ECMWF,11.56,11.9,11.23,11.44,10.58,9.49,7.88,6.01
NCEP,14.34,14.88,14.7,12.74,11.84,10.45,9.26,6.01
UKMO,12.28,12.71,10.79,12.13,11.07,10.61,7.64,6.01
MME,12.12,12.47,11.76,11.62,10.47,9.86,7.9,6.01
CLIMATE,11.75,11.75,11.75,11.75,11.75,10.27,9.13,6.01
WS,6.01,6.01,6.01,6.01,6.01,6.01,6.01,6.01
