In [40]:
# Libraries
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import helperfunctions as hf
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2 
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold
from IPython.core.interactiveshell import InteractiveShell

# Notebook Settings 
# Turn every warning into an exception, but keep FutureWarning silenced.
# Both calls insert their filter at the FRONT of the filter list and the first
# matching filter wins, so the FutureWarning exemption must be registered last;
# in the opposite order the blanket "error" filter would shadow it.
warnings.filterwarnings("error")
warnings.simplefilter(action='ignore', category=FutureWarning)
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Reload imported modules (e.g. helperfunctions) before each cell execution.
# Re-running this cell prints an "already loaded" notice, which is harmless.
%load_ext autoreload
%autoreload 2

# Variables
# Study years 1993-2016 (the upper bound of range() is exclusive).
crop_seasons = [*range(1993, 2017)]
# Month numbers 4-11, i.e. April to November.
months_of_crop_season = [*range(4, 12)]
# Identifiers of the four zones / homogeneous groups.
homogeneous_groups = [*range(1, 5)]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Content
* [1. Read Data](#read_data)
* [2. Bias-Adjustment](#bias_adjustment)
* [3. Dataset Completion](#dataset_completion)
* [4. Feature Preparation](#feature_preparation)
* [5. Include Yield Data](#yield_data)
* [6. K-Fold Cross Validation](#cross_validation)
* [7. Visualization](#visualization)

## 1. Read Data <a name="read_data"></a>

Our approach requires three sources of climate data: seasonal climate models (hindcasts), observations, and climatology.
- **hindcasts**: We requested data from three seasonal climate models: ECMWF, UKMO, and NCEP. We also computed an unweighted average of the outputs of the three climate models to obtain a multi-model ensemble output (MME). We requested retrospective seasonal climate forecasts, called hindcasts, from 1993 to 2016 for four locations (zones) in Brazil. The locations were selected based on the findings from Nóia Júnior et al. ([2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3)). For each model, year, and location, we requested seven hindcasts, initialized at the beginning of each month during the wheat growing season from April to October and forecasting precipitation and temperature data until the end of the season.
- **observations**: We also need climate observations from the same four locations ([Nóia Júnior et al., 2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3)) from 1993 to 2016 over the wheat growing season from April to October. This data is used for bias-adjustment of the hindcasts but also for the wheat yield forecast model. When a forecast is provided in month *m*, climate features for past months are taken from climate observations, while features for future months are based on forecasted climate. Additionally, we need climate observations to calculate expected, *normal*, climate conditions (climatology) against which we benchmark our approach. 
- **climatology**: For each location, month, climate variable, and year *y*, we compute the average from observations from the same location, month, climate variable, and all other years except year *y* from 1993 to 2016.

In [41]:
# Load the three climate inputs; all of them cover 1993-2016.
hindcasts = hf.read_raw_model_data()
observations = hf.read_observed_weather()
# Leave-One-Out climatology, derived from the observations as read (before the filter below).
climatology = hf.create_climatology_data(observations)

# Keep the "WS" / init_month 11 rows of the four zones and the study years,
# reusing the notebook-level constants instead of repeating their literal values.
observation_rows = ("WS", 11, homogeneous_groups, crop_seasons)
observations = observations.loc[observation_rows]

hindcasts.head(1)
observations.head(1)
climatology.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,tmean,tmax,tmin,rain
model,init_month,zone,year,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ECMWF,4,1,1993,4,1993-04-02,19.825406,26.905211,17.873243,5.589371


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,tmean,tmax,tmin,rain
model,init_month,zone,year,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WS,11,1,1993,4,1993-04-01,21.9,26.6,17.2,0.0


Unnamed: 0,zone,year,month,tmean,tmax,tmin,rain
0,1,1993,4,19.311304,24.675797,13.946812,147.091304


## 2. Bias-Adjustment <a name="bias_adjustment"></a>

Biases are systematic errors between forecasts and observations that come from inaccuracies in the model design and the sensitivity of climate models to initial conditions (see, e.g., [ECMWF-Wiki](https://confluence.ecmwf.int/display/CKB/Seasonal+forecasts+and+the+Copernicus+Climate+Change+Service)). We use [scaled (normal) distribution mapping](https://hess.copernicus.org/articles/21/2649/2017/) to adjust biases in forecasted daily mean, maximum, and minimum air temperature. We do not apply any bias adjustment to rain forecasts as it did not lead to improvements in mean absolute error. We adjust temperature values by *model*, *init_month*, *zone*, and *month* for each year *y* using observations and hindcasts from all other years (Leave-One-Out) to avoid overfitting.

In [42]:
# Bias-adjust the hindcasts against the observations (implemented in hf.adjust_temperature_bias).
hindcasts_temp_adjusted = hf.adjust_temperature_bias(observations, hindcasts)

## 3. Dataset Completion <a name="dataset_completion"></a>

We need monthly climate features for August, September, and October. Hindcasts that are initialized between April and July provide forecasts over the entire relevant period from August to October. Hindcasts that are initialized later, e.g. in September, need to be supplemented with climate observations for days in the relevant period that are in the past, e.g. August.

In [43]:
# Complete the bias-adjusted hindcasts with observations (implemented in hf.fill_missing_dates_with_observations).
hindcast_complete = hf.fill_missing_dates_with_observations(observations, hindcasts_temp_adjusted) 

Validation that for each *model*, *init_month*, *zone*, and *year* we have the same number of observations: \
 30 days for April + 31 days for May + 30 days for June + 31 days for July + 31 days for Aug + 30 days for Sept + 31 days for Oct = 214 days.

In [44]:
# Count the rows per (model, init_month, zone, year) and list the distinct counts;
# a single value means every combination has the same number of daily records.
hindcast_complete.reset_index().groupby(["model", "init_month", "zone", "year"]).size().unique()

array([214], dtype=int64)

We concatenate the hindcasted daily values with the fully observed daily values.

In [46]:
# Stack the completed hindcasts and the observations into one frame, ordered by index.
climate_records_complete = pd.concat([hindcast_complete, observations]).sort_index()

## 4. Feature Preparation <a name="feature_preparation"></a>

### 4.1 Feature Computation 

From the daily values we calculate monthly climate indices.

In [48]:
# Aggregate the daily records into monthly features (implemented in hf.aggregate_data).
features_hindcasts_observations = hf.aggregate_data(climate_records_complete)
features_hindcasts_observations.head(2)

Unnamed: 0,model,init_month,zone,year,Tmean_Apr,Tmean_Aug,Tmean_July,Tmean_June,Tmean_May,Tmean_Oct,Tmean_Sep,Tmax_Apr,Tmax_Aug,Tmax_July,Tmax_June,Tmax_May,Tmax_Oct,Tmax_Sep,Tmin_Apr,Tmin_Aug,Tmin_July,Tmin_June,Tmin_May,Tmin_Oct,Tmin_Sep,Rain_Apr,Rain_Aug,Rain_July,Rain_June,Rain_May,Rain_Oct,Rain_Sep
0,ECMWF,4,1,1993,19.539987,15.056569,13.517173,14.044408,14.886756,19.00233,16.206275,24.910731,20.066604,18.15811,18.437658,19.549011,24.250512,21.442299,14.301537,9.77741,8.673177,9.58073,10.012901,13.857031,11.072509,117.513733,165.319824,148.970947,149.98169,187.964478,223.339844,177.917481
1,ECMWF,4,1,1994,19.093194,15.01986,13.57573,14.539903,15.584468,18.899465,15.657911,24.983884,20.21171,18.36135,19.008067,20.476021,24.296008,20.71276,13.501505,9.849027,8.712816,10.158976,10.801739,13.576093,10.618096,118.288116,168.4375,140.563965,135.443115,143.670654,221.113281,203.830566


We include climatology features.

In [51]:
# Build the climatology feature rows (implemented in hf.create_climatology_features) and
# append them to the hindcast/observation features; exact duplicate rows are removed.
features_climatology = hf.create_climatology_features(features_hindcasts_observations, climatology)
features_complete = (pd
                     .concat([features_hindcasts_observations, features_climatology])
                     .sort_values(["model", "init_month", "zone", "year"])
                     .drop_duplicates()
                     .reset_index(drop=True))

### 4.2 Remove Correlated Features

We will fit our model on observed monthly climate features from August to October. Features need to be uncorrelated, which is why we will drop columns that have a correlation coefficient higher than 0.9.

In [55]:
# Candidate climate features; the order matters because, of two highly
# correlated columns, the one listed later is the one that gets dropped.
relevant_columns = [
    'Tmean_Apr', 'Tmean_Aug', 'Tmean_July', 'Tmean_June', 'Tmean_May', 'Tmean_Oct', 'Tmean_Sep',
    'Tmax_Apr', 'Tmax_Aug', 'Tmax_July', 'Tmax_June', 'Tmax_May', 'Tmax_Oct', 'Tmax_Sep',
    'Tmin_Apr', 'Tmin_Aug', 'Tmin_July', 'Tmin_June', 'Tmin_May', 'Tmin_Oct', 'Tmin_Sep',
    'Rain_Apr', 'Rain_Aug', 'Rain_July', 'Rain_June', 'Rain_May', 'Rain_Oct', 'Rain_Sep',
]
# Correlations are measured on the observed ("WS") rows only.
is_observed = features_complete["model"] == "WS"
values = features_complete.loc[is_observed, relevant_columns]
cor_matrix = values.corr().abs().round(2)
# Keep only the strict upper triangle so that every pair is inspected once.
strict_upper = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
upper_tri = cor_matrix.where(strict_upper)
# A column is dropped when its (rounded) absolute correlation with an earlier column exceeds 0.9.
to_drop = [name for name in upper_tri.columns if (upper_tri[name] > 0.9).any()]
print(to_drop)

features_complete = features_complete.drop(columns=to_drop)

['Tmax_Apr', 'Tmax_Aug', 'Tmax_July', 'Tmax_June', 'Tmax_May', 'Tmax_Oct', 'Tmax_Sep', 'Tmin_Apr', 'Tmin_Aug', 'Tmin_July', 'Tmin_June', 'Tmin_May', 'Tmin_Oct', 'Tmin_Sep']


In [56]:
# Show five random rows (no seed is set, so the sample differs between runs).
features_complete.sample(5)

Unnamed: 0,model,init_month,zone,year,Tmean_Apr,Tmean_Aug,Tmean_July,Tmean_June,Tmean_May,Tmean_Oct,Tmean_Sep,Rain_Apr,Rain_Aug,Rain_July,Rain_June,Rain_May,Rain_Oct,Rain_Sep
369,CLIMATE,7,4,2002,22.13,16.224957,14.917606,18.096667,18.185484,19.160744,17.294139,38.6,82.134783,125.604348,26.5,157.2,181.347826,153.530435
2461,NCEP,5,3,2006,20.345,18.354846,15.809371,16.749492,17.172248,20.974523,21.623161,119.0,72.586231,152.803388,46.277283,76.92082,239.181671,93.642739
2085,MME,9,3,2014,21.960338,19.314516,16.83871,18.538937,18.736769,22.168919,20.648489,146.0,30.0,110.5,325.5,205.4,182.345416,113.49707
1754,MME,6,2,1995,21.58,20.256502,18.00821,18.556858,19.458065,24.223596,22.51637,33.8,50.067425,47.261363,44.507229,53.0,142.414998,79.554317
1112,ECMWF,7,3,2001,22.820497,18.569947,16.793739,17.574634,18.24856,22.258467,19.438857,67.2,60.085449,97.486877,104.8,102.1,182.01416,117.260742


It is not surprising that Tmax and Tmin features highly correlate with Tmean and are therefore dropped.

### 4.3 Unstack Features by Zone

Previously, in the approach of [Nóia Júnior et al., 2021](https://iopscience.iop.org/article/10.1088/1748-9326/ac26f3), separate models were trained for each location (agro-climatic homogeneous group) and their estimates were extrapolated to national level using harvested area estimates for each group. We now choose a different approach, where we directly estimate national wheat yield and the model can decide which locations and climate features to assign more importance to. We simply need to unstack the *zone* column. The feature names will now hold an additional suffix *_n*, where *n* ranges from 1 to 4, indicating the location that the climate feature belongs to.

In [57]:
# Move the zone level into the columns: one row per (model, init_month, year).
zone_wide = (features_complete
             .set_index(["zone", "model", "init_month", "year"])
             .unstack("zone"))
# Flatten the (feature, zone) column pairs into names such as "Tmean_Aug_1".
zone_wide.columns = [f"{feature}_{zone}" for feature, zone in zone_wide.columns]
features_complete_unstacked = zone_wide.reset_index()
features_complete_unstacked.sample(5)

Unnamed: 0,model,init_month,year,Tmean_Apr_1,Tmean_Apr_2,Tmean_Apr_3,Tmean_Apr_4,Tmean_Aug_1,Tmean_Aug_2,Tmean_Aug_3,Tmean_Aug_4,Tmean_July_1,Tmean_July_2,Tmean_July_3,Tmean_July_4,Tmean_June_1,Tmean_June_2,Tmean_June_3,Tmean_June_4,Tmean_May_1,Tmean_May_2,Tmean_May_3,Tmean_May_4,Tmean_Oct_1,Tmean_Oct_2,Tmean_Oct_3,Tmean_Oct_4,Tmean_Sep_1,Tmean_Sep_2,Tmean_Sep_3,Tmean_Sep_4,Rain_Apr_1,Rain_Apr_2,Rain_Apr_3,Rain_Apr_4,Rain_Aug_1,Rain_Aug_2,Rain_Aug_3,Rain_Aug_4,Rain_July_1,Rain_July_2,Rain_July_3,Rain_July_4,Rain_June_1,Rain_June_2,Rain_June_3,Rain_June_4,Rain_May_1,Rain_May_2,Rain_May_3,Rain_May_4,Rain_Oct_1,Rain_Oct_2,Rain_Oct_3,Rain_Oct_4,Rain_Sep_1,Rain_Sep_2,Rain_Sep_3,Rain_Sep_4
747,NCEP,11,1996,19.945,22.936667,21.943333,19.903333,16.353226,20.435484,19.124194,15.88871,11.253226,16.993548,15.5,12.575806,12.101667,17.555,16.008333,14.091667,16.267742,19.725806,18.769355,16.354839,18.180645,22.609677,20.622581,18.68871,14.921667,20.42151,18.546667,15.906667,75.9,59.6,45.1,88.3,213.9,26.4,45.5,80.3,126.1,11.2,6.7,77.7,140.6,8.0,45.9,148.8,73.9,31.2,39.2,2.0,157.8,183.4,263.6,195.0,119.3,178.1,161.5,182.0
166,CLIMATE,10,2015,19.488333,23.673333,21.895,19.4,18.646774,21.651613,21.482258,17.391935,14.022581,18.658065,16.853226,16.037097,14.19,19.276667,17.748333,14.595,16.214516,19.964516,18.68871,16.81129,18.852875,23.526526,22.126541,19.158798,17.223333,23.32,22.708333,20.775,147.7,70.2,36.9,74.0,55.0,35.0,42.4,34.6,322.5,346.0,477.1,273.4,172.8,8.4,57.0,72.6,166.3,142.2,194.6,145.0,241.978261,149.83913,184.291304,179.682609,200.4,269.0,267.9,131.2
854,UKMO,7,2007,20.336667,24.62,22.626667,20.447275,16.085827,21.000179,19.471978,16.605681,13.106454,18.860596,17.537453,14.947101,14.695,19.68,17.646667,15.659531,13.379032,19.491935,17.501613,15.257533,19.811788,24.436145,23.108832,19.895415,15.891869,22.590291,21.010552,17.762562,254.5,42.0,158.9,71.0,177.663444,77.59199,106.999134,95.234206,101.484923,53.588212,85.680055,96.296707,68.2,9.4,0.9,4.3,295.3,92.0,106.1,159.4,263.487346,157.742935,185.559544,184.900871,190.592574,83.666891,118.914281,133.65146
869,UKMO,8,1998,17.965,21.996667,20.511667,19.246667,15.156552,19.698869,17.797764,16.260221,14.766129,18.5,17.882258,14.853226,13.413333,16.953333,16.14,13.566667,15.203226,18.854839,17.408065,15.896774,18.993637,23.231449,22.040799,18.651553,15.370661,21.521104,19.757225,16.922044,342.2,243.6,441.1,281.2,233.137914,189.757839,263.08748,247.77129,191.0,57.4,28.1,136.9,82.7,27.4,114.7,100.4,201.0,106.2,91.0,70.2,239.024506,187.500776,198.662475,169.036991,262.903435,158.122885,211.78489,246.739918
580,NCEP,4,1997,19.073799,21.783881,20.328922,18.277834,15.14441,19.702081,18.222146,16.034844,11.049748,16.63847,14.959178,13.319182,14.391661,18.347295,17.239099,15.209194,15.295428,19.209404,17.880441,16.247557,19.316258,24.261141,22.951434,19.917873,17.115291,22.026886,20.81078,18.191452,65.119897,68.753761,52.886655,48.743834,154.935989,153.746819,174.58876,192.408355,95.188057,173.422406,182.199935,197.529671,191.454614,155.123185,193.282795,182.146079,156.292139,115.193306,126.667479,100.281781,267.042541,117.477661,145.992829,174.933746,206.893768,63.619102,89.277252,98.034439


## 5. Include Yield Data <a name="yield_data"></a>

We will now read the national detrended wheat yield data to be merged with our feature dataset. The wheat yield data was obtained from the [Brazilian Institute of Geography and Statistics](https://sidra.ibge.gov.br/tabela/1612). For more information on the data, see the other notebook *prepare_wheat_data*.

In [58]:
# National wheat yield by year (implemented in hf.read_national_wheat_yield).
yield_national = hf.read_national_wheat_yield()

Our final dataset:

In [61]:
# Drop every column whose name mentions April, May, June or July,
# which leaves the identifier columns and the remaining monthly features.
months_outside_window = ("July", "June", "May", "Apr")
columns_in_window = [column for column in features_complete_unstacked.columns
                     if not any(month in column for month in months_outside_window)]
features_complete_unstacked = features_complete_unstacked.loc[:, columns_in_window]

In [62]:
# Attach the national yield of the matching year to every feature row,
# then discard rows that contain any missing value and renumber the index.
kfold_cv_dataset = (features_complete_unstacked
                    .merge(yield_national, how="left", on="year")
                    .dropna()
                    .reset_index(drop=True))

In [63]:
# Show three random rows (no seed is set, so the sample differs between runs).
kfold_cv_dataset.sample(3)

Unnamed: 0,model,init_month,year,Tmean_Aug_1,Tmean_Aug_2,Tmean_Aug_3,Tmean_Aug_4,Tmean_Oct_1,Tmean_Oct_2,Tmean_Oct_3,Tmean_Oct_4,Tmean_Sep_1,Tmean_Sep_2,Tmean_Sep_3,Tmean_Sep_4,Rain_Aug_1,Rain_Aug_2,Rain_Aug_3,Rain_Aug_4,Rain_Oct_1,Rain_Oct_2,Rain_Oct_3,Rain_Oct_4,Rain_Sep_1,Rain_Sep_2,Rain_Sep_3,Rain_Sep_4,yield
753,NCEP,11,2002,15.872581,22.148387,19.769355,17.281974,19.446774,26.209677,24.020968,20.10364,14.541667,21.288333,18.63,15.897312,233.8,85.2,104.0,99.3,372.3,28.5,150.9,138.3,253.6,80.6,151.6,176.8,2194.415589
410,MME,5,1995,15.003472,20.067544,18.310682,15.751153,19.17606,24.421094,22.953966,19.603388,15.722839,22.01874,20.322672,17.194451,120.404055,61.254054,83.598636,96.678027,217.393551,130.337361,163.061159,171.711539,192.093574,66.328859,92.829288,104.612011,2641.626503
151,CLIMATE,10,2000,14.751613,19.63745,18.540323,15.459677,18.822581,23.535197,22.14807,19.123314,15.713333,20.686673,19.613333,16.336667,83.8,93.3,208.9,110.4,238.704348,157.604348,187.843478,174.921739,169.0,171.0,252.6,249.9,2340.684178


### Summary of our dataset

We are now finished with the preprocessing. Let's quickly summarize the data that we will train our model on. 
- There are 24 years, from 1993 to 2016
- For each year, we have 6 different model sources: ECMWF, NCEP, UKMO, MME, CLIMATE, WS (observations)
- WS has one data point per year, the other models have eight data points, one for each month of initialization from April to November
- This gives us 24 * (5 * 8 + 1) = 984 data points

In [64]:
# (rows, columns) of the final dataset.
kfold_cv_dataset.shape

(984, 28)

In [23]:
# Disabled export: uncomment to (re)write the CSV cache that the next cell reads back.
#kfold_cv_dataset.to_csv("kfold_cv_dataset.csv", index=False)

In [24]:
# NOTE(review): replaces the in-memory dataset with the CSV cache. This needs kfold_cv_dataset.csv
# to exist already (the export in the previous cell is commented out) and the file may be stale
# relative to the preprocessing above - confirm that this reload is intended.
kfold_cv_dataset = pd.read_csv("kfold_cv_dataset.csv")

## 6. K-Fold Cross Validation <a name="cross_validation"></a>

### 6.1 Completely modified Version

In [65]:
models = ["ECMWF", "NCEP", "UKMO", "MME", "CLIMATE", "WS"]
# One score per forecast model (rows) and init month (columns). The frame is float-initialised
# so that storing the scores does not have to change the dtype of integer columns.
results = pd.DataFrame(0.0, index=models, columns=months_of_crop_season)
for im in months_of_crop_season:
    for model in models:
        res = hf.kfold_cross_validation(kfold_cv_dataset, model=model, init=im, no_of_features=8)
        # RMSE as a percentage of the mean yield. The root is taken explicitly because the
        # `squared=False` argument of mean_squared_error is deprecated in newer scikit-learn.
        rmse = np.sqrt(mse(res["yield"], res["predicted"]))
        metric = 100 * rmse / res["yield"].mean()
        # alternative metric: coefficient of determination
        #metric = r2(res["yield"], res["predicted"])
        results.loc[model, im] = np.round(metric, 2)

In [6]:
results # relative RMSE (% of mean yield) with 8 features

Unnamed: 0,4,5,6,7,8,9,10,11
ECMWF,11.56,11.9,11.23,11.44,10.58,9.49,7.88,6.01
NCEP,14.34,14.88,14.7,12.74,11.84,10.45,9.26,6.01
UKMO,12.28,12.71,10.79,12.13,11.07,10.61,7.64,6.01
MME,12.12,12.47,11.76,11.62,10.47,9.86,7.9,6.01
CLIMATE,11.75,11.75,11.75,11.75,11.75,10.27,9.13,6.01
WS,6.01,6.01,6.01,6.01,6.01,6.01,6.01,6.01


### 6.2 National yield extrapolation

In [81]:
# Yield per zone and year, joined onto the per-zone climate features.
yield_by_group = pd.read_csv("Data/Wheat/yield_by_group_detrended.csv")
train_dataset = features_complete.merge(yield_by_group, on=["zone", "year"], how="left")
# Variant without the columns whose name mentions April, May, June or July.
early_months = ("July", "June", "May", "Apr")
window_columns = [column for column in train_dataset.columns
                  if not any(month in column for month in early_months)]
train_dataset_modified_window = train_dataset.loc[:, window_columns]

In [86]:
def national_yield_extrapolated(data, national_yield, model="ECMWF", init=8, no_of_features=6, random_state=0):
    """
    Returns national yield forecasts obtained by leave-one-year-out cross-validation (LOO-CV).

    For every crop season and every zone, a Ridge pipeline is fitted on the "WS" rows of all
    other seasons and then predicts the held-out season from the feature row of `model`.
    The zone forecasts are added up to a national forecast, weighted by the zone's share.
        Params:
            data, dataframe: all features and targets by group and year for all models
            national_yield, dataframe: national trend-corrected yield from 1993-2016 (needs a "year" column)
            model, string: model that is evaluated
            init, int: init_month that is evaluated
            no_of_features, int: number of features kept by SelectKBest
            random_state, int: seed of the jitter that is added to the training features for model "CLIMATE"
        Returns:
            result, dataframe: national yield forecasts by year (column "predicted") next to the
                               columns of national_yield
        Raises:
            ValueError: if no feature row exists for a (model, init, zone, season) combination
    """
    # NOTE(review): the shares add up to 1.01, presumably a rounding artefact - confirm the source figures.
    contributions_to_national_yield = {1:0.37, 2:0.23, 3:0.23, 4:0.18}
    # Rows used for training are labelled "WS"; the filter below keeps them via init_month 11.
    observed_model = "WS"
    observed_init = 11
    # Local generator so that repeated calls give identical results.
    rng = np.random.default_rng(random_state)

    # Filter by model and init_month but also include observations that are used for model training
    cv_dataset = (data.loc[(data["model"].isin([model, observed_model]))
                           & (data["init_month"].isin([init, observed_init]))])
    # Dataframe where interim results are saved
    national_forecasts_by_year = (pd.DataFrame(data={"year":crop_seasons, "predicted":np.zeros(len(crop_seasons))})
                                  .merge(national_yield, on="year", how="left"))
    # Features
    relevant_columns = [c for c in cv_dataset.columns if c not in ["model", "init_month", "zone", "year", "yield"]]

    # Row masks that do not depend on season or zone are computed once.
    is_observed = cv_dataset["model"] == observed_model
    is_evaluated = cv_dataset["model"] == model
    if model != observed_model:
        # cv_dataset also contains the init_month 11 rows of `model`; select the requested
        # init month explicitly instead of relying on it being the first row.
        is_evaluated = is_evaluated & (cv_dataset["init_month"] == init)

    for season in crop_seasons:
        is_season = cv_dataset["year"] == season
        for group, share in contributions_to_national_yield.items():
            is_group = cv_dataset["zone"] == group

            # Train on the observed rows of this zone from all other seasons
            train_rows = is_observed & is_group & ~is_season
            X_train = cv_dataset.loc[train_rows, relevant_columns]
            y_train = cv_dataset.loc[train_rows, "yield"]
            # To overcome variance threshold (new frame instead of in-place change of a slice).
            # NOTE(review): the training rows are always "WS" rows, so this jitter may no longer be needed.
            if model == "CLIMATE":
                X_train = X_train + rng.normal(0, 1e-6, X_train.shape)

            pipeline = Pipeline([('scaler', StandardScaler()),
                                 ('var', VarianceThreshold()),
                                 ('selector', SelectKBest(f_regression, k=no_of_features)),
                                 ('estimator', Ridge())])
            reg = pipeline.fit(X_train, y_train)

            # Predict the held-out season from the evaluated model's features
            X_val = cv_dataset.loc[is_evaluated & is_group & is_season, relevant_columns].reset_index(drop=True)
            if X_val.empty:
                raise ValueError(f"No features found for model={model!r}, init={init}, zone={group}, year={season}.")
            y_predicted = reg.predict(X_val)[0]

            # each forecast is weighted by the group's relative contribution to national harvested area
            national_forecasts_by_year.loc[national_forecasts_by_year["year"] == season, "predicted"] += y_predicted * share
    return national_forecasts_by_year

In [87]:
models = ["ECMWF", "NCEP", "UKMO", "MME", "CLIMATE", "WS"]
# One score per forecast model (rows) and init month (columns). The frame is float-initialised
# so that storing the scores does not have to change the dtype of integer columns.
results = pd.DataFrame(0.0, index=models, columns=months_of_crop_season)
for im in months_of_crop_season:
    for model in models:
        res = national_yield_extrapolated(train_dataset_modified_window, yield_national, model=model, init=im)
        # RMSE as a percentage of the mean yield. The root is taken explicitly because the
        # `squared=False` argument of mean_squared_error is deprecated in newer scikit-learn.
        rmse = np.sqrt(mse(res["yield"], res["predicted"]))
        metric = 100 * rmse / res["yield"].mean()
        # alternative metric: coefficient of determination
        #metric = r2(res["yield"], res["predicted"])
        results.loc[model, im] = np.round(metric, 2)

In [88]:
# Relative RMSE (% of mean yield) per forecast model (rows) and init month (columns).
results

Unnamed: 0,4,5,6,7,8,9,10,11
ECMWF,11.67,11.89,11.21,11.49,11.15,11.18,9.5,8.83
NCEP,13.75,12.66,14.09,12.55,12.5,11.74,10.06,8.83
UKMO,11.74,12.49,11.16,12.23,11.06,11.0,9.35,8.83
MME,12.04,12.03,11.82,11.83,11.14,11.07,9.49,8.83
CLIMATE,12.09,12.09,12.09,12.09,12.09,11.46,10.25,8.83
WS,8.83,8.83,8.83,8.83,8.83,8.83,8.83,8.83


### 6.3 Time window expanded

In [90]:
models = ["ECMWF", "NCEP", "UKMO", "MME", "CLIMATE", "WS"]
# One score per forecast model (rows) and init month (columns). The frame is float-initialised
# so that storing the scores does not have to change the dtype of integer columns.
results = pd.DataFrame(0.0, index=models, columns=months_of_crop_season)
for im in months_of_crop_season:
    for model in models:
        # Full feature set (all months) instead of the reduced window
        res = national_yield_extrapolated(train_dataset, yield_national, model=model, init=im, no_of_features=8)
        # RMSE as a percentage of the mean yield. The root is taken explicitly because the
        # `squared=False` argument of mean_squared_error is deprecated in newer scikit-learn.
        rmse = np.sqrt(mse(res["yield"], res["predicted"]))
        metric = 100 * rmse / res["yield"].mean()
        # alternative metric: coefficient of determination
        #metric = r2(res["yield"], res["predicted"])
        results.loc[model, im] = np.round(metric, 2)

In [91]:
# Relative RMSE (% of mean yield) per forecast model (rows) and init month (columns).
results

Unnamed: 0,4,5,6,7,8,9,10,11
ECMWF,12.03,12.52,11.83,12.18,12.5,12.87,11.62,10.94
NCEP,12.73,13.11,13.64,12.43,13.79,13.02,12.36,10.94
UKMO,11.77,12.98,12.26,12.32,12.1,12.54,11.58,10.94
MME,11.74,12.53,12.3,12.09,12.45,12.65,11.78,10.94
CLIMATE,12.09,12.52,12.68,12.51,12.6,12.78,11.83,10.94
WS,10.94,10.94,10.94,10.94,10.94,10.94,10.94,10.94


### 6.4 Original climate info