### Package imports

In [1]:
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# !pip install rpy2
import rpy2
from rpy2.robjects import pandas2ri
from rpy2 import robjects
from rpy2.robjects.packages import importr
pandas2ri.activate()

from polire import CustomInterpolator
from polire.preprocessing import SpatialFeatures

import warnings
warnings.filterwarnings('ignore')

## Tmax, Tmin, Tmean, SLP, Precipitation

In [2]:
def run_exp(i, n_folds=5):
    """Run the cross-validated random-forest experiment for one variable.

    The settings for the experiment are looked up at position ``i`` in the
    module-level ``var_list``, ``fset_list`` and ``param_grid``. The station
    data for that variable is read from CSV, split into folds with the R
    package CAST (grouped on the ``staid`` column), and for every fold a
    ``RandomForestRegressor`` wrapped in ``CustomInterpolator`` is trained on
    ``SpatialFeatures``-transformed predictors. The held-out rows of all
    folds, with an added ``pred`` column, are concatenated and written to
    ``../../results/experiment-2/ogimet_<var>.csv``.

    Parameters
    ----------
    i : int
        Index of the experiment in ``var_list`` / ``fset_list`` and in every
        list stored in ``param_grid``.
    n_folds : int, default 5
        Number of folds requested from ``CAST.CreateSpacetimeFolds`` and
        iterated over. The default reproduces the original 5-fold setup.

    Returns
    -------
    None
        Results are written to disk; the variable name and the elapsed time
        in minutes are printed.
    """
    var = var_list[i]
    fset = fset_list[i]
    mtry = param_grid['mtry'][i]
    min_node = param_grid['min.node.size'][i]
    fraction = param_grid['sample.fraction'][i]
    n_obs = param_grid['n.obs'][i]
    exponent = param_grid['p'][i]

    print(var)
    init = time()

    # Load data; 'date' is turned from a 'YYYY-MM-DD'-style string into an
    # integer by dropping the dashes so it can be used as a numeric feature.
    data = pd.read_csv('../../data/serbia/ogimet_serbia08_'+var+'.csv')
    data['date'] = data['date'].apply(lambda x: int(x.replace('-','')))

    # Create folds with CAST (R), keeping rows that share a 'staid' together.
    CAST = importr('CAST')
    rdata = pandas2ri.PandasDataFrame(data)
    out = CAST.CreateSpacetimeFolds(rdata, spacevar='staid', k=n_folds, seed=0)

    # Execute the experiment once per fold. A dedicated loop variable is used
    # so the experiment index ``i`` is not overwritten inside the function.
    fold_results = []
    for fold in range(n_folds):
        # out[0] holds the training rows and out[1] the held-out rows of each
        # fold; the -1 shifts them to 0-based positions for ``iloc``
        # (presumably CAST returns 1-based R row indices -- confirm).
        trn_idx = out[0][fold] - 1
        tst_idx = out[1][fold] - 1
        trn_df = data.iloc[trn_idx]
        tst_df = data.iloc[tst_idx]
        trn_X = trn_df[fset]
        trn_y = trn_df[var]

        # Transform features; the transformer is fitted on training rows only
        # and later re-applied to the held-out rows.
        spat = SpatialFeatures(n_closest=n_obs, idw=True, idw_exponent=exponent,
                               coordinate_type='Geographic', resolution='standard')
        trn_F = spat.fit_transform(trn_X.values, trn_y.values)

        # Training
        base_model = RandomForestRegressor(n_estimators=250, min_samples_leaf=min_node,
                                           max_features=mtry, max_samples=fraction,
                                           random_state=0, n_jobs=-1)
        model = CustomInterpolator(base_model)
        model = model.fit(trn_F, trn_y.values)

        # Testing
        tst_X = tst_df[fset]
        tst_F = spat.transform(tst_X.values)
        y_pred = model.predict(tst_F)

        # ``assign`` returns a new frame, so the predictions are never written
        # into a slice of ``data`` (avoids the SettingWithCopy pattern).
        fold_results.append(tst_df.assign(pred=y_pred))

    pd.concat(fold_results).to_csv('../../results/experiment-2/ogimet_'+var+'.csv')
    print('Job done in', (time()-init)/60, 'minutes')

In [3]:
# Target variables, in the order in which the experiments are run.
var_list = ['tmax', 'tmin', 'tmean', 'slp']

# Predictor columns per variable (same order as var_list): every variable uses
# the five shared columns, and the first three additionally use 'twi' and 'gtt'.
fset_list = [
    ['lon', 'lat', 'date', 'doy', 'dem'] + extra_cols
    for extra_cols in (['twi', 'gtt'], ['twi', 'gtt'], ['twi', 'gtt'], [])
]

# Per-variable settings (same order as var_list); run_exp reads entry i of
# every list for experiment i.
param_grid = {
    # RandomForestRegressor: max_features
    'mtry': [7, 4, 7, 6],
    # RandomForestRegressor: min_samples_leaf
    'min.node.size': [15, 11, 14, 11],
    # RandomForestRegressor: max_samples
    'sample.fraction': [0.98, 0.93, 0.9999, 0.91],
    # SpatialFeatures: n_closest
    'n.obs': [10, 9, 9, 9],
    # SpatialFeatures: idw_exponent
    'p': [2.9, 2.2, 3, 3.5],
}

## Run the experiment for all variables

In [None]:
# Run every configured experiment in order; each call prints the variable
# name and its elapsed time and writes its own results file.
n_experiments = len(var_list)
for exp_idx in range(n_experiments):
    run_exp(exp_idx)

tmax
Job done in 5.938937914371491 minutes
tmin
Job done in 5.290335786342621 minutes
tmean
