#### Find kernel with highest accuracy within Gaussian Process Regressor
#### Cross-validated randomised search of hyperparameters
#### For 20 random test grid cells, each with 500 iterations

In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import (train_test_split,
                                     cross_val_score,
                                     RandomizedSearchCV,
                                     LeaveOneOut)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (DotProduct,
                                              ConstantKernel,
                                              WhiteKernel,
                                              Matern,
                                              ExpSineSquared,
                                              RationalQuadratic,
                                              RBF)

In [2]:
# Load the pickled sample of grid-cell data.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
gridcell_pickle_path = '/nfs/see-fs-02_users/earlacoa/emulator/dfs_gridcell_sample.pickle'
with open(gridcell_pickle_path, 'rb') as pickle_file:
    df_gridcells = pickle.load(pickle_file)

# Preview the first rows as the cell output.
df_gridcells.head()

Unnamed: 0,lat,lon,RES,IND,TRA,AGR,POW,PM2_5_DRY
0,30.5,114.25,0.23229,0.20507,0.29904,0.97374,0.55184,82.050903
1,30.5,114.25,1.1212,1.1098,0.8701,1.2222,1.252,260.18296
2,30.5,114.25,0.79843,1.2946,0.090709,0.75401,0.1942,209.802636
3,30.5,114.25,0.34445,1.4224,0.75112,1.2409,1.2113,164.002287
4,30.5,114.25,0.62562,1.3589,1.0583,0.04688,0.64196,196.083809


In [3]:
# Candidate kernels for the randomised hyperparameter search.
# Every scalar kernel hyperparameter is swept over the same grid (0.1 up to,
# but excluding, 5 in steps of 0.1); the grid is built once and reused.
hyperparameter_values = np.arange(0.1, 5, 0.1)
# Smoothness values tried for the Matern kernel.
matern_nus = [0.5, 1.5, 2.5]

# NOTE: composite kernels are combined with `*`, which builds a Product kernel.
# `kernel ** x` builds Exponentiation(kernel, exponent=x), whose exponent must
# be a number, so raising ConstantKernel to the power of another kernel (as
# this cell previously did) does not produce a valid kernel.
kernel_list = (
    [Matern(nu=nu, length_scale=length_scale)
     for nu in matern_nus
     for length_scale in hyperparameter_values]
    + [ConstantKernel(constant_value=constant_value) * Matern(nu=nu, length_scale=length_scale)
       for constant_value in hyperparameter_values
       for nu in matern_nus
       for length_scale in hyperparameter_values]
    + [ConstantKernel(constant_value=constant_value)
       for constant_value in hyperparameter_values]
    + [DotProduct(sigma_0=sigma_0)
       for sigma_0 in hyperparameter_values]
    + [ConstantKernel(constant_value=constant_value) * DotProduct(sigma_0=sigma_0)
       for constant_value in hyperparameter_values
       for sigma_0 in hyperparameter_values]
    + [WhiteKernel(noise_level=noise_level)
       for noise_level in hyperparameter_values]
    + [ExpSineSquared(length_scale=length_scale)
       for length_scale in hyperparameter_values]
    + [RationalQuadratic(length_scale=length_scale)
       for length_scale in hyperparameter_values]
    + [RBF(length_scale=length_scale)
       for length_scale in hyperparameter_values]
)

In [4]:
# Search space sampled by the randomised search: which kernel to use, how many
# optimizer restarts to run (100, 105, ..., 195), and whether to normalise y.
param_grid = dict(
    kernel=kernel_list,
    n_restarts_optimizer=np.arange(100, 200, 5),
    normalize_y=[True, False],
)

In [6]:
# Unique (lat, lon) pairs, one row per grid cell; computed once and reused.
unique_gridcells = df_gridcells[['lat', 'lon']].drop_duplicates()
lats = unique_gridcells['lat'].values
lons = unique_gridcells['lon'].values

# Input columns and the column to be emulated.
features = ['RES', 'IND', 'TRA', 'AGR', 'POW']
target = 'PM2_5_DRY'

# Leave-one-out CV (cv = LeaveOneOut().get_n_splits(X_train)) gave all-NaN
# scores, so 5-fold CV is used for the time being.
# NOTE(review): presumably the default R^2 scorer is undefined when each test
# fold holds a single sample; an error-based `scoring` may be needed for
# leave-one-out -- TODO confirm.
cv = 5

# Run an independent randomised hyperparameter search for every grid cell.
for lat, lon in unique_gridcells.values:
    # Select this grid cell's rows with a single mask built on the full frame,
    # rather than applying a full-frame mask to an already filtered frame.
    in_gridcell = (df_gridcells.lat == lat) & (df_gridcells.lon == lon)
    df_gridcell = df_gridcells.loc[in_gridcell]

    X = df_gridcell[features].values
    y = df_gridcell[target].values

    # Hold out a third of the samples; the search only sees the training part.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=123
    )

    gp = GaussianProcessRegressor(random_state=123)
    emulator = RandomizedSearchCV(
        gp,
        param_grid,
        cv=cv,
        n_jobs=-1,
        n_iter=500,
        random_state=123,
        verbose=2)
    emulator.fit(X_train, y_train)

    # Label the results so each set of scores can be traced to its grid cell.
    print(f"grid cell: lat={lat}, lon={lon}")
    print(f"CV score: {emulator.best_score_:.4f}")
    print(f"test score: {emulator.score(X_test, y_test):.4f}")
    print(f"pearson R2: {pearsonr(y_test, emulator.predict(X_test))[0] ** 2:.4f}")
    print(f"best estimator: {emulator.best_estimator_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.