# Kriging with external drift estimation on Jura

## Load Packages

In [None]:
import numpy as np
import pandas as pd
import gstlearn as gl
import gstlearn.plot as gp
import matplotlib.pyplot as plt



## Load the data set and the prediction grid


We start by loading the data set and the prediction grid.

In [None]:
# Full data set
jura_all = pd.read_csv("jura/jura_pred.csv")

# Grid of locations on which the predictions are computed
grid = pd.read_csv("jura/jura_grid.csv")

Replace the integer codes of Land Use and Rock in the data set by their names, so that they are consistent with the names used on the grid.

In [None]:
## Replace the land use integer codes by their names
landuse_codes=[1,2,3,4]
landuse_names=["Forest","Pasture","Meadow","Tillage"]
landuse_mapping = dict(zip(landuse_codes, landuse_names))
jura_all["Landuse"] = jura_all["Landuse"].replace(landuse_mapping)

## Replace the rock type integer codes by their names
rock_codes=[1,2,3,4,5]
rock_names=["Argovian","Kimmeridgian","Sequanian","Portlandian","Quaternary"]
rock_mapping = dict(zip(rock_codes, rock_names))
jura_all["Rock"] = jura_all["Rock"].replace(rock_mapping)

Build the predictor variables corresponding to Rock with one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Names given to the indicator columns returned by the encoder
# (presumably Kimmeridgian, Portlandian, Quaternary, Sequanian -- verify against enc.categories_)
rock_indic_names = ["Rock_K","Rock_P","Rock_Q","Rock_S"]

# The first category of the feature is dropped; the encoder is fitted on the data set only
enc = OneHotEncoder(handle_unknown='ignore',drop='first')
enc.fit(jura_all[["Rock"]])

# Encode the data set and append the indicator columns to it
rock_onehot_jura = enc.transform(jura_all[["Rock"]]).toarray()
rock_indic_jura = pd.DataFrame(rock_onehot_jura, columns = rock_indic_names)
jura_all = pd.concat([jura_all, rock_indic_jura], axis=1)

# Encode the grid with the same fitted encoder and append the indicator columns to it
rock_onehot_grid = enc.transform(grid[["Rock"]]).toarray()
rock_indic_grid = pd.DataFrame(rock_onehot_grid, columns = rock_indic_names)
grid = pd.concat([grid, rock_indic_grid], axis=1)

Split the data set into two subsets: the training set and the validation set.
For the project and the Kaggle competition, you should use the full data set for 
the training.
You will submit your predictions on Kaggle for a set of locations at which you 
only know the coordinates and the Land Use and Rock factors.

In [None]:
## Seed of the random split: fixed so that re-running the notebook from a fresh
## kernel gives the same training and validation sets (hence the same variograms and RMSE)
split_seed = 42
rng = np.random.default_rng(split_seed)

ntot = jura_all.shape[0]
ntrain = 200
nval = ntot - ntrain

## Draw ntrain distinct row numbers for the training set;
## the remaining row numbers form the validation set
indtrain = rng.choice(ntot,ntrain,replace=False).astype(int)
indval = np.setdiff1d(np.arange(ntot),indtrain)

## NOTE(review): .loc is label-based, so this assumes jura_all carries the default
## 0..ntot-1 index -- confirm if the loading step changes
jura =jura_all.loc[indtrain,:]

#val contains the values to predict. For the project, these values will be on Kaggle
#(for other locations) and you won't know them
#You will have the locations and covariables at the unknown locations by the following command :
val_loc =jura_all.loc[indval,['Xloc','Yloc',"Rock_K","Rock_P","Rock_Q","Rock_S"]]
val=jura_all.loc[indval,['Co']]

## Gstlearn objects

First, we create a gstlearn database containing the data points, and assign the appropriate locators to the variables.

In [None]:
## Create the Db holding the training observations
db_jura=gl.Db_fromPanda(jura)

## Set locators, i.e. the role played by each variable
db_jura.setLocators(['Xloc','Yloc'],gl.ELoc.X) # -> Role = Coordinates
db_jura.setLocators(['Co'],gl.ELoc.Z) # -> Role  = Variable of interest
## NOTE(review): the "Rock*" pattern is expected to select the four indicator columns only
## -- TODO confirm that the non-numeric "Rock" column is not part of the Db
db_jura.setLocators(["Rock*"],gl.ELoc.F) # -> Role = Drift functions

## Print the content of the Db
db_jura.display()

We also create a gstlearn *Grid Database* containing the target grid for the prediction.

In [None]:
### Load grid data into a point database
db_grid_pts=gl.Db_fromPanda(grid)
db_grid_pts.setLocators(["Xloc","Yloc"],gl.ELoc.X)
db_grid_pts.setLocators(["Rock*"],gl.ELoc.F)

### Create an empty DbGrid that covers the Db containing the data
### (mesh of 0.05 and margin of 0.2 in each direction)
db_grid=gl.DbGrid.createCoveringDb(db_jura,dx=[0.05,0.05],margin=[0.2,0.2])

### Migrate the rock indicators from the point database to the grid database
err=gl.migrateMulti(db_grid_pts,db_grid,
                    names=["Rock_K","Rock_P","Rock_Q","Rock_S"],
                    namconv=gl.NamingConvention())

### Add a selection keeping only the grid nodes where Rock_K is defined (not NaN)
db_grid.addSelection(~np.isnan(db_grid["Rock_K"]))
### Declare the migrated indicators as drift functions on the grid
db_grid.setLocators(["Rock*"],gl.ELoc.F)
db_grid.display()

Finally, we create a Db containing the validation locations and the value of Cobalt concentrations at those locations.

In [None]:
## Create the Db from the validation locations and their rock indicators
db_val=gl.Db_fromPanda(val_loc)

## Set locators
db_val.setLocators(['Xloc','Yloc'],gl.ELoc.X) # -> Role = Coordinates
db_val.setLocators(["Rock*"],gl.ELoc.F) # -> Role = Drift functions

## Add the Co values at the validation locations (no locator is assigned to them here)
db_val["Co"]=val

## Print the content of the Db
db_val.display()

## Variography of the residuals

Define a model with a constant mean (*order = 0*) and specify the number of variables carrying an F locator (external drift functions) that we want to work with (*nfex = 4*).

In [None]:
## Model used to compute the residuals: constant mean (order = 0) plus
## the 4 variables carrying an F locator (nfex = 4).
## The order is passed explicitly, as in the drift definition of the fitted KED model.
EDmodel = gl.Model()
EDmodel.setDriftIRF(order = 0, nfex = 4)

Compute the variogram of the residuals

In [None]:
## Create experimental variogram parametrization:
## Setup parameters for a variogram with 30 lags separated by a distance 0.1 (meaning that we compute the variogram at lags h=0.1*i for i=0,...,30),
## and consider a tolerance τ=50% on the distance
varioParamOmni = gl.VarioParam.createOmniDirection(npas=30, dpas=0.1, toldis=0.5)

## Create experimental variogram objects sharing the same parameters
varioRaw = gl.Vario(varioParamOmni) # Raw variable, for comparison purposes
varioKED = gl.Vario(varioParamOmni) # Residuals

## Compute experimental variograms
err = varioRaw.compute(db_jura)
err = varioKED.compute(db_jura,model=EDmodel) #We pass the model to indicate
                                              #that we work on residuals which will
                                              #be computed by the function.

## Plot both variograms on the same figure (raw variable: default color, residuals: red)
ax = gp.varmod(varioRaw,showPairs=False,label = "Raw")
ax = gp.varmod(varioKED,showPairs=False,color = "r",label = "Residual")
## NOTE(review): after this line `ax` holds the value returned by plt.legend(), not the axes
ax = plt.legend()

Model fitting

In [None]:
## Create Model objects
fitmodRaw = gl.Model()
fitmodKED = gl.Model()

## Set the drift within each model: constant mean only for the raw variable,
## constant mean plus the 4 external drift variables for the residuals
fitmodRaw.setDriftIRF(order = 0)
fitmodKED.setDriftIRF(order = 0, nfex = 4)

## Fit each model on its experimental variogram, using a nugget effect and an exponential structure
err = fitmodRaw.fit(varioRaw, types = [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL])
err = fitmodKED.fit(varioKED, types = [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL])

## Plot each experimental variogram together with its fitted model
## NOTE(review): the raw variable is the one drawn in red here, whereas the figure of the
## experimental variograms alone uses red for the residuals
gp.varmod(varioKED, fitmodKED)
gp.varmod(varioRaw, fitmodRaw,color="r")
plt.show()

## Kriging with external Drift

The *kriging* function is called to perform the kriging with external drift

In [None]:
## Unique neighborhood, shared by the two kriging runs
uniqueNeigh = gl.NeighUnique.create()

## Remove variables starting with a given prefix (-> Results from previous runs)
db_grid.deleteColumns(["KED*"])
db_grid.deleteColumns(["OK*"])

## Kriging with external drift first, then ordinary kriging for comparison.
## For each run: compute the predictor and its standard-deviation, but not its variance.
## The prefix identifies the results of each run in the output database.
for krig_model, krig_prefix in ((fitmodKED, "KED"), (fitmodRaw, "OK")):
    err = gl.kriging(dbin=db_jura, dbout=db_grid, model=krig_model,
                     neigh=uniqueNeigh,
                     flag_est=True, flag_std=True, flag_varz=False,
                     namconv=gl.NamingConvention(krig_prefix))

## Display database
db_grid.display()

We can ask for the regression coefficients using *regression*.

In [None]:
## Regression of Co using the drift defined in the fitted KED model
## (mode=2 presumably selects the drift functions of the model -- confirm in the gstlearn documentation)
regResults = gl.regression(db_jura, nameResp="Co",  model=fitmodKED, mode=2)

## Call the method (a bare `regResults.display` only references it and prints nothing useful)
regResults.display()

In [None]:
## One map per kriging output: the prediction first, then the kriging standard-deviation.
## The data locations are overlaid in black on each map.
ked_maps = (
    ("KED*estim", "Kriging with external Drift prediction"),
    ("KED*stdev", "Kriging with external Drift standard-deviation"),
)
for map_name, map_title in ked_maps:
    fig, ax = gp.initGeographic()
    ax.raster(db_grid, name=map_name,flagLegend=True)
    ax.symbol(db_jura, c='black')
    ax.decoration(title=map_title)
    plt.show()

Computing the KED prediction at the validation locations and the resulting RMSE can then be done as follows

In [None]:
## Remove variables starting with a given prefix (-> Results from previous runs)
db_val.deleteColumns(["OK*"])
db_val.deleteColumns(["KED*"])

## Kriging with external drift first, then ordinary kriging, at the validation locations.
## For each run: compute the predictor and its standard-deviation, but not its variance.
## The prefix identifies the results of each run in the output database.
for krig_model, krig_prefix in ((fitmodKED, "KED"), (fitmodRaw, "OK")):
    err = gl.kriging(dbin=db_jura, dbout=db_val, model=krig_model,
                     neigh=uniqueNeigh,
                     flag_est=True, flag_std=True, flag_varz=False,
                     namconv=gl.NamingConvention(krig_prefix))

## Prediction errors at the validation locations
error_KED = db_val["Co"]-db_val["KED*estim"]
error_OK = db_val["Co"]-db_val["OK*estim"]

## Compute RMSE (root of the mean squared error)
rmse_KED = np.mean(error_KED**2)**0.5
rmse_OK = np.mean(error_OK**2)**0.5

print("Ordinary Kriging RMSE",rmse_OK)
print("KED RMSE",rmse_KED)

## Questions

1. Improve the model by adding other explanatory variables (e.g. Landuse or the interactions Rock*Landuse).
2. Estimate the model parameters by maximum likelihood.
3. Define and adjust a multivariate model with drift. Compute the associated predictions.