# Setup Freyberg pilot points to use regularization and see what happens...

### With our first attempt at pilot points, we saw bad things.  So let's see if we can fix the problem by using regularization

In [None]:
%matplotlib inline
import os, shutil
import sys
sys.path.append("..")
import numpy as np
from IPython.display import Image
import pandas as pd
import matplotlib.pyplot as plt

import flopy as flopy
import pyemu

In [None]:
# Use the project helper module to prepare the pilot-point PEST setup, then
# pick up the working directory and control-file name that it exposes.
import freyberg_setup as fs
# NOTE(review): presumably (re)creates the files under WORKING_DIR_PP - confirm in freyberg_setup
fs.setup_pest_pp()
working_dir = fs.WORKING_DIR_PP
pst_name = fs.PST_NAME_PP

In [None]:
# Plot the model through the project helper; what is drawn is defined in freyberg_setup.
fs.plot_model(working_dir, pst_name)

In [None]:
# Load the control file into a pyemu Pst object; the cells below modify it in memory.
pst = pyemu.Pst(os.path.join(working_dir,pst_name))

### In the ``intro_to_regularization`` notebook, we talked about two common forms of regularization.  Here we will add both types to the control file.  

### First, let's add a few preferred value equations for the recharge and well flux parameters:

In [None]:
# Show the distinct parameter group names so we can choose which groups to regularize.
pst.parameter_data.pargp.unique()

In [None]:
# Add preferred-value (zero-order Tikhonov) prior information, restricted to the
# recharge ("rch") and well-flux ("w0", "w1") parameter groups.
pyemu.helpers.zero_order_tikhonov(pst,par_groups=["rch","w0","w1"])
# Disabled alternative without the group filter.
# NOTE(review): presumably this would cover every adjustable parameter - confirm against pyemu.
#pyemu.helpers.zero_order_tikhonov(pst)

In [None]:
# Display the prior information table held by the in-memory control file.
pst.prior_information

### Now, let's add preferred difference regularization to the spatially distributed parameters - more geostatistics

In [None]:
# Build a geostatistical structure from a single ExpVario variogram
# (contribution 1.0, a = 2500.0) with no nugget.
v = pyemu.geostats.ExpVario(contribution=1.0,a=2500.0)
gs = pyemu.geostats.GeoStruct(variograms=v,nugget=0.0)
# Read the pilot-point template file; the x, y and parnme columns are used below.
df_pp = pyemu.gw_utils.pp_tpl_to_dataframe(os.path.join(working_dir,"hkpp.dat.tpl"))
# Covariance matrix between pilot points, computed from their coordinates.
cov = gs.covariance_matrix(df_pp.x,df_pp.y,df_pp.parnme)
# Add preferred-difference regularization driven by that covariance.
# NOTE(review): reset=False presumably keeps the prior information added earlier, and
# abs_drop_tol=0.1 presumably skips weakly correlated pairs - confirm against pyemu.
pyemu.helpers.first_order_pearson_tikhonov(pst,cov,reset=False,abs_drop_tol=0.1)
# Keep this bare expression last so the notebook displays the updated table.
pst.prior_information

### Now we need to adjust the inputs to PEST that control how regularization is enforced.  The big knob is ``phimlim``.  But first, we need to change the estimation mode to "regularization"

In [None]:
# Switch the estimation mode to regularization.
pst.control_data.pestmode = "regularization"
# Display the current phimlim value (bare expression kept last for notebook output).
pst.reg_data.phimlim

### That's no good - way too low - just a "placeholder" value.  Theoretically, it should be the number of non-zero weighted obs, but we saw in the unregularized pilot point run that our best ``phi`` was about 180, so let's try 220 (just a guess)

In [None]:
# First guess at the target measurement objective function: a bit above the
# best phi (~180) seen in the unregularized run.
pst.reg_data.phimlim = 220
# Acceptance level set slightly above phimlim.
pst.reg_data.phimaccept = 240
# SVD truncation is left at its current value for this first attempt.
#pst.svd_data.maxsing = 3

In [None]:
# Write the modified control file back to the path it was loaded from.
pst.write(os.path.join(working_dir,pst_name))

In [None]:
# Run "pestpp" on the control file from inside the working directory, using
# 15 local workers with the master in the same directory.
# The starting directory is remembered and restored in a finally clause so that
# a failed run does not leave the notebook kernel stranded inside working_dir,
# which would break every relative path used by later cells.
start_dir = os.getcwd()
os.chdir(working_dir)
try:
    pyemu.helpers.start_slaves('.',"pestpp",pst_name,num_slaves=15,master_dir='.')
finally:
    os.chdir(start_dir)

In [None]:
# Load the parameter values from the run's ".parb" file into the control file
# object, write the model input files, and rebuild "hk_layer_1.ref" from the
# pilot-point file "hkpp.dat" and its factors file.
# These calls use bare file names, so they must run inside working_dir. The
# starting directory is restored in a finally clause so an error (for example a
# missing .parb file) does not leave the kernel in the wrong directory.
start_dir = os.getcwd()
os.chdir(working_dir)
try:
    pst.parrep(pst_name.replace(".pst",".parb"))
    pst.write_input_files()
    pyemu.gw_utils.fac2real("hkpp.dat",factors_file="hkpp.dat.fac",out_file="hk_layer_1.ref")
finally:
    os.chdir(start_dir)

In [None]:
# Plot how the objective function evolved during the run.
pst.plot(kind="phi_progress")

In [None]:
# One-to-one plot produced by pyemu for this control file.
pst.plot(kind="1to1")

### Let's see how the parameter values look

In [None]:
# Read the parameter uncertainty summary CSV ("<case>.par.usum.csv"), indexed by its first column.
df_paru = pd.read_csv(os.path.join(working_dir,pst_name.replace(".pst",".par.usum.csv")),index_col=0)

In [None]:
# Select the parameters whose names start with "hk". The names are upper-cased
# to index df_paru and lower-cased again to index pst.parameter_data.
hk_pars = [p.upper() for p in pst.par_names if p.startswith("hk")]
df_hk = df_paru.loc[hk_pars,:]
ax = pyemu.helpers.plot_summary_distributions(df_hk,label_post=True)
# Bounds are read from the first hk parameter only and converted to log10.
# NOTE(review): assumes every hk parameter shares these bounds and is plotted in log10 space - confirm.
mn = np.log10(pst.parameter_data.loc[hk_pars[0].lower(),"parlbnd"])
mx = np.log10(pst.parameter_data.loc[hk_pars[0].lower(),"parubnd"])
# Draw the lower and upper bounds as dashed black vertical lines.
ax.plot([mn,mn],ax.get_ylim(),"k--")
ax.plot([mx,mx],ax.get_ylim(),"k--")

### Not a good sign...lots of parameters are still at their bounds...not as many, but still a lot of them

In [None]:
# Re-read the pilot-point locations and load the MODFLOW model from the working directory.
df_pp = pyemu.gw_utils.pp_tpl_to_dataframe(os.path.join(working_dir,"hkpp.dat.tpl"))
m = flopy.modflow.Modflow.load(fs.MODEL_NAM,model_ws=working_dir)
# Plot the hk array at index 0 and overlay the pilot-point locations as "x" markers.
ax = m.upw.hk[0].plot(colorbar=True,alpha=0.5)
ax.scatter(df_pp.x,df_pp.y,marker='x')

In [None]:
# Replace the in-memory hk array at index 0 with the "truth" field file from the
# base model directory, then plot it with the same pilot-point overlay for comparison.
m.upw.hk[0] = os.path.join(fs.BASE_MODEL_DIR,"hk.truth.ref")
ax = m.upw.hk[0].plot(colorbar=True,alpha=0.5)
ax.scatter(df_pp.x,df_pp.y,marker='x')

Oh no!!!!!  We are still overfit...so let's back off the fit using ``phimlim``.  But first, just for fun, let's look at the forecast uncertainty

In [None]:
# Read the forecast uncertainty summary CSV ("<case>.pred.usum.csv"). df_foreu is
# kept in the notebook namespace and reused by the final comparison cell.
df_foreu = pd.read_csv(os.path.join(working_dir,pst_name.replace(".pst",".pred.usum.csv")),index_col=0)
figs, axes = pyemu.helpers.plot_summary_distributions(df_foreu,subplots=True)
for ax in axes:
    # The subplot title, lower-cased, is used as the observation name.
    fname = ax.get_title().lower()
    # Capture the y-limits before drawing so the marker line does not rescale the axis.
    ylim = ax.get_ylim()
    # Mark the value stored as "obsval" for this forecast with a dashed blue line.
    v = pst.observation_data.loc[fname,"obsval"]
    ax.plot([v,v],ylim,"b--")
    ax.set_ylim(ylim)

We are crushing the posterior uncertainty - it is considerably less than the prior for several forecasts. But, compared to the "truth", we are not doing well - the model is not reliable for many forecasts

# Adjust ``phimlim`` and rerun..

Let's adjust the weights and ``phimlim`` based on how well we fit last time...and see if we can eliminate our overfitting problem.  How will we know we have eliminated it? In the real world, we will never know...

In [None]:
# Reload the control file from disk.
# NOTE(review): presumably this also picks up the residuals from the last run - confirm against pyemu.
pst = pyemu.Pst(os.path.join(working_dir,pst_name))
# Show the residual rows for the names in pst.nnz_obs_names.
# NOTE(review): presumably these are the non-zero weighted observations - confirm against pyemu.
pst.res.loc[pst.nnz_obs_names]

In [None]:
# Display the objective function value (phi) for the loaded results.
pst.phi

In [None]:
# Back off the fit: raise the target and acceptance levels well above the first attempt (220 / 240).
pst.reg_data.phimlim = 500.0
pst.reg_data.phimaccept = 550.0
pst.reg_data.fracphim = 0.75
# Also set the SVD maxsing control to 3 for this run.
pst.svd_data.maxsing = 3
# Overwrite the control file so the next run uses these settings.
pst.write(os.path.join(working_dir,pst_name))

In [None]:
# Re-run "pestpp" with the relaxed regularization settings, from inside the
# working directory, using 15 local workers with the master in the same directory.
# The starting directory is remembered and restored in a finally clause so that
# a failed run does not leave the notebook kernel stranded inside working_dir.
start_dir = os.getcwd()
os.chdir(working_dir)
try:
    pyemu.helpers.start_slaves('.',"pestpp",pst_name,num_slaves=15,master_dir='.')
finally:
    os.chdir(start_dir)

In [None]:
# Load the parameter values from the second run's ".parb" file into the control
# file object, write the model input files, and rebuild "hk_layer_1.ref" from
# the pilot-point file "hkpp.dat" and its factors file.
# These calls use bare file names, so they must run inside working_dir. The
# starting directory is restored in a finally clause so an error does not
# leave the kernel in the wrong directory for the cells that follow.
start_dir = os.getcwd()
os.chdir(working_dir)
try:
    pst.parrep(pst_name.replace(".pst",".parb"))
    pst.write_input_files()
    pyemu.gw_utils.fac2real("hkpp.dat",factors_file="hkpp.dat.fac",out_file="hk_layer_1.ref")
finally:
    os.chdir(start_dir)


In [None]:
# Plot how the objective function evolved during the second run.
pst.plot(kind="phi_progress")

In [None]:
# Reload the control file from disk and display phi for the second run.
pst = pyemu.Pst(os.path.join(working_dir,pst_name))
pst.phi

In [None]:
# One-to-one plot for the second run.
pst.plot(kind="1to1")

In [None]:
# Reload the model from the working directory and plot the hk array at index 0.
# The pilot-point overlay reuses df_pp from an earlier cell.
m = flopy.modflow.Modflow.load(fs.MODEL_NAM,model_ws=working_dir)
ax = m.upw.hk[0].plot(colorbar=True,alpha=0.5)
ax.scatter(df_pp.x,df_pp.y,marker='x')

That's much better...

In [None]:
# Swap in the "truth" field file from the base model directory and plot it with
# the same pilot-point overlay, for side-by-side comparison with the estimate.
m.upw.hk[0] = os.path.join(fs.BASE_MODEL_DIR,"hk.truth.ref")
ax = m.upw.hk[0].plot(colorbar=True,alpha=0.5)
ax.scatter(df_pp.x,df_pp.y,marker='x')

Now let's see how the uncertainty looks...

In [None]:
# Reload the control file once more before the uncertainty plots, and display phi.
pst = pyemu.Pst(os.path.join(working_dir,pst_name))
pst.phi

In [None]:
# Read the parameter uncertainty summary CSV again; the second run was executed after the first read.
df_paru = pd.read_csv(os.path.join(working_dir,pst_name.replace(".pst",".par.usum.csv")),index_col=0)
# Names are upper-cased to index df_paru and lower-cased again to index pst.parameter_data.
hk_pars = [p.upper() for p in pst.par_names if p.startswith("hk")]
df_hk = df_paru.loc[hk_pars,:]
ax = pyemu.helpers.plot_summary_distributions(df_hk,label_post=True)
# Bounds are read from the first hk parameter only and converted to log10.
# NOTE(review): assumes every hk parameter shares these bounds - confirm.
mn = np.log10(pst.parameter_data.loc[hk_pars[0].lower(),"parlbnd"])
mx = np.log10(pst.parameter_data.loc[hk_pars[0].lower(),"parubnd"])
# Draw the lower and upper bounds as dashed black vertical lines.
ax.plot([mn,mn],ax.get_ylim(),"k--")
ax.plot([mx,mx],ax.get_ylim(),"k--")

Ahh yeah!

In [None]:
# Plot the forecast summary from the CSV currently on disk, one subplot per forecast.
figs, axes = pyemu.helpers.plot_summary_distributions(os.path.join(working_dir,
                    pst_name.replace(".pst",".pred.usum.csv")),subplots=True)
for ax in axes:
    # The subplot title is used as-is to index df_foreu.
    fname = ax.get_title()
    # Overlay the summary held in df_foreu (read into memory before the second run) in green.
    pyemu.helpers.plot_summary_distributions(df_foreu.loc[[fname],:],ax=ax,pt_color='g')
    # Observation names in the control file are looked up in lower case.
    fname = fname.lower()
    # Capture the y-limits before drawing so the marker line does not rescale the axis.
    ylim = ax.get_ylim()
    # Mark the value stored as "obsval" for this forecast with a dashed blue line.
    v = pst.observation_data.loc[fname,"obsval"]
    ax.plot([v,v],ylim,"b--")
    ax.set_ylim(ylim)

We are now bracketing the "truth" with significant probability for most forecasts (FINALLY!!!). So, even though we aren't fitting the observations as well (hence the larger posterior uncertainty), we are doing much better from a reliability standpoint. 