<center><h1> Linear Example: Dependence on Dimension </h1></center>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy import stats, linalg
import scipy as sp

from mud.util import (
    createNoisyReferenceData,
    createRandomLinearMap,
    createRandomLinearPair,
    createRandomLinearProblem,
    std_from_equipment,
    transform_linear_map,
    transform_setup,
)
from mud.funs import mud_sol, map_sol

In [None]:
# Notebook-wide matplotlib defaults: square figures and a readable base font.
fsize = 32  # font size used for axis labels; titles are scaled up from it
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = (10, 10)

---
---

---

# Impact of Dimension for Various Choices of $\Sigma_\text{init}$
We sequentially incorporate $D=1, \dots , P$ dimensions into our QoI map and study the 2-norm between the true value that was used to generate the data and the analytical MUD/MAP points. 

In [None]:
def randP(dim_input, dim_output, seed=27):
    """Generate a reproducible random linear problem ``y = A @ lam_ref + b``.

    NumPy's *global* random state is re-seeded with ``seed`` and then used to
    draw, in this order, the reference parameter, the operator and the offset.

    Parameters
    ----------
    dim_input : int
        Number of parameters (columns of ``A``).
    dim_output : int
        Number of outputs (rows of ``A``).
    seed : int, optional
        Seed passed to ``np.random.seed``.

    Returns
    -------
    lam_ref : ndarray, shape (dim_input, 1)
        Reference parameter with entries uniform on [0, 1).
    A : ndarray, shape (dim_output, dim_input)
        Operator with standard-normal entries.
    b : ndarray, shape (dim_output, 1)
        Offset with standard-normal entries.
    y : ndarray, shape (dim_output, 1)
        Noise-free output ``A @ lam_ref + b``.
    """
    np.random.seed(seed)  # side effect: resets the global NumPy RNG
    lam_ref = np.random.rand(dim_input)[:, np.newaxis]

    # Operator variants that were tried here in place of the Gaussian draw:
    #   linalg.orth(A) / the Q factor of np.linalg.qr(A)   (orthonormal columns)
    #   np.random.rand(dim_output, dim_input)*2 - 1        (uniform on [-1, 1))
    #   np.eye(dim_input)[0:dim_output, :]                 (leading identity rows)
    A = np.random.randn(dim_output, dim_input)

    # Offset variants that were tried: np.random.rand(dim_output) or zeros.
    b = np.random.randn(dim_output)[:, np.newaxis]

    return lam_ref, A, b, A @ lam_ref + b

In [None]:
# Square problem: as many candidate output rows as there are parameters.
dim_input = dim_output = 100
# Initial (prior) mean is the zero column vector.
initial_mean = np.zeros((dim_input, 1))
# d is the noise-free data A @ lam_ref + b returned by randP.
lam_ref, A, b, d = randP(dim_input, dim_output)
prefix = 'lin'  # filename prefix for the figures saved below

In [None]:
def numnonzero(x, tol=1E-4):
    """Count the entries of ``x`` whose absolute value is strictly below ``tol``.

    Note the name is misleading: this tallies the *near-zero* entries. Applied
    to a difference of two vectors it gives the number of components that
    agree to within ``tol``.
    """
    within_tol = abs(x) < tol
    return int(within_tol.sum())

In [None]:
%%time
sols = {}
dim_output
tol_list = [10**(n) for n in np.linspace(-2,2,11)]
std_list = [std_from_equipment(tol) for tol in tol_list]
for std in std_list:
    sols[std] = []
    for o in range(0,dim_output+1, 1):
        _A = A[0:o, :]
        _b = b[0:o,:]
        _d = d[0:o,:]
        _mud = mud_sol(_A, _b, _d, initial_mean, std**2*np.eye(dim_input))
        _map = map_sol(_A, _b, _d, initial_mean, std**2*np.eye(dim_input))
        _pin = (np.linalg.pinv(_A)@(_d-_b)).reshape(-1,1)
        sols[std].append((_mud, _map, _pin))

In [None]:
def _error_table(which, scale, operator=None):
    """Scaled error norms for one estimator across all stds and dimensions.

    which    : position in each (mud, map, pin) tuple stored in ``sols``
    scale    : divisor applied to every norm
    operator : optional matrix applied to (estimate - lam_ref) before the norm

    Returns a list with one row per entry of ``std_list``; each row has one
    value per entry of ``sols[std]`` (dim_output+1 values).
    """
    table = []
    for _std in std_list:
        row = []
        for _m in sols[_std]:
            diff = _m[which] - lam_ref
            if operator is not None:
                diff = operator @ diff
            row.append(np.linalg.norm(diff)/scale)
        table.append(row)
    return table


# Parameter-space errors. Alternative normaliser that was tried:
# c = np.linalg.cond(A)*np.linalg.norm(lam_ref)
c = 1
err_mud_list = _error_table(0, c)
err_map_list = _error_table(1, c)
err_pin_list = _error_table(2, c)

# Output-space errors, relative to ||A||. Alternative normaliser that was tried:
# c = np.linalg.cond(A)
c = np.linalg.norm(A)
err_Amud_list = _error_table(0, c, operator=A)
err_Amap_list = _error_table(1, c, operator=A)
err_Apin_list = _error_table(2, c, operator=A)

# Alternative metric: measure # of components that agree
# err_mud_list = [[numnonzero(_m[0] - lam_ref) for _m in sols[std]] for std in std_list ]
# err_map_list = [[numnonzero(_m[1] - lam_ref) for _m in sols[std]] for std in std_list ]
# err_pin_list = [[numnonzero(_m[2] - lam_ref) for _m in sols[std]] for std in std_list ]

In [None]:
# Straight-line fit of the MUD error (first std in std_list) against the
# number of output rows used.
# Each error list has dim_output+1 entries, entry o using the first o rows.
# x starts at 1, so the entry to drop is the first (o = 0), not the last.
x = np.arange(1, 1+dim_output, 1)
y = err_mud_list[0][1:]

# Degree-1 least-squares fit; coefficients come back highest power first.
slope, intercept = np.polyfit(x, y, 1)
regression = slope*x + intercept

---

## Surface Plot

In [None]:
# Grid of (output dimension, initial std) with the matching error surfaces.
X, Y = np.meshgrid(x,std_list)
# Column o of each error table used the first o rows; x starts at 1, so skip
# column 0 rather than slicing a hard-coded 0:100 (which dropped the last one).
ZU = np.array(err_mud_list)[:, 1:]
ZA = np.array(err_map_list)[:, 1:]

In [None]:
# Axes3D is not referenced by name below; the import is presumably kept for
# its side effect of registering the '3d' projection on older matplotlib.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Both error surfaces (ZU, ZA) share the same grid; std goes on a log10 axis.
_log_std = np.log10(Y)
for _Z in (ZU, ZA):
    ax.plot_surface(X, _log_std, _Z, alpha=0.5)
ax.set(
    xlabel='Output Dimension',
    ylabel='log10(Standard Deviation)',
    zlabel='Error',
)
# ax.set(yscale='log')
ax.view_init(elev=15, azim=15)
plt.savefig(f'{prefix}-surface-error.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Echo the operator A as the cell output.
A

In [None]:
# Parameter-space error versus number of output rows, one curve triple per
# initial standard deviation: MUD (black), MAP (red) and least squares (cyan).
for idx, std in enumerate(std_list):
    # Label every MAP curve but the first at the right-hand edge of the plot.
    if idx>0: plt.annotate(rf"$\sigma$={std:1.2E}", (dim_output, err_map_list[idx][-1]), fontsize=24)
    _err_mud = err_mud_list[idx]
    _err_map = err_map_list[idx]
    _err_pin = err_pin_list[idx]

    # Entry o of each list used the first o rows and x starts at 1, so the
    # o = 0 entry is skipped (slicing [:-1] shifted every curve by one).
    plt.plot(x,_err_mud[1:], label='mud', c='k', lw=10)
    plt.plot(x,_err_map[1:], label='map', c='r', ls='--', lw=5)
    plt.plot(x,_err_pin[1:], label='lsq', c='cyan', ls='--', lw=5)
plt.plot(x,regression, c='g', ls='-')
# plt.xlim(0,dim_output)
# Raw strings keep the LaTeX backslashes without invalid-escape warnings.
plt.title(r"Convergence for Various $\Sigma_{init} = \sigma I$", fontsize=1.25*fsize)
# plt.yscale('log')
# plt.xscale('log')
plt.ylim(0, 7)
# plt.ylim(1E-4, 5E-2)
plt.ylabel(r"$||\lambda_{ref} - \lambda||$", fontsize=fsize)
plt.xlabel('Dimension of Output Space', fontsize=fsize)
# Only the first three line handles are labelled, one per method.
plt.legend(['mud', 'map', 'least squares'], fontsize=fsize)
plt.annotate(f'Slope={slope:1.4f}', (4,4), fontsize=24)
plt.savefig(f'{prefix}-convergence-dimension.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Print the normaliser c (as last assigned above) and the fitted regression slope.
print(c, slope)

#### Observations
These results appear to hold for random $A$'s generated with uniform and normal distributions, and even hold for $A=I$ (which actually fares worse for the MAP solution, as do orthogonal maps).

In [None]:
# Output-space (relative) error versus number of output rows, including the
# o = 0 entry, for every initial standard deviation.
_dims = np.arange(0, 1+dim_output)  # shared x-axis, built once
for idx, std in enumerate(std_list):
    _err_mud = err_Amud_list[idx]
    _err_map = err_Amap_list[idx]
    _err_pin = err_Apin_list[idx]

    plt.plot(_dims, _err_mud, label='mud', c='k', lw=10)
    plt.plot(_dims, _err_map, label='map', c='r', ls='--', lw=5)
    plt.plot(_dims, _err_pin, label='lsq', c='cyan', ls='--', lw=5)
# plt.plot(x,regression, c='g', ls='-')
# plt.xlim(0,dim_output)
# Raw strings keep the LaTeX backslashes without invalid-escape warnings.
plt.title(r"Convergence for Various $\Sigma_{init} = \sigma I$", fontsize=1.25*fsize)
# plt.yscale('log')
# plt.xscale('log')
# plt.ylim(0, 6)
# plt.ylim(1E-4, 5E-2)
plt.ylabel(r"$\frac{||A (\lambda_{ref} - \lambda) ||}{||A||}$", fontsize=fsize)
plt.xlabel('Dimension of Output Space', fontsize=fsize)
# plt.legend(['mud', 'map', 'least squares'], fontsize=fsize)
# plt.annotate(f'Slope={slope:1.4f}', (4,4), fontsize=24)
plt.savefig(f'{prefix}-convergence-dimension-out.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Distance between the MUD and least-squares *estimates*, one row per initial
# std and one column per number of output rows used (1, ..., dim_output).
# This is computed from the solutions in `sols` so it matches the axis label;
# differencing the two error norms only gives a lower bound on this distance.
# The o = 0 entry is skipped so that the columns line up with x = 1, 2, ...
pin_mud_mismatch = np.array([
    [np.linalg.norm(_pin - _mud) for _mud, _map, _pin in sols[std][1:]]
    for std in std_list
])
plt.plot(x, pin_mud_mismatch.T, c='k')
plt.xlabel('Dimension', fontsize=fsize)
plt.yscale('log')
# plt.xscale('log')
# Raw string keeps the LaTeX backslashes without invalid-escape warnings.
plt.ylabel(r'$||\lambda_{mud} - \lambda_{lsq}||$', fontsize=fsize)
plt.title("MUD $\\approx$ Least Squares", fontsize=1.25*fsize)
plt.savefig(f"{prefix}-mud-leastsquares-error.pdf")
plt.show()

---
---

In [None]:
# This assertion always fails. Presumably a deliberate stop so that
# "Run All" halts here and the exploratory cells below are not executed.
assert 1 == 0

In [None]:
# MUD and MAP estimates for a single initial standard deviation, using the
# first o rows of the map for o = 0, ..., dim_output-1.
sol = []
tol = 0.1
std = std_from_equipment(tol)
std = 1  # overrides the equipment-based value computed on the line above
_cov = std**2*np.eye(dim_input)
for o in range(dim_output):
    _A = A[0:o, :]
    _b = b[0:o,:]
    # The data vector returned by randP is stored in `d`; `y` has since been
    # rebound to a plain list of errors and cannot be sliced as `y[0:o,:]`.
    _y = d[0:o,:]
    _mud = mud_sol(_A, _b, _y, initial_mean, _cov)
    _map = map_sol(_A, _b, _y, initial_mean, _cov)
    sol.append((_mud, _map))

# 2-norm distance of each estimate from the reference parameter.
err_mud = [np.linalg.norm(_m[0] - lam_ref) for _m in sol]
err_map = [np.linalg.norm(_m[1] - lam_ref) for _m in sol]

In [None]:
# MUD versus MAP error for the single standard deviation chosen above.
plt.plot(err_mud, label='mud', c='k')
plt.plot(err_map, label='map', c='r', ls='--')
# Raw f-string keeps the LaTeX backslash without an invalid-escape warning.
plt.title(rf"$L^2$ Convergence for $\sigma = {std:1.4E}$")
plt.yscale('log')
# plt.xscale('log')
plt.xlabel('Dimension')
plt.legend()
plt.show()

---
---

In [None]:
# S repeated noisy observations of a random linear map at a 2-D reference point.
S = 100
lam_true = np.array([[0.5], [0.5]])  # column vector, shape (2, 1)
M, data = createRandomLinearPair(lam_true, num_observations=S, std=0.001, repeated=True)

In [None]:
# Shapes of the operator and of the data it generated.
M.shape, data.shape

In [None]:
# Mean absolute misfit and 2-norm of the misfit between model and noisy data.
_misfit = M@lam_true - data
np.mean(np.abs(_misfit)), np.linalg.norm(_misfit)

In [None]:
# Noise-free response (red), the data, and the data mean (dashed blue).
_obs_index = np.arange(S)
plt.plot(_obs_index, M@lam_true, c='r')
plt.plot(_obs_index, data)
plt.plot(_obs_index, np.mean(data)*np.ones(S), '--', c='b')

In [None]:
# Transform (M, data) with std 0.001, then evaluate the transformed map at
# the true parameter. NOTE: this rebinds the notebook-level names A and b.
A, b = transform_linear_map(M, data, 0.001)
print(S, A, b)
_at_truth = A@lam_true + b
np.mean(np.abs(_at_truth)), np.linalg.norm(_at_truth)

In [None]:
# For each trial: draw a fresh random linear problem, transform it and record
# the transformed map evaluated at lam_true.
num_qoi, num_obs, num_trials = 1, 1000, 500
predictions = []
for _trial in range(num_trials):
    # NOTE: rebinds operator_list, data_list and std_list on every pass.
    operator_list, data_list, std_list = createRandomLinearProblem(
        [0.5, 0.5], num_qoi, num_obs, 0.001, repeated=True)
    A, b = transform_setup(operator_list, data_list, std_list)
    predictions.append(A@lam_true + b)
# Keep the [0, 0] entry of each prediction.
samples = [pred[0,0] for pred in predictions]

In [None]:
# Histogram of the samples against the standard normal density, annotated
# with scipy's normality test. NOTE: rebinds the notebook-level x and y.
x = np.linspace(-3,3,1000)
y = stats.norm.pdf(x)
n = stats.normaltest(samples)  # n[0] is the statistic, n[1] the p-value

plt.hist(samples, bins=20, density=True)
plt.title(f"Predictions from {num_trials} Random Linear Problems")
plt.plot(x,y)
plt.annotate(f"Normal Test\n  Statistic: {n[0]:1.4f}\n  p-value : {n[1]:1.4f}", (-3.25,0.3))
plt.show()


---

# Sensitivity - Different Noise

In [None]:
# One fixed random linear map; each trial redraws only the noisy data.
dim_input = 2
num_qoi = 1 # fix at 1 for this example
num_obs = 1000
num_trials = 100
std = 0.001
std_list = [std]*num_obs
reference_point = [0.5, 0.5]

operator_list = [createRandomLinearMap(dim_input, num_obs, repeated=True) for _ in range(num_qoi)]

predictions = []
for _trial in range(num_trials):
    data_list = [createNoisyReferenceData(M, reference_point, std) for M in operator_list]
    A, b = transform_setup(operator_list, data_list, std_list)
    # lam_true is whatever an earlier cell left in the namespace.
    predictions.append(A@lam_true + b)
# Keep the [0, 0] entry of each prediction.
samples = [pred[0,0] for pred in predictions]

In [None]:
# Histogram of the samples against the standard normal density, annotated
# with scipy's normality test. NOTE: rebinds the notebook-level x and y.
x = np.linspace(-3,3,1000)
y = stats.norm.pdf(x)
n = stats.normaltest(samples)  # n[0] is the statistic, n[1] the p-value

plt.hist(samples, bins=20, density=True)
plt.title(f"Predictions from {num_trials} Data Streams for a Random Linear Map")
plt.plot(x,y)
plt.annotate(f"Normal Test\n  Statistic: {n[0]:1.4f}\n  p-value : {n[1]:1.4f}", (-3.25,0.3))
plt.show()

---

# Define Inputs/Outputs to Model

In [None]:
# True parameter (a plain list from here on) and the measurement setup.
lam_true = [0.5, 0.5]
dim_input = 2
num_observations = 100
# Noise level derived from equipment tolerance 0.1 at probability 0.99.
sigma = std_from_equipment(tolerance=0.1, probability=0.99)
# Alternative: draw only the map,
# M = createRandomLinearMap(dim_input, num_observations, repeated=True)
M, data = createRandomLinearPair(
    lam_true, num_observations, sigma, repeated=True)
def makeLinearModel(M):
    """Return a callable that evaluates the linear map ``M`` on parameter samples.

    Parameters
    ----------
    M : array-like supporting ``@``
        Operator applied to the parameters.

    Returns
    -------
    model : callable
        ``model(lam)`` takes samples stored one per row and returns
        ``(M @ lam.T).T``. If that result has a single row it is flattened
        with ``ravel()``. Called without arguments it evaluates the map at
        the true parameter.

    Notes
    -----
    The default for ``lam`` is built from the notebook-level ``lam_true`` at
    the moment ``makeLinearModel`` is called; rebinding ``lam_true`` later
    does not affect a model that already exists.
    """
    def model(lam = np.array([lam_true]) ):
        response = (M@lam.T).T
        if response.shape[0] == 1:
            return response.ravel() # this allows support for simpler 1D plotting.
        return response
    return model

In [None]:
# Bind the current operator M into a callable model.
model = makeLinearModel(M)

In [None]:
# Difference between the model evaluated at its default parameter and the flattened data.
model() - data.ravel()

In [None]:
# Shape of the model output at its default parameter.
model().shape

In [None]:
# 100-dimensional problem with 100 QoIs of 1000 observations each.
dim_input = 100
num_qoi = 100
# Random reference parameter as a column vector (uses the global NumPy RNG).
reference_point = np.random.rand(dim_input)[:, np.newaxis]
num_observations_list = num_qoi*[1000]
std_list = num_qoi*[sigma]
# Initial density: zero mean and identity covariance.
initial_mean = np.zeros((dim_input, 1))
initial_cov = np.eye(dim_input)

In [None]:
# Draw the random operators and data; std_list is rebound to the returned value.
operator_list, data_list, std_list = createRandomLinearProblem(
    reference_point,
    num_qoi,
    num_observations_list,
    std_list,
    dist='normal',
    repeated=False,
)

In [None]:
# Transform the problem into the pair (A, b) and evaluate it at the reference point.
A, b = transform_setup(operator_list, data_list, std_list)
_at_reference = A@reference_point + b
pred_sol = _at_reference.ravel()

In [None]:
# Elsewhere in this notebook mud_sol is called as (A, b, data, mean, cov).
# The original four-argument call omitted the data argument, which shifted
# initial_mean and initial_cov into the wrong slots.
# NOTE(review): this assumes transform_setup has already folded the data into
# (A, b), so the target for A @ lam + b is zero -- confirm against mud.util.
mud_pt = mud_sol(A, b, np.zeros((A.shape[0], 1)), initial_mean, initial_cov)

In [None]:
# 2-norm distance between the MUD point and the reference parameter.
np.linalg.norm(mud_pt - reference_point)

In [None]:
# Normalised histogram of the transformed map evaluated at the reference point.
plt.hist(pred_sol, density=True)
plt.show()

In [None]:
# Sample mean and variance of pred_sol.
pred_sol.mean(), pred_sol.var()

In [None]:
# Echo the raw values of pred_sol.
pred_sol

---

# Define Measurements / Reference Solution

In [None]:
##### FIXED PARAMETERS - DEFINE YOUR EXPERIMENT #####
# NOTE: this rebinds sigma, replacing the equipment-based value set earlier.
sigma = 0.001

################
################
# Rebuild the model from the current M and evaluate it at the true parameter.
model = makeLinearModel(M)
qoi_true = model()  # no args evaluates true param
sigma2 = sigma**2   # fixed noise level in the data
# d = createNoisyReferenceData(M, lam_true, sigma)
####

---

## Create input / output sets

We fix our exploratory samples of the parameter space $\Lambda$ for all experiments.


In [None]:
# Uniform samples of the 2-D parameter space and the model evaluated on them
# (drawn from the global NumPy RNG).
num_samples = 1E4  # kept as a float; cast to int where a count is needed
_n_rows = int(num_samples)
lam = np.random.rand(_n_rows, 2)
a = np.argsort(lam.ravel())  # sort order of the flattened samples
qoi = model(lam)

In [None]:
# Shape of the model outputs for the parameter samples.
qoi.shape

---
---

# Solve Inverse Problem

In [None]:
import bet.sample as samp

In [None]:
def mud_problem(lam, qoi, sd=sigma, num_obs=None, qoi_true=qoi_true):
    """Assemble a BET discretization for a data-driven inverse problem.

    Synthetic data are generated by adding Gaussian noise (drawn from the
    global NumPy RNG) to the first ``num_obs`` entries of ``qoi_true``.

    Parameters
    ----------
    lam : ndarray
        Parameter samples; ``lam.shape[1]`` is used as the input dimension
        (1 if ``lam`` is one-dimensional).
    qoi : ndarray
        Model outputs for ``lam``; ``qoi.shape[1]`` is used as the output
        dimension (1 if ``qoi`` is one-dimensional).
    sd : float, optional
        Noise standard deviation, also passed to ``data_driven``. The default
        is the notebook-level ``sigma`` at the time this function is defined.
    num_obs : int, optional
        Number of leading observations to use; defaults to all outputs.
    qoi_true : ndarray, optional
        Noise-free outputs. The default is the notebook-level ``qoi_true`` at
        the time this function is defined.

    Returns
    -------
    The ``samp.discretization`` object after ``data_driven`` has been called.

    Raises
    ------
    ValueError
        If ``num_obs`` is below 1 or above the output dimension.
    """
    try:
        dim_input = lam.shape[1]
    except IndexError:
        dim_input = 1

    try:
        dim_output = qoi.shape[1]
    except IndexError:
        dim_output = 1

    if num_obs is None:
        num_obs = dim_output
    elif num_obs < 1:
        raise ValueError("num_obs must be >= 1")
    elif num_obs > dim_output:
        raise ValueError("num_obs must be <= dim(qoi)")

    i_set = samp.sample_set(dim_input)
    # One [lower, upper] row per parameter, i.e. shape (dim_input, 2).
    # The previous [[0,1]*dim_input] built a single row of length 2*dim_input.
    i_set.set_domain(np.array([[0, 1]]*dim_input))

    i_set.set_values(lam)
    o_set = samp.sample_set(dim_output)
    o_set.set_values(qoi)
    d = samp.discretization(i_set, o_set)
    # Synthetic measurements: truth plus independent N(0, sd^2) noise.
    data = qoi_true[0:num_obs] + np.random.randn(num_obs)*sd

    # TMDM generalize
    d.set_initial(dist=sp.stats.distributions.uniform(loc=0,scale=1), gen=False)
    # needed if changing dimensions around until fix is made in BET
#     d._output_probability_set = None # will throw warning
    d._output_probability_set = samp.sample_set(num_obs)
    d.data_driven(data=data, std=sd, inds=list(range(0,num_obs)))
    return d

---

---

# What happens as we take more observations?

## Make MUD solutions for successive inclusions of measurements

We take repeated trials (draws of noise polluting our data) to study the sensitivity to individual experiments as a function of number of observations.

In [None]:
%%time
num_sensor_list = np.arange(num_observations) + 1
experiments = {}
solutions = {}
num_trials = 5 # realizations of synthetic data
# num_sensors_plot_conv = num_sensor_list[4::5]
num_sensors_plot_conv = [1, 5, 10, 25, 50, 100]
for ns in num_sensors_plot_conv:
    discretizations = []
    mud_solutions = []
    for t in range(num_trials):
        np.random.seed(21+t)
        _d = mud_problem(lam, qoi, sd=sigma, num_obs=ns)
        discretizations.append(_d)
        mud_solutions.append(_d.mud_point())
    experiments[ns] = discretizations
    solutions[ns] = mud_solutions

## extract means and variances from repeated trials

In [None]:
# Mean and variance, over trials, of the absolute error in the first
# predicted output, for each sensor count.
means, variances = [], []
for ns in num_sensors_plot_conv:
    mud_solutions = solutions[ns]
    discretizations = experiments[ns]
    _data = np.array([ _d.get_data() for _d in discretizations])
    _predicted = (M@np.array(mud_solutions).T).T[:,0]
    _truth = (M@lam_true)[0]
    err = np.abs(_predicted - _truth) # truth
    mean_mud_sol = np.mean(err)
    var_mud_sol = np.var(err)
    means.append(mean_mud_sol)
    variances.append(var_mud_sol)

## How does the accuracy + precision change?

In [None]:
# Mean and variance of the error versus number of measurements, with dashed
# reference rates in matching colours.
_counts = np.array(num_sensors_plot_conv)
plt.plot(num_sensors_plot_conv, means, label='mean', c='xkcd:blue')
plt.plot(num_sensors_plot_conv, variances, label='variance', c='xkcd:red')
plt.plot(num_sensors_plot_conv, 0.01*np.power(_counts, -1/2)/10, label='~ $N^{-1/2}$', ls='--', c='xkcd:blue')
plt.plot(num_sensors_plot_conv, 1E-7/_counts, label='~ $N^{-1}$', ls='--', c='xkcd:red')

plt.xscale('log')
plt.yscale('log')
plt.xlabel('Number of Measurements')
plt.ylabel('Mean Normed Error in Prediction')
plt.title(f"Convergence for N={lam.shape[0]} parameter samples")
plt.legend()
plt.savefig('lin_convergence_mud_obs.pdf')
plt.show()

In [None]:
# Echo the mean errors, one per entry of num_sensors_plot_conv.
means

---

## Measurement Error

Fixed number of sensors, varying the quality of equipment.


In [None]:
%%time
sd_err = []
sd_var = []
sd_vals = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05]
num_trials = 10
num_obs_meas = 10
for sd in sd_vals:
    temp_err = []
    for t in range(num_trials):
        d = mud_problem(lam, qoi, sd=sd, qoi_true=qoi_true, num_obs=num_obs_meas)
        mud_point = d.mud_point()
#         temp_err.append(np.abs((M@mud_point)[0] - (M@lam_true)[0])) # truth
        temp_err.append(np.abs((M@mud_point)[0] - d.get_data())) # observed
    sd_err.append(np.mean(temp_err))
    sd_var.append(np.var(temp_err))

In [None]:
# Mean and variance of the error versus noise level, with dashed sigma and
# sigma^2 reference curves in matching colours.
_sd = np.array(sd_vals)
plt.plot(sd_vals, sd_err, label='mean', c='xkcd:red')
plt.plot(sd_vals, sd_var, label='variance', c='xkcd:blue')
# Raw strings keep the LaTeX backslashes without invalid-escape warnings.
plt.plot(sd_vals, np.power(_sd, 2), label=r'$\sigma^2$', ls='--', c='xkcd:blue')
plt.plot(sd_vals, np.power(_sd, 1), label=r'$\sigma$', ls='--',  c='xkcd:red')
plt.legend()
plt.yscale('log')
plt.xscale('log')
plt.ylabel("Absolute Error")
plt.xlabel("Standard deviation")
plt.title(f"Impact of Measurement Noise on MUD Error for S={num_obs_meas}")
plt.savefig('lin_convergence_mud_std.pdf')
plt.show()

---