In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from src.data import open_data

In [None]:
# Handle for the nudged-simulation data; its `files_3d` attribute is read in the next cell.
nudging = open_data('nudge')
# Training data; indexed below with `.sel(time=...)`.
training = open_data('training')

In [None]:
# Open the nudged run's 3D files as one multi-file dataset, dropping the variable 'p'.
nudge_3d = xr.open_mfdataset(nudging.files_3d, drop_variables=['p'])

In [None]:
# Restrict the training data to the time coordinates present in the nudged run.
train_3d = training.sel(time=nudge_3d.time)

In [None]:
# Quick look at training QT at positional index [100, 10] of its first two dimensions
# (presumably time and z — TODO confirm the dimension order).
train_3d.QT[100,10].plot()

In [None]:
# Same slice of QT from the nudged run, for visual comparison with the training data
# (presumably time and z indices — TODO confirm the dimension order).
nudge_3d.QT[100,10].plot()

In [None]:
def _as_name_list(names):
    """Return ``names`` as a list, treating a bare string as one single name."""
    return [names] if isinstance(names, str) else list(names)


def stack_dims_and_concat_feats(ds, variables, sample_dims=('x', 'y', 'time'), feature_dims=('z')):
    """Flatten selected variables of a dataset into one 2D numpy array.

    Each variable is stacked so that ``sample_dims`` collapse into rows and
    ``feature_dims`` collapse into columns; the per-variable matrices are then
    concatenated along the column axis.

    Parameters
    ----------
    ds : mapping of name -> DataArray-like (e.g. an xarray Dataset)
        Source of the variables; each entry must provide ``dims``,
        ``expand_dims``, ``stack`` and ``transpose``.
    variables : str or iterable of str
        Names of the variables to stack, in output column order.
    sample_dims : str or iterable of str
        Dimensions collapsed into the row ("samples") axis.
    feature_dims : str or iterable of str
        Dimensions collapsed into the column ("features") axis.  A bare string
        is treated as ONE dimension name: note the default ``('z')`` is the
        string ``'z'``, not a tuple, so strings must not be split into
        characters.

    Returns
    -------
    (numpy.ndarray, samples)
        The ``(n_samples, n_features_total)`` array and the ``samples``
        coordinate of the last stacked variable (kept for later unstacking).

    Raises
    ------
    ValueError
        If ``variables`` is empty.
    """
    variables = _as_name_list(variables)
    sample_dims = _as_name_list(sample_dims)
    feature_dims = _as_name_list(feature_dims)

    if not variables:
        raise ValueError("`variables` must name at least one variable to stack")

    flat_arrays = []
    for name in variables:
        da = ds[name]
        # Variables lacking a feature dimension (e.g. 2D fields without "z")
        # get singleton dimensions so that they can be stacked like the rest.
        missing = [dim for dim in feature_dims if dim not in da.dims]
        if missing:
            da = da.expand_dims(missing)
        stacked_da = da.stack(samples=sample_dims, features=feature_dims)
        # make sure the rows are samples and the columns are features
        transposed_da = stacked_da.transpose('samples', 'features')
        flat_arrays.append(transposed_da.values)

    # Concatenate along the feature axis; also output the sample coordinate
    # info of the last variable for later use.
    return np.concatenate(flat_arrays, axis=1), stacked_da.samples

In [None]:
# Snapshot used for the comparison: the tenth time step from the end.
t = -10

# x: nudged QT, y: training QT. Rows are (x, y) columns of the snapshot;
# the returned sample coordinate is discarded.
x, _ = stack_dims_and_concat_feats(
    nudge_3d.isel(time=t),
    variables=['QT'],
    sample_dims=['x', 'y'],
)
y, _ = stack_dims_and_concat_feats(
    train_3d.isel(time=t),
    variables=['QT'],
    sample_dims=['x', 'y'],
)

In [None]:
# Nudged (x) vs. training (y) values of feature column 10, one faint dot per sample.
plt.scatter(x[:,10], y[:,10], alpha=.02)
# Black 1:1 reference line from (0, 0) to (10, 10).
plt.plot([0, 10],[0,10], 'k')

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score

In [None]:
# Lasso regression from nudged profiles (x) to training profiles (y).
# Inputs are standardised inside the pipeline; targets are standardised (and
# predictions un-standardised) by the TransformedTargetRegressor wrapper.
model = TransformedTargetRegressor(
    regressor=make_pipeline(
        StandardScaler(),
        Lasso(alpha=.1),
    ),
    transformer=StandardScaler(),
)
model.fit(x, y)

In [None]:
# In-sample prediction: the model is evaluated on the same x it was fitted on.
y_pred = model.predict(x)

In [None]:
# R^2 of the model prediction against the training values for feature column 10.
r2_score(y[:,10], y_pred[:,10])

In [None]:
# Pull the fitted Lasso step out of the pipeline and show its coefficients as a heat map.
lasso = model.regressor_.named_steps['lasso']
plt.pcolormesh(lasso.coef_)
plt.colorbar()

In [None]:
# Baseline for comparison: R^2 when the nudged values themselves are used as the prediction.
r2_score(y[:,10], x[:,10])

# Model of noise

In [None]:
# Joint feature matrix: nudged columns first, then training columns.
X = np.concatenate((x, y), axis=1)
# Covariance between feature columns (rows of X are the observations).
cov = np.cov(X, rowvar=False)
# Pseudo-inverse of the joint covariance, used as the precision matrix.
precision = np.linalg.pinv(cov)

In [None]:
# Invert the leading 22x22 block of the precision matrix and show it as a heat map.
# NOTE(review): 22 is hard-coded — presumably tied to the number of feature
# columns of interest in `x`; confirm against x.shape[1].
plt.pcolormesh(np.linalg.inv(precision[:22,:22]))
plt.colorbar()

This is the noise covariance I will have to use.

In [None]:
# Covariance of the nudged-minus-training difference across feature columns.
# Note: this rebinds `cov`, replacing the joint covariance computed earlier.
cov = np.cov((x-y).T)

In [None]:
# Heat map of the difference covariance.
plt.pcolormesh(cov)
plt.colorbar()

## generate one sample

In [None]:
# Seed NumPy's legacy global RNG so that this draw, and the later
# np.random.randn calls executed after it, are reproducible on
# Restart & Run All.
np.random.seed(0)

# Cholesky factor of the noise covariance: cov == Q @ Q.T
Q = np.linalg.cholesky(cov)
n = Q.shape[0]

# Q applied to standard-normal noise yields one sample with covariance cov.
z = Q @ np.random.randn(n)

In [None]:
# Plot the single correlated noise sample across feature columns.
plt.plot(z)

## compare to old distribution

Generate random samples:

In [None]:
# Add one independent correlated-noise draw to every row of x:
# each row of (randn @ Q.T) has covariance Q @ Q.T.
x_pert = x +  np.random.randn(x.shape[0], n) @ Q.T

Plot the distribution of QT at level 10

In [None]:
# Overlay the feature-column-10 histograms (100 bins) of the nudged and
# training data; labels make the two distributions distinguishable.
plt.hist(x[:,10], 100, label='nudged (x)')
plt.hist(y[:,10], 100, alpha=.4, label='training (y)')
plt.legend();

In [None]:
# Overlay the feature-column-10 histograms (100 bins) of the nudged data
# before and after adding the correlated noise.
plt.hist(x[:,10], 100, label='nudged (x)')
plt.hist(x_pert[:, 10], 100, alpha=.4, label='nudged + noise (x_pert)')
plt.legend();

## Distribution of PW

In [None]:
from uwnet.thermo import layer_mass_from_p

# Layer masses computed from the pressure data; `.values` extracts the underlying
# array (presumably 1-D over the vertical levels — TODO confirm). Used below as
# weights in `x.dot(dm)`.
dm = layer_mass_from_p(open_data('pressure')).values

In [None]:
# Histograms (100 bins) of the layer-mass-weighted column sums of the nudged
# data, before and after adding the correlated noise.
plt.hist(x.dot(dm), 100, label='nudged (x)')
plt.hist(x_pert.dot(dm), 100, alpha=.5, label='nudged + noise (x_pert)')
plt.legend();

In [None]:
# Histograms (100 bins) of the layer-mass-weighted column sums of the nudged
# and training data.
plt.hist(x.dot(dm), 100, label='nudged (x)')
plt.hist(y.dot(dm), 100, alpha=.5, label='training (y)')
plt.legend();

Adding Gaussian noise helps somewhat, but the right tail of PW is still much fatter in the true data. I think we will need a better model of the noise.

# Compare W

In [None]:
# Mean W profile at y index 32 (described as the equator in the note below),
# averaged over x and time, for the nudged run and the training data.
nudge_3d.W.isel(y=32).mean(['x', 'time']).plot(label='Nudge')
train_3d.W.isel(y=32).mean(['x', 'time']).plot(label='Training')
plt.legend()

The average vertical velocity at the equator in the nudged simulation is much weaker than in the training data.

# Distribution of residual

In [None]:
# Layer-mass-weighted column sums of the nudged and training features,
# divided by 1000 (presumably a unit conversion — TODO confirm).
pw_nudge = x.dot(dm)/1000
pw_true = y.dot(dm)/1000 

In [None]:
# Histogram (100 bins) of the training-minus-nudged difference in the column sums.
plt.hist((pw_true - pw_nudge), 100);

This distribution does seem pretty close to Gaussian, although it does have fatter tails.

In [None]:
# Draw one correlated-noise vector per sample (rows have covariance Q @ Q.T) ...
pert = np.random.randn(x.shape[0], n) @ Q.T
# ... reduce each to a layer-mass-weighted column sum with the same /1000 scaling
# used for pw_nudge and pw_true ...
pert_pw = pert.dot(dm)/1000
# ... and histogram it (100 bins) for comparison with the residual distribution.
plt.hist(pert_pw, 100);

This actually seems like a pretty good fit.