In [None]:
# import holoviews as hv
# hv.extension('bokeh')

In [None]:
import numpy as np
import xarray as xr
from tqdm import  tqdm_notebook as tqdm

In [None]:
# NOTE: the star import pulls every public toolz.curried name into the notebook
# namespace, including curried replacements for builtins such as `map`.
from toolz.curried import *
import holoviews as hv
# Select bokeh as the holoviews plotting backend.
hv.extension('bokeh')

In [None]:
from sklearn.pipeline import make_pipeline, make_union
from xnoah.sklearn import Normalizer, Stacker, Select, Weighter
from sklearn.linear_model import LinearRegression

In [None]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and only fall back to the
# vendored copy on old installations that do not ship joblib separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# On-disk memoization store used by the `@mem.cache` decorators in this notebook.
mem = joblib.Memory("../data/cache")

In [None]:
# Open the 3-D and 2-D NGAqua files as multi-file datasets, together with the
# weights `w` (fed to the output Weighter) and the `p` variable of the
# statistics file (used as the vertical coordinate in the plots).
D = xr.open_mfdataset("../data/ngaqua/3d/*.nc")
D2= xr.open_mfdataset("../data/ngaqua/2d/*.nc")
w = xr.open_dataarray("../data/processed/ngaqua/w.nc")
p = xr.open_dataset("../data/ngaqua/stat.nc").p

# Keep only coordinates present in both datasets, then add Q1c = Q1 - QRAD.
D = D.merge(D2, join='inner')
D = D.assign(Q1c=D.Q1 - D.QRAD)

# Train/test split along time.
# Label-based slices in xarray include BOTH endpoints, so
# slice(0, 50) / slice(50, None) would put a sample at time == 50 into both
# splits.  The training split keeps the boundary; the test split starts
# strictly after it.
SPLIT_TIME = 50
d_train = D.sel(time=slice(0, SPLIT_TIME))
d_test = D.isel(time=np.flatnonzero(D.time.values > SPLIT_TIME))

In [None]:
def _sample_dims():
    """Return a fresh list of the dimensions passed to Normalizer and Stacker."""
    return ['x', 'y', 'time']


def _input_branch(name, **select_kwargs):
    """Pipeline for one input variable: Select -> Normalizer -> Stacker."""
    return make_pipeline(
        Select(name, **select_kwargs),
        Normalizer(_sample_dims()),
        Stacker(_sample_dims()),
    )


def _output_branch(name):
    """Pipeline for one output variable: Select -> Weighter(sqrt(w)) -> Stacker."""
    return make_pipeline(
        Select(name),
        Weighter(np.sqrt(w)),
        Stacker(_sample_dims()),
    )


# Input features: QT restricted to z in slice(0, 10e3), then SL, SHF and LHF.
union = make_union(
    _input_branch('QT', sel={'z': slice(0,10e3)}),
    _input_branch('SL'),
    _input_branch('SHF'),
    _input_branch('LHF'),
)

# Targets: Q1c and Q2, each weighted by sqrt(w).
output_union = make_union(
    _output_branch('Q1c'),
    _output_branch('Q2'),
)

# Full model: input feature union followed by an ordinary linear regression.
mod = make_pipeline(union, LinearRegression())

What is the condition number of the input matrix?

In [None]:
# Fit the input transformers on the training split (this leaves `union`
# fitted as a side effect) and evaluate the condition number of the
# resulting feature matrix.
design_mat = union.fit_transform(d_train)
np.linalg.cond(design_mat)

This low value shows that we can expect a linear regression to perform reasonably well. Now let's fit for this matrix

In [None]:
# x = union.fit_transform(D)
y_test = output_union.fit_transform(d_test)
y_train = output_union.fit_transform(d_train)

x_train = union.fit_transform(d_train)
x_test = union.fit_transform(d_test)

mod.fit(d_train, y_train)
mod.score(d_train, y_train)

%timeit LinearRegression().fit(x_train, y_train)

%timeit mod.fit(d_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split (the pipeline applies its
# already-fitted transformers to `d_test` before scoring the regression).
mod.score(d_test, y_test)

The R2 performance on the testing portion is not as good as on the training portion

Instead of trying to fit all of y at once, let's fit an independent model for each output feature.

In [None]:
# fitted = [mod.fit(d_train, y) for y in  y_train.T]

from dask.delayed import delayed
from dask.distributed import Client

# Start a dask.distributed client only the first time this cell runs;
# re-running the cell keeps the client that already exists.
if "client" not in globals():
    client = Client()

In [None]:
@mem.cache(verbose=False)
def fit_score(mod, x_train, y_train, x_test, y_test):
    """Fit a fresh copy of `mod` on the training data and score it on the test data.

    Parameters
    ----------
    mod : estimator exposing `fit(x, y)` and `score(x, y)`; it is NOT modified.
    x_train, y_train : training inputs and targets passed to `fit`.
    x_test, y_test : held-out inputs and targets passed to `score`.

    Returns
    -------
    (fitted, score) : the fitted copy of `mod` and `fitted.score(x_test, y_test)`.
    """
    from sklearn.base import clone

    # Fit a copy rather than `mod` itself.  Fitting in place made every call
    # return the same object (so all "models" aliased the most recent fit) and
    # changed the hash joblib computes for `mod` between calls, defeating the
    # cache.  `safe=False` falls back to a deep copy for objects that are not
    # scikit-learn estimators.
    fitted = clone(mod, safe=False)
    fitted.fit(x_train, y_train)
    score = fitted.score(x_test, y_test)

    return fitted, score

@mem.cache
def fit_all_mods(mod, x_train, y_train, x_test, y_test):
    """Fit and score one independent copy of `mod` per output column.

    Iterates over the columns of `y_train` / `y_test` (rows of their
    transposes) in parallel and calls `fit_score` for each pair.

    Returns
    -------
    list of (fitted, score) tuples, one per output column, in column order.

    Raises
    ------
    ValueError : if `y_train` and `y_test` have different numbers of columns.
    """
    train_cols, test_cols = y_train.T, y_test.T
    # zip() would silently drop trailing columns on a mismatch.
    if len(train_cols) != len(test_cols):
        raise ValueError(
            "y_train and y_test must have the same number of output columns "
            "(got %d and %d)" % (len(train_cols), len(test_cols))
        )
    column_pairs = list(zip(train_cols, test_cols))
    return [fit_score(mod, x_train, y, x_test, yt) for y, yt in tqdm(column_pairs)]

In [None]:
# One (fitted model, test score) pair per output column, using a plain
# linear regression on the pre-transformed feature matrices.
mods= fit_all_mods(LinearRegression(), x_train, y_train, x_test, y_test)

In [None]:
# Second entry of each (model, score) pair, in column order.
scores = np.array([pair[1] for pair in mods])
# Split the scores into two equal halves, plotted later as 'Q1' and 'Q2'.
p1, p2 = np.split(scores, 2)

In [None]:
%%opts Curve[invert_axes=True invert_yaxis=True width=250 ] Overlay[legend_position="top_left"]


def _mycurve(p, y, label=None):
    """Return an `hv.Curve` of `y` against `p` (key dimension 'p', value dimension 'R2')."""
    points = (p, y)
    return hv.Curve(points, label=label, kdims=['p'], vdims=['R2'])

# Overlay the two score profiles in a single plot.
q1_curve = _mycurve(p, p1, label='Q1')
q2_curve = _mycurve(p, p2, label='Q2')
q1_curve * q2_curve

We can see the fit performs worst in the midtroposphere. This is probably due to the noisiness of deep convection compared to shallow and stratiform convection.

# Standard Deviations of Data

Is the linear model fit bad just because it is in regions with high variance? This appears to be the case.

In [None]:
# Standard deviation of every training-split variable over x, y and time;
# `.load()` evaluates the result into memory.
sig = d_train.std(['x', 'y', 'time']).load()

In [None]:
# Switch holoviews to the matplotlib backend for the plots that follow.
hv.extension('matplotlib')

In [None]:
%%opts  Curve[invert_axes=True invert_yaxis=True shared_axes=False fontsize=fontsize aspect=.4] {+axiswise}

# NOTE(review): the %%opts line of this cell references `fontsize`, which is
# not defined in any visible cell -- confirm it exists before Run All.

# One curve per 1-D variable in `sig`, plotted against `p`.
sig_profiles = {
    name: hv.Curve((p, sig[name]), kdims=['p'])
    for name in sig.data_vars
    if sig[name].ndim == 1
}
hv.NdLayout(sig_profiles, kdims=['sig']).cols(7)