In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
from lib.models import weights_to_np, WeightedOutput, get_lrf, plot_lrf
from xnoah.data_matrix import stack_cat, unstack_cat, compute_weighted_scale
import os

from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Load data

In [None]:
def ngaqua(filename, data_dir="../2017-09-28/ngaqua/"):
    """Return the path of a file produced by the NgAqua data pipeline.

    Parameters
    ----------
    filename : str
        Name of the file inside the pipeline output directory (e.g. "X.nc").
    data_dir : str, optional
        Directory holding the pipeline output. Defaults to the directory this
        notebook was written against, so existing ``ngaqua(name)`` calls
        behave exactly as before.

    Returns
    -------
    str
        ``data_dir`` joined with ``filename`` via ``os.path.join``.
    """
    return os.path.join(data_dir, filename)


def get_data():
    """Open the NgAqua files used by this notebook.

    Returns
    -------
    tuple
        ``(X, Y, w, p)`` where ``X`` and ``Y`` are the datasets opened from
        "X.nc" and "Y.nc", ``w`` is the data array opened from "w.nc", and
        ``p`` is the ``p`` variable of the dataset opened from "stat.nc".
        No train/test splitting happens here.
    """
    inputs = xr.open_dataset(ngaqua("X.nc"))
    outputs = xr.open_dataset(ngaqua("Y.nc"))
    weights = xr.open_dataarray(ngaqua("w.nc"))
    stats = xr.open_dataset(ngaqua("stat.nc"))
    return inputs, outputs, weights, stats.p


def train_test_split(X, t=50):
    """Split ``X`` along its ``time`` coordinate into disjoint train/test parts.

    Parameters
    ----------
    X : object with ``.sel(time=...)``, ``.isel(time=...)`` and ``X["time"]``
        In this notebook, the xarray objects returned by ``get_data``.
    t : scalar, optional
        Time-coordinate value at which to split (default 50).

    Returns
    -------
    tuple
        ``(train, test)``. ``train`` is ``X.sel(time=slice(0, t))``; ``test``
        holds the samples with time strictly after the last training time.
    """
    train = X.sel(time=slice(0, t))
    test = X.sel(time=slice(t, None))

    # Label-based slices include BOTH endpoints, so a sample located exactly
    # at time == t is selected by both slices above. Keep it in the training
    # part only; otherwise the test score is computed on data seen in fitting.
    train_times = train["time"].values
    test_times = test["time"].values
    if train_times.size and test_times.size:
        # Shared samples (if any) sit at the start of the test slice.
        n_shared = int((test_times == train_times[-1]).sum())
        if n_shared:
            test = test.isel(time=slice(n_shared, None))

    return train, test
    

def prepvar(X):
    """Reshape ``X`` into a 2-D array ordered as ("samples", "features").

    Steps, in order:
      1. ``stack_cat(X, "features", ["z"])`` (presumably concatenates the
         variables of ``X`` into a single "features" dimension built from
         "z"; confirm in ``xnoah.data_matrix``).
      2. Stack the "x", "time" and "y" dimensions into one "samples" dimension.
      3. Transpose so that "samples" comes first and "features" second.
    """
    flattened = stack_cat(X, "features", ["z"])
    flattened = flattened.stack(samples=["x", "time", "y"])
    return flattened.transpose("samples", "features")

def get_ml_data():
    """Load the data and return it split and reshaped for model fitting.

    Returns
    -------
    tuple
        ``((X_train, X_test), (Y_train, Y_test), w_output, p)``. The inputs
        and outputs from ``get_data`` are each split at time 50.0 with
        ``train_test_split`` and passed through ``prepvar``. ``w_output`` is
        ``weights_to_np(w, Y_test.features)`` and ``p`` is returned unchanged
        from ``get_data``.
    """
    t_split = 50.0
    X, Y, w, p = get_data()

    X_train, X_test = [prepvar(part) for part in train_test_split(X, t=t_split)]
    Y_train, Y_test = [prepvar(part) for part in train_test_split(Y, t=t_split)]

    # Weights are aligned to the feature coordinate of the prepared outputs.
    w_output = weights_to_np(w, Y_test.features)

    return (X_train, X_test), (Y_train, Y_test), w_output, p

In [None]:
# Load everything once; later cells reuse these module-level names.
(
    (X_train, X_test),
    (Y_train, Y_test),
    w_output,
    p,
) = get_ml_data()

# Perform Ridge Regression

In [None]:
# Variance-threshold feature filter followed by ridge regression, wrapped in
# WeightedOutput together with the output weights.
# NOTE(review): nfit presumably limits how many samples are used for fitting;
# confirm in lib.models.WeightedOutput.
pipe = WeightedOutput(
    make_pipeline(
        VarianceThreshold(threshold=.001),
        Ridge(alpha=.1, normalize=True),
    ),
    w_output,
    nfit=100000,
)
pipe.fit(X_train, Y_train)
# Last expression of the cell: the score on the held-out split is displayed.
pipe.score(X_test, Y_test)

In [None]:
# Derive the LRF from the fitted pipeline and plot it against p for the listed
# input and output variables (trailing ';' hides the return value).
lrf = get_lrf(pipe, X_train, Y_train)
plot_lrf(
    lrf,
    p,
    'QT SL LHF SHF'.split(),
    'Q1 Q2'.split(),
);

## Cross validation

In [None]:
# Sweep ridge strength and the normalize flag. Each candidate is refit on the
# training split and scored on the test split; `pipe` is mutated in place and
# ends up holding the last parameter combination of the grid.
param_grid = {
    "ridge__alpha": np.logspace(-10, 3, 8),
    "ridge__normalize": [True, False],
}

cv = []
for param in ParameterGrid(param_grid):
    print(f"Cross validating {param}")
    pipe.set_params(**param)
    pipe.fit(X_train, Y_train)
    score = pipe.score(X_test, Y_test)
    print(f"score={score}")
    cv.append(dict(score=score, param=param))


In [None]:


# One row per parameter combination tried in the sweep.
df = pd.DataFrame(
    [
        (entry['param']['ridge__alpha'], entry['param']['ridge__normalize'], entry['score'])
        for entry in cv
    ],
    columns=['alpha', 'normalized', 'score'],
)

In [None]:
# Score versus alpha, one line per value of `normalized`, on a log x-axis.
fg = sns.FacetGrid(data=df, hue="normalized", aspect=1.61)
fg.map(plt.plot, "alpha", "score")
fg.add_legend()
plt.gca().set_xscale('log')

In [None]:
# Show the row of the sweep with the highest score.
df.iloc[df["score"].argmax()]

## Chosen scheme

Given my experience with the last dataset, I worry about the stability of these schemes, so I am just going to use $\alpha=10^{-6}$ and normalized = False.

In [None]:
# Refit with the hand-picked ridge settings and display the test-split score.
pipe.set_params(
    ridge__alpha=1e-6,
    ridge__normalize=False,
)
pipe.fit(X_train, Y_train)
pipe.score(X_test, Y_test)

In [None]:
# Recompute and plot the LRF for the refitted pipeline (trailing ';' hides the
# return value of the plotting call).
lrf = get_lrf(pipe, X_train, Y_train)
plot_lrf(
    lrf,
    p,
    'QT SL LHF SHF'.split(),
    'Q1 Q2'.split(),
);

# Prettify lrf


Let's prettify this by projecting it onto the EOFs of the data.

In [None]:
from sklearn.decomposition import PCA

# Multiply the unstacked training inputs by the output weights. The result is
# not assigned, so it is only shown as the cell output.
# NOTE(review): unstack_cat presumably reverses stack_cat along "features";
# confirm in xnoah.data_matrix.
# NOTE(review): w_output is built from Y_test.features in get_ml_data, while
# the left operand here is the input X_train; confirm that this pairing and
# its broadcasting are intended.
unstack_cat(X_train, "features") * w_output