In [None]:
import xarray as xr
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
import matplotlib.pyplot as plt
from matplotlib.colors import SymLogNorm
from matplotlib.mlab import griddata
%matplotlib inline

In [None]:
# On-disk memoization cache rooted at ../data/cache/. The model-fitting cells
# further down wrap their fit calls in mem.cache(...) to go through this cache.
mem = joblib.Memory("../data/cache/")

Let's load the transformed data from the MCA analysis.

In [None]:
# Load the pickled inputs for this notebook.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only point
# these paths at files you trust.
# NOTE(review): io_data is not referenced again in the cells visible here.
io_data = joblib.load("../data/ml/ngaqua/data.pkl")
# lm_data = joblib.load("../data/ml/ngaqua/linear_model.pkl")
# mca_data is indexed with the key 'transformed' in the next cell.
mca_data = joblib.load("../data/ml/ngaqua/mca.pkl")

In [None]:
# The 'transformed' entry unpacks into two arrays: the input-side (x) and
# output-side (y) MCA component time series.
x,y = mca_data['transformed']

# Standardize every column independently. With its default settings
# StandardScaler removes the column mean and divides by the column standard
# deviation, so all components end up on a common, unit-variance scale.
x = StandardScaler().fit_transform(x)
y = StandardScaler().fit_transform(y)


# Rows are samples, columns are MCA components.
n_samp, n_comp= x.shape
# Bare expression: the notebook displays the shape as the cell output.
x.shape

The rows of these matrices correspond to all horizontal-temporal samples of the data. Let's plot the various pairwise probability density functions of the input (QT, SL, LHF, SHF) and output (Q1c, Q2) MCA modes.

In [None]:
def cross_hexbin_matrix(fun, x, y, axs=None, **kwargs):
    """Draw a grid of panels pairing every column of ``x`` with every column of ``y``.

    The panel in row ``i`` and column ``j`` is produced by calling
    ``fun(axs[i, j], x[:, j], y[:, i], **kwargs)``. Every panel has both of
    its axes limited to ``[-3, 3]``. The top row is titled with the x
    component number, the bottom row gets the matching x-axis label, and
    the left column gets the y component number as its y-axis label.

    Parameters
    ----------
    fun : callable
        Plotting callback invoked as ``fun(ax, x_column, y_column, **kwargs)``.
        Whatever it returns is collected in the returned list.
    x : ndarray, shape (n_samp, n_comp_x)
        One column per x component.
    y : ndarray, shape (n_samp, n_comp_y)
        One column per y component. Must have as many rows as ``x``.
    axs : array-like of axes, optional
        Existing axes to draw into, holding ``n_comp_y * n_comp_x`` entries.
        When omitted, a new figure with shared x and y axes is created.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``fun``.

    Returns
    -------
    axs : ndarray, shape (n_comp_y, n_comp_x)
        The grid of axes that was drawn into.
    ims : list
        Return values of ``fun``, in row-major panel order.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows.
    """
    n_samp, n_comp_x = x.shape
    n_comp_y = y.shape[1]

    if n_samp != y.shape[0]:
        raise ValueError("x and y must have same number of rows")

    if axs is None:
        # squeeze=False always yields a 2-D array of axes, even for a
        # single row, a single column, or a 1x1 grid.
        _fig, axs = plt.subplots(
            n_comp_y,
            n_comp_x,
            figsize=(2 * n_comp_x, 2 * n_comp_y),
            sharex=True,
            sharey=True,
            squeeze=False,
        )

    # Reshape into a (rows, cols) grid without modifying the shape of an
    # array the caller handed in.
    axs = np.asarray(axs, dtype=object).reshape(n_comp_y, n_comp_x)

    last_row = n_comp_y - 1
    ims = []
    for i in range(n_comp_y):
        for j in range(n_comp_x):
            ax = axs[i, j]
            ims.append(fun(ax, x[:, j], y[:, i], **kwargs))

            ax.set_ylim([-3, 3])
            ax.set_xlim([-3, 3])

            if i == 0:
                ax.set_title(f"X Comp {j+1}")
            if i == last_row:
                # Bottom row only: label the horizontal axis.
                ax.set_xlabel(f"X Comp {j+1}")
            if j == 0:
                ax.set_ylabel(f"Y Comp {i+1}")

    return axs, ims


def myhexbin(ax, x, y):
    """Call ``ax.hexbin`` on ``x`` and ``y`` with the 'inferno_r' colormap.

    Returns whatever ``ax.hexbin`` returns.
    """
    hexbin_options = {"cmap": "inferno_r"}
    return ax.hexbin(x, y, **hexbin_options)


def myline(ax, x, y):
    """Plot ``y`` against ``x`` as a line resampled onto 200 even steps.

    The samples are ordered by ``x``, linearly interpolated with
    ``np.interp`` onto 200 equally spaced points spanning the smallest to
    the largest ``x``, and drawn with ``ax.plot``.

    Returns whatever ``ax.plot`` returns.
    """
    order = x.argsort()
    x_sorted = x[order]
    y_sorted = y[order]

    # After sorting, the first and last entries are the range of x.
    grid = np.linspace(x_sorted[0], x_sorted[-1], 200)

    return ax.plot(grid, np.interp(grid, x_sorted, y_sorted))

In [None]:
# Draw the grid of pairwise hexbin panels: every x column against every y column.
axs, ims = cross_hexbin_matrix(myhexbin, x, y)
plt.tight_layout()
# A single colorbar is attached to all panels. It is built from the first
# panel's hexbin result only, so its scale is that panel's.
plt.colorbar(ims[0], ax=list(axs.flat), fraction=.05)

From this we can see that there is a significant nonlinear structure to the relationship between the input and output MCA components. This nonlinearity is particularly obvious when looking at the scatter plot between the first output MCA component and the various input components.

In addition to this nonlinearity, the blobbiness of these two-dimensional plots shows that the data are quite noisy.

How well can a linear model fit this data?

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one linear regression per output component and print its R2.
# LinearRegression.score is evaluated on the same x the model was fit on,
# so these are in-sample scores. The loop runs over however many columns y
# has rather than a hard-coded count.
for comp in range(y.shape[1]):
    target = y[:, comp]
    lm = LinearRegression().fit(x, target)
    score = lm.score(x, target)
    print(f"Component {comp+1} R2: {score}")

The R2 score of this data is not good at all. What do the pairwise pdf plots of $(x, \tilde{y})$, where $\tilde{y}$ is the predicted value of y, for the overall data look like?

In [None]:
# Fit one multi-output linear model on all y columns at once and predict on
# the same x it was trained on (in-sample predictions).
pred = LinearRegression().fit(x,y).predict(x)
# Same panel grid as for the real data, with the predictions in place of y.
# The trailing semicolon suppresses the notebook's echo of the return value.
cross_hexbin_matrix(myhexbin, x, pred);

As you can see, this plot looks almost completely different from the pdf plots above. For example, the top-left pane of both plots shows the joint distribution of the first input and output components. While the linear model can capture some curvature in these pairwise pdfs, it does not come close to approximating the nonlinearity of the actual MCA components.

# Nonlinear models for the first mode

Let's focus our analysis on just predicting the first MCA mode given the four input variables.

In [None]:
# Column 0 of y, i.e. the first output MCA component, as a 1-D target.
y0 = y[:,0]

As we saw before, the R2 for a linear fit is pretty low:

In [None]:
from sklearn.metrics import r2_score

def plot_prediction(true, y):
    """Hexbin plot of predicted values (horizontal) against observed values (vertical).

    ``y`` holds the predictions and ``true`` the observations. Counts are
    colored with the 'inferno_r' colormap under ``SymLogNorm(100)``, a
    colorbar is added, and a black line from (-3, -3) to (3, 3) marks
    where prediction equals observation. Both axes are limited to [-3, 3].
    Everything is drawn through the pyplot state machine.
    """
    lo, hi = -3, 3

    plt.hexbin(y, true, cmap='inferno_r', norm=SymLogNorm(100))
    plt.colorbar()

    # Reference diagonal: prediction == observation.
    plt.plot((lo, hi), (lo, hi), 'k-')

    for set_limits in (plt.xlim, plt.ylim):
        set_limits([lo, hi])

    plt.xlabel('Prediction')
    plt.ylabel('Observed')
    
def plot_performance(y_true, y_pred):
    """Draw the predicted-vs-observed plot and put the R2 score in its title.

    ``plot_prediction`` draws the plot; the title shows
    ``r2_score(y_true, y_pred)``.
    """
    plot_prediction(y_true, y_pred)
    plt.title(f"R2 = {r2_score(y_true, y_pred)}")

    
def marginal_prediction_comp_1(mod, n_features=4):
    """Produce predictions while varying only the first input feature.

    Builds 101 input rows whose first feature sweeps evenly from -3 to 3
    and whose remaining features are all 0, then evaluates ``mod`` on them.

    Parameters
    ----------
    mod : object
        Fitted model exposing ``predict(X)`` for a 2-D array ``X`` of
        shape (n_rows, n_features).
    n_features : int, optional
        Number of input features the model expects (default 4).

    Returns
    -------
    xg : ndarray, shape (101,)
        Values taken by the first feature.
    pred : object
        Whatever ``mod.predict`` returns for the constructed inputs.
    """
    xg = np.linspace(-3, 3, 101)

    # One row per grid value; only column 0 is non-zero.
    x_comp_1 = np.zeros((xg.size, n_features))
    x_comp_1[:, 0] = xg

    # Use the model that was passed in, not a notebook-level global.
    return xg, mod.predict(x_comp_1)



Here I plot the joint pdf of the predicted value of component 0 vs the actual value. I have used a non-uniform colorbar so we can see both the bulk of the distribution and its tail. For reference, I have included a line with a slope of 1. Ideally the pdf would tightly cluster around this line.

In [None]:
# In-sample check: the linear model is fit on (x, y0) and evaluated on the same x.
plot_performance(y0, 
                 LinearRegression().fit(x,y0).predict(x))

As we can see, the linear prediction is not good at all.

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Fit a random forest with default settings on all samples. The bound fit
# method is wrapped with mem.cache so the call goes through the joblib cache.
# NOTE(review): a new estimator instance is created on every run -- confirm
# that joblib actually reuses the cached result for a bound method like this.
rf = mem.cache(RandomForestRegressor().fit)(x,y0)
# Predictions on the training inputs themselves (in-sample).
rf_pred = rf.predict(x)

In [None]:
# rf_pred was computed on the training inputs, so the R2 shown here is an
# in-sample score and says nothing about performance on unseen data.
plot_performance(y0, rf_pred)

Random forest seems to do much better, but I am a little concerned about overfitting.

In [None]:
# rf_pred is 1-D; [:, None] adds a column axis so it is handled as a single
# y component, giving one row of panels.
cross_hexbin_matrix(myhexbin, x, rf_pred[:,None]);

This looks very similar to the first row of the scatter plot matrix.

Let's now look at what the output looks like when components 2-4 are fixed at 0 and component 1 is varied.

In [None]:
# marginal_prediction_comp_1 returns a (grid, predictions) pair; the star
# unpacks it into the x and y arguments of plt.plot.
plt.plot(*marginal_prediction_comp_1(rf))

As we can see, the fit is pretty noisy, but it captures the general features of the data very well.

## MARS

In [None]:
from pyearth import Earth

This code is too slow to use all the samples, so I have to subsample the data.

In [None]:
# Draw a random subsample of row indices without replacement.
# A fixed seed makes the subset reproducible from run to run, which also
# keeps the arguments of any joblib-cached fit on this subset stable so the
# cache can be reused instead of refitting every time.
n_sub = min(100000, x.shape[0])  # never request more rows than exist
idx_rand = np.random.RandomState(0).choice(x.shape[0], n_sub, replace=False)

In [None]:
# Fit an Earth model with default settings, using only the rows selected by
# idx_rand rather than the full data set.
mars = Earth()
mars.fit(x[idx_rand], y0[idx_rand])

In [None]:
# Evaluated on every sample, although the model was fit on the idx_rand
# subset only.
plot_performance(y0, mars.predict(x))

I am not sure if I am tuning this incorrectly somehow; I am a bit surprised the performance is so bad. Actually, these errors come from training on only a subset of the data. Here is the error plot for RandomForest trained with just the small subset:

In [None]:
# Refit the random forest on the idx_rand subset only, then evaluate it on
# every sample.
# NOTE: this rebinds rf and rf_pred, replacing the forest that was fit on
# all samples in the earlier cell.
rf = mem.cache(RandomForestRegressor().fit)(x[idx_rand],y0[idx_rand])
rf_pred = rf.predict(x)
plot_performance(y0, rf_pred)

## Neural networks

Finally, we get to NN

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with three hidden layers of 10 ReLU units each,
# trained in minibatches of 10000 samples. max_iter is left at its default.
nn = MLPRegressor(hidden_layer_sizes=(10,10,10), activation='relu', batch_size=10000)
# The fit goes through the joblib cache; nn is rebound to the value fit returns.
nn = mem.cache(nn.fit)(x, y0)

# In-sample evaluation on the same x used for training.
plot_performance(y0, nn.predict(x))

NN doesn't seem to perform that well even when trained with all the samples.

Does it work better when a larger batch size is used? It seems like the neural network is not looking at all of our data.

In [None]:
# Same architecture and batch_size as the previous fit; the only change is
# max_iter=10000.
# NOTE(review): the accompanying text asks about a larger batch size, but
# batch_size is unchanged here -- confirm which experiment was intended.
nn = MLPRegressor(hidden_layer_sizes=(10,10,10), activation='relu', batch_size=10000, max_iter=10000)
nn = mem.cache(nn.fit)(x, y0)

# In-sample evaluation on the same x used for training.
plot_performance(y0, nn.predict(x))

It does not seem to be getting better. Can this neural network implementation possibly be looking at all of the samples when it trains so quickly? Part of me suspects that the random forest must be overfitting the data.