In [None]:
%pylab inline

In [None]:
import scipy.stats as ss
import xarray as xr
from gnl.plots import loghist
from gnl.xarray import xr2mat

from sklearn.linear_model import LogisticRegression

In [None]:
# Open the `qt` and `sl` NetCDF files as a single multi-file dataset.
D = xr.open_mfdataset(
    [f"../2017-08-12/wd/calc/{name}.nc" for name in ['qt', 'sl']]
)

# Load the 2-D `Prec` field and conform its coordinates to those of D so the
# two can be stacked into matching samples later on.
Prec = (
    xr.open_dataarray("../2017-08-12/wd/A64/2d/Prec.nc")
    .reindex_like(D)
)

In [None]:
# Flatten D into a 2-D matrix: ('x', 'time') are passed as the sample
# dimensions and 'z' as the feature dimension. The second return value of
# xr2mat is not needed here.
# NOTE(review): presumably X is laid out as (samples, features) -- confirm
# against gnl.xarray.xr2mat.
X, _ = xr2mat(D, ['x', 'time'], ['z'])

# Binary label per (x, time) sample: True where Prec exceeds the threshold.
y = Prec.stack(samples=["x", "time"]) > 1e-3

In [None]:
# Value of the `time` coordinate that separates training from testing data.
t_split = 50

# Training window: 40 <= time <= t_split. Label-based slices in xarray include
# BOTH endpoints, so samples at exactly t_split belong to the training set.
train_window = slice(40, t_split)
Xtrain = X.sel(time=train_window).values
ytrain = y.sel(time=train_window).values

# Test window: time strictly greater than t_split. A plain
# `slice(t_split, None)` would also pick up time == t_split and leak those
# samples into both sets, so the boundary is masked out explicitly.
# Assumes the sample dimension is the leading axis of `.values`, which the
# scikit-learn calls further down already require.
Xtest_da = X.sel(time=slice(t_split, None))
ytest_da = y.sel(time=slice(t_split, None))
Xtest = Xtest_da.values[Xtest_da["time"].values > t_split]
ytest = ytest_da.values[ytest_da["time"].values > t_split]

# Random row subindices for faster training. A fixed seed makes the notebook
# reproducible under Restart & Run All, sampling without replacement avoids
# duplicated rows, and the size is capped so small training sets do not fail.
rng = np.random.RandomState(0)
n_sub = min(10000, Xtrain.shape[0])
inds = rng.choice(Xtrain.shape[0], n_sub, replace=False)

Around 50% of the spatio-temporal samples have precipitation below 0.001 mm/day.

# Logistic Regression

In [None]:
# Baseline linear classifier with scikit-learn's default settings, fitted on
# every sample in the training window (not the `inds` subsample).
mod = LogisticRegression()

mod.fit(X=Xtrain, y=ytrain)

The performance is pretty good

In [None]:
# Mean accuracy of the fitted classifier on the held-out test samples.
mod.score(X=Xtest, y=ytest)

In [None]:

# Sweep the regularization parameter C over seven decades, refitting the same
# estimator each time and reporting its test accuracy. After the loop `mod`
# holds the fit for the last value (C = 100.0).
# NOTE(review): C is chosen by looking at the test score; a separate
# validation split would give an unbiased estimate -- confirm if that matters.
for C in (1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0):
    mod.set_params(C=C).fit(Xtrain, ytrain)
    score = mod.score(Xtest, ytest)
    print(f"C = {C}, score = {score}")

The best testing performance is around 82% accuracy for the logistic model

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the inputs to their 4 leading principal components, learned from the
# training data only. The seed pins any randomized SVD solver PCA may choose.
pca = PCA(n_components=4, random_state=0)
pca.fit(Xtrain)

# Training inputs expressed in the PCA basis.
ztrain = pca.transform(Xtrain)

# 20-tree random forest on the PCA-transformed inputs. Seeding the forest makes
# the reported score reproducible under Restart & Run All.
mod = RandomForestClassifier(n_estimators=20, random_state=0)
mod.fit(ztrain, ytrain)

# Test accuracy; the test inputs go through the same fitted PCA projection.
mod.score(pca.transform(Xtest), ytest)

Random forests really benefit from transforming the data first using PCA. The classification error is about 60% without this. I think random forests are too sensitive to the basis used to describe the data.

# SVM

In [None]:
from sklearn.svm import SVC

SVM is pretty slow for large numbers of samples. This is because its cost scales quadratically with the number of samples: it needs to compute a pairwise distance matrix between all samples.

In [None]:
# Fit the SVM on 1000 training rows only, because of its cost on large sample
# counts. The rows are taken from `inds` (the randomly drawn training indices
# defined in the train/test split cell) rather than from the head of Xtrain:
# the first 1000 rows of the stacked array are a contiguous block, not a
# representative sample of the training window.
svm_inds = inds[:1000]

mod = SVC()
mod.fit(Xtrain[svm_inds], ytrain[svm_inds])

# Test accuracy on the full held-out set.
mod.score(Xtest, ytest)

# Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:

# Architecture: `nlayers` hidden layers of `nhidden` units each.
nlayers = 3
nhidden = 10

# Sweep the L2 penalty `alpha`, training a fresh network on the `inds`
# subsample each time and reporting its test accuracy. A fixed random_state
# pins weight initialization and batch shuffling, so the scores (and the
# weights inspected afterwards) are reproducible under Restart & Run All.
# After the loop `net` is the network trained with the LAST alpha (2), which
# is the one the weight plot further down visualizes.
for alpha in [.0001, .001, .01, .1, 1, 2]:
    net = MLPClassifier(
        hidden_layer_sizes=(nhidden,) * nlayers,
        alpha=alpha,
        random_state=0,
    )
    net.fit(Xtrain[inds], ytrain[inds])
    score = net.score(Xtest, ytest)
    print(f"alpha = {alpha}, score = {score}")

It seems like 81-82% test-accuracy is the best performance we are getting from any of these methods.

Let's look at the structures the neural network has learned. These are the first layer's weights.

In [None]:
# First-layer weights of the MLP. `coefs_[0]` is (n_inputs, n_hidden); after
# the transpose each row of the image is one hidden unit and each column one
# input feature.
fig, ax = plt.subplots()
mesh = ax.pcolormesh(net.coefs_[0].T)
fig.colorbar(mesh, ax=ax, label="weight")
ax.set(
    xlabel="input feature index",
    ylabel="hidden unit",
    title="MLP first-layer weights",
);

These look similar to the vertical modes generated by the PCA analysis

In [None]:
# Principal axes found by the PCA. `components_` is (n_components, n_features),
# so each row of the image is one component and each column one input feature.
fig, ax = plt.subplots()
mesh = ax.pcolormesh(pca.components_)
fig.colorbar(mesh, ax=ax, label="loading")
ax.set(
    xlabel="input feature index",
    ylabel="principal component",
    title="PCA components",
);

**Key question**: How does this compare to the trigger of a traditional deep convection scheme?

Also, is the neural network more robust to using a wide variety of training data? This is what I would expect.