# Mocap Initial Modelling

* Deterministic LDS
* Not doing delta: error accumulation is bad

In [1]:
# using Revise
using LinearAlgebra, Random
using StatsBase, Statistics
using Distributions, MultivariateStats   # Categorical, P(P)CA
using Quaternions    # For manipulating 3D Geometry
using MeshCat        # For web visualisation / animation
using PyPlot         # Plotting
using AxUtil         # Cayley, skew matrices
using Flux           # Optimisation
using DSP            # convolution / low-pass (MA) filter

# small utils libraries
using ProgressMeter, Formatting, ArgCheck
using DelimitedFiles, NPZ, BSON

In [2]:
# Reset, in place, the `grad` array of every element of `P` to zero.
# Each element only needs to expose a writable array through `.grad`.
function zero_grad!(P)
    foreach(P) do p
        p.grad .= 0
    end
end

In [3]:
DIR_MOCAP_MTDS = "." 

# Data loading and transformation utils
include(joinpath(DIR_MOCAP_MTDS, "io.jl"))

# MeshCat skeleton visualisation tools
include(joinpath(DIR_MOCAP_MTDS, "mocap_viz.jl"))

# Data scaling utils
include(joinpath(DIR_MOCAP_MTDS, "util.jl"))
import .mocaputil: MyStandardScaler, scale_transform, invert
import .mocaputil: OutputDifferencer, difference_transform, fit_transform
import .mocaputil: no_pos, no_poscp

# Models: LDS
include(joinpath(DIR_MOCAP_MTDS, "models.jl"))
import .model: Astable

### Load in Data
See `2_Preprocess.ipynb`

**Note that in the current harddisk state, `edin_Ys.bson` was created with `include_ftcontact=false`**


In [4]:
# Index the files of the data directory by style. The regex capture group takes the
# lowercase run at the start of each file name, up to the first `_`.
database = "../data/mocap/edin-style-transfer/"
files_edin = joinpath.(database, readdir(database));
style_name_edin = map(m -> m[1], match.(r"\.\./[a-z\-]+/[a-z\-]+/[a-z\-]+/([a-z]+)_.*", files_edin));
styles = unique(style_name_edin)
# For each distinct style, the positions (into `files_edin`) of its files.
styles_lkp = map(s -> findall(style_name_edin .== s), styles);

In [5]:
# Raw inputs (stored under :Xs) and outputs (stored under :Ys), one entry per sequence.
Usraw = BSON.load("edin_Xs_30fps.bson")[:Xs];
Ysraw = BSON.load("edin_Ys_30fps.bson")[:Ys];

# Scalers are fitted on all sequences stacked row-wise, i.e. pooled over every style.
standardize_Y = fit(MyStandardScaler, reduce(vcat, Ysraw),  1)
standardize_U = fit(MyStandardScaler, reduce(vcat, Usraw),  1)

# Shift so that row t of `Us[i]` is paired with the output one step later.
Ys = [scale_transform(standardize_Y, y[2:end, :] ) for y in Ysraw];  # (1-step ahead of u)
Us = [scale_transform(standardize_U, u[1:end-1,:]) for u in Usraw];  # (1-step behind y)

# Leakage guard on the first sequence: no (non-NaN) input/output column pair may have
# |correlation| ≈ 1 after the one-step shift.
@assert (let c=cor(Usraw[1][1:end-1, :], Ysraw[1][2:end, :], dims=1);
        !isapprox(maximum(abs.(c[.!isnan.(c)])), 1.0); end) "some input features are perfectly correlated with an output feature"

In [6]:
# SENSE CHECK
# Check for bugs in constructing U, Y (especially that the time indices align, i.e.
# that U can be used to predict Y) by inspecting the cross-correlation between every
# column of the pooled inputs and every column of the pooled outputs.
let c=cor(reduce(vcat, Us) |>f64, reduce(vcat, Ys) |> f64, dims=1)
    imshow(c, aspect="auto")
    # Drop NaN correlations before taking summary statistics.
    nonan_c = c[.!isnan.(c)]
    title(format("max (abs) corrcoeff: {:.8f}", maximum(abs.(nonan_c))))
    flush(stdout)
    # NOTE(review): the `- 2` presumably assumes the NaN entries are exactly two full
    # rows of `c` (e.g. two constant input columns); the reshape throws for any other
    # NaN pattern — confirm against the data.
    display(findmax(reshape(nonan_c, size(c, 1) - 2, size(c,2))))
    # 10th largest (signed) correlation.
    printfmtln("10th best result {:.5f}", reverse(sort(nonan_c))[10]) 
end

In [7]:
# Undo the output preprocessing of sequence `i`: first invert the standard scaling,
# then invert the per-sequence transform `dtforms[i]`.
function invert_output_tform(y, i)
    unscaled = invert(standardize_Y, y)
    return invert(dtforms[i], unscaled)
end

we can reconstruct the original data via the commands:

    invert_output_tform(y, i)       # for (i,y) in enumerate(Ys)
    invert(standardize_U, u)        # for u in Us
    
in the relevant array comprehensions.

In [7]:
?mocapio.construct_inputs

In [8]:
?mocapio.construct_outputs

In [102]:
# MAT.matwrite("angry1_UY.mat", Dict("cYs"=>cYs, "cUT"=>cUT, "version"=>"may23_1809"))

# Initialise LDS

## Understand PC distribution

#### Single-task Model

In [370]:
fig, axs = subplots(4,2,figsize=(5.5,6))

# One panel per style: cumulative proportion of variance explained by the principal
# components of output columns 4:63, pooled over that style's sequences.
for (i, ixs) in enumerate(styles_lkp)
    # Keep the scratch variables loop-local: `cYs` is also a notebook-level variable
    # (with a transposed layout) that later cells rely on, and must not be overwritten
    # when this cell is re-run.
    local ax, cYs, pc_all, varexpl, bd
    ax = axs[:][i]

    cYs = reduce(vcat, Ys[ixs])
    pc_all = fit(PCA, cYs[:,4:63]', pratio=0.999)

    varexpl = cumsum(principalvars(pc_all))/tvar(pc_all)
    # Number of components needed to exceed 90%, 95% and 99% of the total variance.
    bd = findfirst.([varexpl .> x for x in [0.9,0.95,0.99]])
    ax.plot(1:length(varexpl), varexpl)
    ax.axhline(1, linestyle=":")
    for b in bd
        # Dotted guide lines from the axes to the curve at each threshold.
        ax.plot([b,b], [varexpl[1], varexpl[b]], color=ColorMap("tab10")(7), linestyle=":")
        ax.plot([.5, b], [varexpl[b], varexpl[b]], color=ColorMap("tab10")(7), linestyle=":")
        ax.text(b+0.3,varexpl[1]+0.03, b)
    end
    # Set both limits on this panel; `gca()` would only ever address the current
    # (most recently created) axes, leaving the other panels with default y-limits.
    ax.set_xlim(0.5,34.5); ax.set_ylim(varexpl[1],1.025);
    ax.xaxis.set_ticklabels([])
    ax.set_title(styles[i])
end
tight_layout()

#### Pooled Model

In [371]:
# Pool the (scaled) outputs of every sequence of every style, stacked row-wise.
allE = reduce(vcat, Ys);
allE = convert(Matrix{Float32}, allE);

# PCA on columns 4:63 (observations as columns after the transpose).
pc_all = fit(PCA, allE[:,4:63]', pratio=0.999)

# Cumulative proportion of variance explained by the leading components.
varexpl = cumsum(principalvars(pc_all))/tvar(pc_all)
# Number of components needed to exceed 90%, 95% and 99% of the total variance.
bd=findfirst.([varexpl .> x for x in [0.9,0.95,0.99]])
plot(1:length(varexpl), varexpl)
gca().axhline(1, linestyle=":")
for b in bd
    # Dotted guide lines from the axes to the curve at each threshold.
    plot([b,b], [varexpl[1], varexpl[b]], color=ColorMap("tab10")(7), linestyle=":")
    plot([.5, b], [varexpl[b], varexpl[b]], color=ColorMap("tab10")(7), linestyle=":")
    text(b+0.3,varexpl[1]+0.03, b)
end
gca().set_xlim(0.5,37.5); gca().set_ylim(varexpl[1],1.025);
gcf().set_size_inches(3,2)

## LDS Initialisation

\begin{align}
\mathbf{x}_t &= A \mathbf{x}_{t-1} + B \mathbf{u}_t + \mathbf{b}\\
\mathbf{y}_t &= C \mathbf{x}_{t} + D \mathbf{u}_t + \mathbf{d}
\end{align}

#### Initialisation

Note in the below I use the SVD construction for PCA for convenience, but in general $Y$ is not centered ($\because$ centering is done over *all* styles simultaneously; each individual style will not be centered). Therefore, the below assumes this centering is done temporarily before the SVD.

* $C = U_{SVD}$, where $U_{SVD}$ are the principal components of $Y$.
* $X = S_{SVD}V_{SVD}^{\mathsf{T}}$, where $S_{SVD}$, $V_{SVD}$ are the other matrices from the SVD.
* $X \approx \tilde{U}\tilde{B} \Rightarrow \tilde{B} = (\tilde{U}^{\mathsf{T}} \tilde{U})^{-1} \tilde{U}^{\mathsf{T}} X$ (Regression of $X$ on $U$). The permutation of $U$ and $B$ in the first equation follows because the obs are column-wise here, not row-wise.
    * Here, $\tilde{U} = \begin{bmatrix} U & \mathbf{1} \end{bmatrix}$, and hence $\tilde{B} = \begin{bmatrix} B & \mathbf{b} \end{bmatrix}$

## Single task LDS

In [8]:
# Select one style and concatenate all of its sequences into single matrices.
style_ix = 1
# Outputs, transposed so that each column is one time step.
cYs = Matrix(transpose(reduce(vcat, Ys[styles_lkp[style_ix]])))
# Inputs with one row per time step (`cUs`) and one column per time step (`cUT`).
cUs = reduce(vcat, Us[styles_lkp[style_ix]])
cUT = Matrix(cUs');
# Number of time steps in the concatenated series.
cN = size(cYs, 2)

In [9]:
# Root-mean-square of the entries of `Δ`, multiplied by `scale`
# (which defaults to the number of rows of `Δ`).
function rmse(Δ, scale=size(Δ, 1))
    mean_sq = mean(x -> x^2, Δ)
    return sqrt(mean_sq) * scale
end

In [10]:
# Simple baselines, scored with `rmse` (defined in an earlier cell).
baselines = Dict()
# "copy": predict each time step by the previous time step's output.
baselines["copy"] = rmse(cYs[:,2:end] - cYs[:,1:end-1])

# cUs_m1 = cUs[1:end-1,:];
# "LR": least-squares fit of the outputs on the inputs plus a constant row of ones
# (the last column of `CDd` multiplies that constant row).
CDd = cYs / [cUT; ones(1, cN)]
baselines["LR"] = rmse(cYs[:,:] - CDd * [cUT; ones(1, cN)])

In [12]:
perf_init

In [11]:
perf_init = [rmse(model.init_LDS_spectral(cYs, cUT, k)(cUT) - cYs) for k in 5:5:60];

In [51]:
perf_init = [rmse(model.init_LDS_spectral(cYs, cUT, k)(cUT) - cYs) for k in 5:5:60];

In [794]:
perf_init[4]

In [13]:
plot(5:5:60, perf_init); gcf().set_size_inches(3,2)
gca().axhline(baselines["copy"], linestyle=":")
gca().axhline(baselines["LR"], linestyle=":", color="red")

In [22]:
clds = model.init_LDS_spectral(cYs, cUT, 10; max_iter=1)

In [14]:
imshow(clds.C[:,1:2]'); gca().set_aspect("auto")

In [26]:
eigvals(Astable(clds))

In [27]:
fig, axs = subplots(1,2,figsize=(8,4))
axs[1].plot(model.state_rollout(clds, cUT[:,1:500])[2:5,:]')
axs[2].plot(model.state_rollout(clds, cUT[:,1:500])[6:10,:]')

In [762]:
fig, axs = subplots(1,2,figsize=(8,4))
axs[1].plot(model.state_rollout(clds, cUT[:,1:500])[1:5,:]')
axs[2].plot(model.state_rollout(clds, cUT[:,1:500])[6:10,:]')

In [619]:
perf_init

In [769]:
# Stack four time-shifted copies of the inputs / outputs on top of each other, so that
# each column holds four consecutive time steps. `cN` (= size(cYs, 2)) is used for the
# series length so this cell does not depend on the global `N`, which is only assigned
# by the training cells further down.
Uouch = reduce(vcat, [cUT[:,i:cN-4+i] for i in 1:4])
Youch = reduce(vcat, [cYs[:,i:cN-4+i] for i in 1:4])

In [775]:
pUOuch = pinv(Uouch)

In [785]:
perp = Youch - (Youch * pUOuch) * Uouch

In [788]:
semilogy(svd(perp).S)

In [None]:
# NOTE(review): `pUOuch = pinv(Uouch)`, so `Youch*pUOuch` has one column per row of
# `Uouch`, whereas `Youch` has one column per time step; unless those counts happen to
# coincide this subtraction throws a DimensionMismatch. The `perp` cell above computes
# `Youch - (Youch * pUOuch) * Uouch`, which may be what was intended — confirm.
Youch - Youch*pUOuch

In [882]:
clds.a[1:20]

In [884]:
alt_clds = model.init_LDS_spectral(cYs, cUT, k; max_iter=5);

In [995]:
clds.a[1:20]

In [476]:
clds_save = copy(clds);

In [817]:
size(cYs, 2)/256

### Testing structure of A in coordinate descent

In [84]:
# Reference pattern: a 10x10 symmetric banded matrix with ones on the diagonal and
# 0.7 / 0.5 / 0.3 on the first / second / third off-diagonals.
refA = diagm(1=>ones(9)*0.7, 2=>ones(8)*0.5, 3=>ones(7)*0.3)
refA += refA'
refA[diagind(refA)] .= 1

# alt_clds = model.init_LDS_spectral(cYs, cUT, 10; max_iter=4, λ=1f-4, return_hist=false);
# Transition matrix of a previously fitted model (`alt_clds` must already exist).
_A = model.Astable(alt_clds)

# Minimise sum(|W * _A - refA|) over W = cayley_orthog(q, 10) using ADAM on `q`.
# `q` has 45 = 10*9/2 entries — presumably the free parameters of a 10x10 orthogonal
# matrix under the Cayley map (TODO confirm against AxUtil).
# NOTE: `begin` does not open a new scope, so `n_epochs`, `history`, `opt` and `q` are
# assigned at notebook level and shared with other cells.
@time h = begin
    n_epochs = 20000
    history = zeros(n_epochs)
    
    opt = ADAM(1e-3)
    q = Flux.param(zeros(45))
    for ee in 1:n_epochs
        W = AxUtil.Math.cayley_orthog(q, 10)
        obj = sum(abs.(W * _A - refA))
        history[ee] = obj.data
        Tracker.back!(obj)
        # NOTE(review): the gradient is passed negated; whether this yields descent
        # depends on the sign convention of `Tracker.update!` in the installed
        # Flux/Tracker version — confirm.
        Tracker.update!(opt, q, -Tracker.grad(q))
        # Print the objective every 2000 iterations.
        ee % 2000 == 0 && println(history[ee])
    end
    history
end;

In [86]:
fig, axs = subplots(1,2,figsize=(8,4))
tmp = axs[1].imshow(AxUtil.Math.cayley_orthog(q.data, 10) * _A)
colorbar(tmp)
axs[2].plot(svd(_A).S)
axs[1].set_xlabel("(a)")
axs[2].set_xlabel("(b)")
tight_layout()
# savefig("LDS_A_structure_2.pdf")

In [1087]:
_A^100

In [1030]:
eigvals(_A)

In [1045]:
svd(_A'*_A).S

In [1115]:
(256*(10-1)+1):min(256*10, N)

In [1123]:
svd(model.Astable(clds)).S

In [1146]:
_B

In [1145]:
α = std(clds.C, dims=1)

In [1143]:
1 ./ std(clds.C, dims=1)

In [1144]:
rmse(clds(cUT) - cYs)

In [1155]:
AxUtil.Math.cayley_orthog(clds.a[11:end], 10)

In [1185]:
eigvals(model.Astable(clds))

In [1181]:
plot(clds.a[11:end])

In [1183]:
AxUtil.Math.inverse_cayley_orthog((α[:] .* AxUtil.Math.cayley_orthog(clds.a[11:end], 10) ./  α[:]')) |> plot

In [62]:
perf_init[8]

In [70]:
rmse(model.init_LDS_spectral(cYs, cUT, 40; max_iter=4, λ=1f-4)(cUT) - cYs)

In [72]:
alt_model = model.init_LDS_spectral(cYs, cUT, 10; max_iter=4, λ=1f-4)

In [76]:
imshow(model.Astable(alt_model))

In [73]:
k = 40
clds_orig = model.init_LDS_spectral(cYs, cUT, k; max_iter=5, λ=1f-4);
# clds_orig.a[1:k] .= atanh(0.90)
# clds_orig.a[k+1:end] .= tanh.(clds_orig.a[k+1:end])
# model.fit_optimal_obs_params(clds_orig, cYs, cUT);

In [74]:
# clds_g = model.make_grad(model.init_LDS_spectral(cYs, cUT, k))
clds_g = model.make_grad(clds_orig)
clds   = model.make_nograd(clds_g)   # MUST DO THIS SECOND, (Flux.param takes copy)

opt = ADAM(1e-4)
opt_hidden = ADAM(0.7e-5)

In [75]:
opt_hidden = ADAM(0.7e-5)
ps_hidden = Flux.params(clds_g.a, clds_g.B, clds_g.b)
ps_observ = Flux.params(clds_g.C, clds_g.D, clds_g.d)

In [81]:
rmse(clds(cUT) - cYs)

In [80]:
model.fit_optimal_obs_params(clds, cYs, cUT);

In [69]:
# Per-time-step error: RMS over the output dimensions (one value per column) of the
# difference between the targets and the model prediction.
let tmpr1 = mapslices(z->sqrt(mean(x->x^2, z)), cYs[:,1:end] - clds(cUT[:,1:end]), dims=1)[1:end]
    # Smooth with a length-30 rectangular window and trim both ends of the convolution.
    # NOTE(review): `:31` appears to parse as the quoted literal 31, making the index
    # the range `31:end-30` — confirm; it reads like a typo.
    plot(conv(f64(tmpr1), Windows.rect(30))[:31:end-30])
end
title("error over time")

In [19]:
plot(clds.a[41:end])

In [87]:
baselines["LR"]/8

In [88]:
5e-4 / 10

In [89]:
0.02e-4 * 10 * 0.005 * 3

In [84]:
# Train `clds_g` on consecutive minibatches of 256 time steps, copying the final hidden
# state of each batch into the model before the next batch; `h` receives the per-batch
# objective values for every epoch.
# NOTE(review): the batch count 28 is hard-coded (in `history`, `W`, `batch_order` and
# the epoch summary); it presumably equals cld(N, 256) for the current data — confirm.
@time h = begin
    n_epochs = 100
    history = zeros(n_epochs*28)
    N = size(cYs, 2)
    # Size of each batch relative to a full batch of 256 (the last one may be partial).
    # NOTE(review): `W` is not referenced again in this cell.
    W = [min(256*tt, N) - 256*(tt-1) for tt in 1:28]/256
    
    for ee in 1:n_epochs
        # Learning-rate schedule: both rates are zero during epochs 1 and 2 and are set
        # to their working values at epoch 3.
        if ee == 1
            opt.eta, opt_hidden.eta = 0., 0.
        elseif ee == 3
            opt.eta = 5e-4 / 10
            opt_hidden.eta = 0.02e-4 * 10 * 0.005 * 3
        elseif ee % 100 == 0
            # NOTE(review): the decay below is commented out, so this branch only
            # prints the message; the learning rate is not actually changed.
#             opt.eta /= 1.5
            printfmtln("Changed learning rate!")
        end
        # Batches are visited in temporal order so that the hidden state can be carried.
        batch_order = 1:28   # => now retaining state #randperm(58)
        for tt in batch_order
            # Column range of this batch (clipped to the series length).
            ixs = (256*(tt-1)+1):min(256*tt, N)
            _cY, _cU = cYs[:,ixs[1:end]], cUT[:, ixs[1:end]]
#             Yhat = clds_g(_cU)
            # Latent states for the batch, then the linear observation model.
            X̂ = model.state_rollout(clds_g, _cU); 
            Yhat = clds_g.C * X̂ + clds_g.D * _cU .+ clds_g.d;
            # Mean squared error scaled by 64^2.
            obj = mean(x->x^2, _cY - Yhat)*64^2
            history[(ee-1)*28 + tt] = obj.data
            Tracker.back!(obj)
#             display(Tracker.grad(clds_g.a))
            # Separate optimisers for the hidden (a, B, b) and observation (C, D, d)
            # parameters.
            # NOTE(review): gradients are passed negated; the resulting step direction
            # depends on the `Tracker.update!` sign convention — confirm.
            for p in ps_hidden
                Tracker.update!(opt_hidden, p, -Tracker.grad(p))
            end
            for p in ps_observ
                Tracker.update!(opt, p, -Tracker.grad(p))
            end
#             zero_grad!(ps_observ)
#             zero_grad!(ps_hidden)
            # Carry the last latent state of this batch into the next batch.
            clds_g.h.data .= X̂.data[:,end]
        end
        # Epoch summary: square root of the mean batch objective.
        println(sqrt(mean(history[(1:28) .+ 28*(ee-1)])))
        # Reset the initial state before the next pass over the series.
        clds_g.h.data .= zeros(size(clds_g, 1))
    end
    history
end;

In [83]:
plot(sqrt.(conv(h, Windows.rect(28))[28+200:end-27]/28))

## Initialising the Transition Matrix

The transition matrix will be parameterised as:

$$ A = D(\psi_2) Q(\psi_1) $$

where $D$ is a diagonal matrix with elements in $[0,1]$ and $Q$ is a special orthogonal matrix with determinant $+1$. Our goal will be to use the initialisation calculated above, coerced into this form.

Unfortunately, we are just as likely to have an initial $A_0$ with determinant $-1$ as $+1$. My previous arguments about measure zero $\lambda = 1 - 0i$ are not true. It appears that reflections are commonly learned. Therefore, we must deal with this issue later. But first:

Decompose:
$$ A_0 = U_0 S_0 V_0^T $$

Then $D(\psi_2) = S_0$ and $Q(\psi_1) = U_0 V_0^T$.


### Diagonal Matrix
If parameterising the diagonal of $D$ with a sigmoid nonlinearity, we must apply the inverse sigmoid to $\text{diag}(S)$, i.e. $\sigma^{-1}(y) = \ln\left(\frac{y}{1-y}\right)$. Care must be taken to avoid the endpoints $y \in \{0, 1\}$ for numerical reasons, but also because it is not sensible to initialise to a position with no gradient. I have used a minimum distance from the boundaries of $10^{-3}$ (which translates to $x \approx \pm 6.9$). 


### Orthogonal Matrix
In order to obtain the Cayley parameterisation of $Q = U_0 V_0^T$ we take the inverse Cayley transformation $S = (I - Q)(I + Q)^{-1}$ to obtain the skew-symmetric matrix $S$ which corresponds 1-to-1 to $Q$. We can then simply extract the lower triangular elements of $S$ as the unique $d(d-1)/2$ elements parameterising $Q$. If these parameters are $\psi_1$, I will write this as $\psi_1 = \text{cayley}^{-1}(Q)$. However, as we have said above, this is insufficient for obtaining a Cayley parameterisation of the estimate $A_0$ in general, since we exclude any $A_0$ s.t. $\det(A_0) = -1$. Ferrar (1950) tells us that a general orthogonal matrix can be parameterised as $J(I-S)(I+S)^{-1}$, where $J$ is a diagonal matrix with elements in $\{+1, -1\}$. Crucially we need as many negative elements ($-1$) as there are negative roots of $Q_0$ and we may choose them for convenience to precede all of the positive elements in $J$.

#### Corollary

1. $ A = D(\psi_2) Q(\psi_1) $ as before, but now with $D$ containing elements in $[-1,1]$ is sufficient to parameterise *any* orthogonal matrix. Note that we may use $\tanh$ instead of $\sigma$ to achieve this.
2. For the problem at hand, we need a *special* orthogonal matrix in order to apply the inverse Cayley transform. Now we know that $Q$ can be represented as 
    $$Q = J\tilde{Q}$$
    for $\tilde{Q} \in SO(d)$. Then clearly $\tilde{Q} = JQ$ and hence we have that 
    $$Q = J\,\text{cayley}\left(\text{cayley}^{-1}(JQ)\right).$$

Therefore we can parameterise $A_0 = J\,D(\psi_2) Q(\psi_1)$ where $\psi_1 =  \text{cayley}^{-1}(JQ)$ where $J$ is a $\{+1,-1\}$ diagonal calculated directly from the eigenvalues of $Q=U_0 V_0^T$.


## Visualise results

In [70]:
Ys_angry = reduce(vcat, Ys[1:5]);

In [71]:
Yhat = clds(cUT);

In [968]:
size(cYs)

In [72]:
# printfmtln("RMSE = {:.3f}", rmse(cYs - Yhat)); flush(stdout)

fig, axs = subplots(5,4,figsize=(10,10))
offset = 0
offset_tt = 1000
_Δt = 100
for i = 1:20
    axs[:][i].plot(Ys_angry[(1:_Δt-1) .+ offset_tt, i+offset])
    axs[:][i].plot(Yhat'[(1:_Δt-1) .+ offset_tt, i+offset])
end

### Animate!

In [172]:
?mocapio.construct_outputs

In [None]:
# # stupid reload module problems
# _standardize_Y = fit(model.mocaputil.MyStandardScaler, reduce(vcat, Ysraw),  1)
# _standardize_U = fit(model.mocaputil.MyStandardScaler, reduce(vcat, Usraw),  1)

In [932]:
tmp_unsup = model.unsup_predict(clds, cUT[:,1:1000], cYs[:,1:200], _standardize_Y, _standardize_U, 20);
Yhat_r_unsup = invert(standardize_Y, Matrix(tmp_unsup'));

In [933]:
rmse(tmp_unsup[:,1:200] - cYs[:,1:200])

In [950]:
tmp = clds(cUT[:,1:3000]);

In [344]:
Yhat_r = invert(standardize_Y, Matrix(tmp'));

In [948]:
size(Ysraw[2])

In [364]:
if !(@isdefined vis) 
    vis = Visualizer() # Create a new visualizer instance (MeshCat.jl)
    open(vis)
end
vis = mocapviz.create_animation([mocapio.reconstruct_modelled(Yhat_r)[5001:6000,:,:], 
                                 mocapio.reconstruct_modelled(reduce(vcat, Ysraw[1:5]))[5001:6000,:,:]], 
    "test"; vis=vis, linemesh=[mocapviz.redmesh, mocapviz.yellowmesh], camera=:back)

In [935]:
vis = mocapviz.create_animation([mocapio.reconstruct_modelled(Yhat_r_unsup)[1:500,:,:], 
                                 mocapio.reconstruct_modelled(Ysraw[1][1:500,:])], 
    "test"; vis=vis, linemesh=[mocapviz.redmesh, mocapviz.yellowmesh], camera=:back)

In [None]:
x̂ = rnn_state_rollout(rnn, cUT[:,1:1000]).data
#         display(D*_cU)
Yhat = ffnn(x̂) + C*x̂ + D*cUT[:,1:1000];

In [297]:
Yhat_r = invert(standardize_Y, Matrix(Yhat'));

# Are RNNs easier to train?

(**Answer: No... not by a long shot!**)

In [161]:
# Dimensions: outputs, inputs, recurrent state and the hidden layer of the read-out.
d_out = size(cYs, 1)
d_in = size(cUT, 1)
d_state = 200
d_ff = 50
# NOTE(review): `diffdmodel` is not referenced elsewhere in the visible cells.
diffdmodel = 0  #63

# Recurrent layer plus a two-layer feed-forward read-out from the state to the outputs.
rnn = RNN(d_in, d_state, tanh)
ffnn = Chain(Dense(d_state, d_ff), Dense(d_ff, d_out, identity))

# d_state*(d_state-1)/2 parameters; the training cell maps `a/10` through
# `cayley_orthog` to set the recurrent weight matrix.
a = param(zeros(Float32, Int(d_state*(d_state-1)/2)))
# Least-squares fit of y[t+1] on (u[t], 1), used to initialise the linear input-output
# map `D` and the output bias; `C` (state-to-output) is initialised randomly.
cUs_m1 = cUs[1:end-1,:];
CDd = cYs[:,2:end] / [cUs_m1'; ones(1, cN-1)]
D, C = Flux.param(deepcopy(CDd[:,1:end-1])), param(Flux.glorot_uniform(d_out, d_state))
ffnn.layers[2].b.data .= CDd[:,end];

In [167]:
# opt = ADAM(1e-3)
pars = Flux.params(rnn, ffnn.layers[1], ffnn.layers[2], C, D, a);

In [296]:
zero_grad!(pars)
Flux.reset!(rnn)
Flux.truncate!(rnn)

In [166]:
# opt = ADAM(1e-7)
pars = Flux.params(rnn.cell.Wi, rnn.cell.b, ffnn, C, D, a);

In [311]:
tmp = Tracker.collect(reduce(hcat, rnn.(cUList))).data   # weirdly mapleaves(data,...) version is 100x slower...?
_pre_linear = [mapleaves(Tracker.data, ffnn.layers[1])(tmp); tmp; cUT; ones(1, cN)]
CDd = model._tikhonov_mrdivide(cYs, _pre_linear, 1f-5);
ffnn.layers[2].W.data .= CDd[:,1:d_ff];
ffnn.layers[2].b.data .= CDd[:,end];
C.data .= CDd[:,(d_ff+1):(d_ff+d_state)];
D.data .= CDd[:,(d_ff+d_state+1):end-1];

In [313]:
Yhat = ffnn(tmp).data + C.data*tmp + D.data*cUT;

In [312]:
rmse(ffnn(tmp).data + C.data*tmp + D.data*cUT - cYs)

In [250]:
@benchmark model.state_rollout(clds_g, cUT[:,1:100])

In [252]:
# Feed the columns of `U` through `rnn` one at a time, in order, and return the
# per-step outputs concatenated horizontally (one column per time step).
function rnn_state_rollout(rnn::Flux.Recur, U::AbstractMatrix{T}) where T <: AbstractFloat
    steps = map(t -> rnn(U[:, t]), 1:size(U, 2))
    return hcat(steps...)
end

In [276]:
@benchmark rnn_state_rollout(rnn, cUT[:,1:100])

In [270]:
rnn.cell.Wi * randn(121) + rnn.cell.Wh * rnn.state

In [247]:
typeof(rnn)

In [307]:
# Train the RNN + read-out on minibatches of 256 time steps visited in random order;
# `history[ee]` accumulates the mean batch objective of epoch `ee`.
n_epochs = 300
history = zeros(Float32, n_epochs)
N = size(cYs, 2)
# Inputs as a list of per-time-step column vectors.
cUList = [cUT[:,i] for i in 1:N]

opt.eta = 2e-3 / 30
@time for ee in 1:n_epochs
    # NOTE(review): the batch count 28 is hard-coded; presumably cld(N, 256) — confirm.
    batch_order = randperm(28)
    for tt in batch_order
        # Column range of this batch (clipped to the series length).
        ixs = (256*(tt-1)+1):min(256*tt, N)
        # NOTE(review): `_cUL` is only used by the commented-out lines below.
        _cY, _cU, _cUL = cYs[:,ixs[1:end]], cUT[:, ixs[1:end]], cUList[ixs[1:end]]
        # Rebuild the recurrent weights from the parameter vector `a` for this batch.
        rnn.cell.Wh = AxUtil.Math.cayley_orthog(a/10, d_state)
#         x̂ = rnn.(_cUL)
#         x̂ = Tracker.collect(reduce(hcat, x̂))
        x̂ = rnn_state_rollout(rnn, _cU)
#         display(D*_cU)
        # Prediction: nonlinear read-out + linear state term + linear input term.
        ŷ = ffnn(x̂) + C*x̂ + D*_cU
        # Mean squared error scaled by 64^2.
        obj = mean(x->x^2, _cY - ŷ)*64^2
        Tracker.back!(obj)
        history[ee] += obj.data / length(batch_order)
        # NOTE(review): gradients are passed negated; the resulting step direction
        # depends on the `Tracker.update!` sign convention — confirm.
        for p in pars
            Tracker.update!(opt, p, -Tracker.grad(p))
        end
        # Batches are shuffled, so the recurrent state is reset after every batch.
        Flux.reset!(rnn)
        Flux.truncate!(rnn)
#         zero_grad!(pars)
    end
    println(sqrt(history[ee])); flush(stdout)
end

In [310]:
plot(sqrt.(history[1:155]))

In [519]:
# Score the purely linear part of the model (D * u + output bias) over minibatches of
# 256 time steps and print the square root of the mean batch MSE.
for ee in 1:1
    # Derive the batch count from the series length. The previous hard-coded 58 yields
    # empty batches (and hence an invalid mean) whenever the series has fewer than 58
    # batches; the training cells in this notebook use 28.
    n_batches = cld(N, 256)
    # `local` keeps this accumulator from overwriting the notebook-level `h`, which
    # holds the training history returned by the training cell.
    local sq_err = 0
    batch_order = randperm(n_batches)
    for tt in batch_order
        ixs = (256*(tt-1)+1):min(256*tt, N)
        # Pairs input column t with output column t+1 inside the batch.
        # NOTE(review): `Ys` is already shifted one step ahead of `Us` when the data is
        # loaded, so this additional shift may be a leftover — confirm.
        _cY, _cU = cYs[:,ixs[2:end]], cUT[:, ixs[1:end-1]]
        ŷ = D*_cU .+ ffnn.layers[2].b
        obj = mean(x->x^2, _cY - ŷ)
        sq_err += obj.data
    end
    println(sqrt(sq_err/length(batch_order)))
end

In [338]:
let tmpr1 = mapslices(z->sqrt(mean(x->x^2, z)), cYs[:,1:end] - Yhat, dims=1)[1:end]
    plot(conv(f64(tmpr1), Windows.rect(30))[:31:end-30])
end
title("error over time")

In [329]:
plot(1:100, 1:100)
plot(11:100, conv(collect(1:100), Windows.rect(10))[11:end-9])

In [336]:
# Heat-map of the prediction error per output dimension (rows) over time (columns),
# smoothed along time with a length-30 rectangular window.
let tmpr1 = cYs - Yhat
    # Buffer with 30 fewer columns than the raw error, to hold the trimmed convolution.
    tmpr2 = tmpr1[:,1:end-30]
    # NOTE(review): the row count 64 is hard-coded; presumably size(cYs, 1) — confirm.
    for j = 1:64
        # NOTE(review): `:31` appears to parse as the quoted literal 31, i.e. the range
        # `31:end-29` — confirm.
        tmpr2[j,:] = conv(f64(tmpr1[j,:]), Windows.rect(30))[:31:end-29]
    end
    imshow(tmpr2, aspect="auto"); colorbar()
end
title("error over time")