# Mocap Initial Modelling

* Deterministic LDS


In [1]:
# using Revise
using LinearAlgebra, Random
using StatsBase, Statistics
using Distributions, MultivariateStats   # Categorical, P(P)CA
using Quaternions    # For manipulating 3D Geometry
using MeshCat        # For web visualisation / animation
using PyPlot         # Plotting
using AxUtil         # Cayley, skew matrices
using Flux           # Optimisation
using DSP            # convolution / low-pass (MA) filter

# small utils libraries
using ProgressMeter, Formatting, ArgCheck
using DelimitedFiles, NPZ, BSON

In [2]:
"""
    zero_grad!(P)

Overwrite the `grad` array of every element of `P` with zeros, in place.
Returns `nothing`.
"""
function zero_grad!(P)
    foreach(p -> fill!(p.grad, 0), P)
    return nothing
end

In [3]:
DIR_MOCAP_MTDS = "." 

# Data loading and transformation utils
include(joinpath(DIR_MOCAP_MTDS, "io.jl"))

# MeshCat skeleton visualisation tools
include(joinpath(DIR_MOCAP_MTDS, "mocap_viz.jl"))

# Data scaling utils
include(joinpath(DIR_MOCAP_MTDS, "util.jl"))
import .mocaputil: MyStandardScaler, scale_transform, invert
import .mocaputil: OutputDifferencer, difference_transform, fit_transform
import .mocaputil: no_pos, no_poscp

# Models: LDS
include(joinpath(DIR_MOCAP_MTDS, "models.jl"))

### Load in Data
See `2_Preprocess.ipynb`

**Note that in the current harddisk state, `edin_Ys.bson` was created with `include_ftcontact=false`**


### Additional changes since `2_Preprocess`:

1. Performing a difference transform of joint positions in $Y$. Motivated by trivial predictions of copying previous frame working too well. I want to force the model to learn something.
2. Remove joint positions entirely from inputs:
    * Don't want prev positions for regression to output ($D$ matrix) as predicting a copy reduces the error close to zero, and makes it difficult to understand which model is performing best for the remaining delta.
    * Also don't want in latent state, as positions have already been projected to the latent state (a linear comb) => hence colinear.


In [4]:
# Locate the raw files and group their indices by the style prefix of the file name.
database = "../data/mocap/edin-style-transfer/"
files_edin = map(f -> joinpath(database, f), readdir(database));
# The first capture group is the lowercase prefix before the first underscore of the file name.
style_name_edin = map(m -> m[1], match.(r"\.\./[a-z\-]+/[a-z\-]+/[a-z\-]+/([a-z]+)_.*", files_edin));
styles = unique(style_name_edin)
# For each style: the indices (into `files_edin`) of the files carrying that style.
styles_lkp = map(s -> findall(style_name_edin .== s), styles);

In [5]:
# Raw inputs / outputs, one entry per file.
Usraw = BSON.load("edin_Xs.bson")[:Xs];
Ysraw = BSON.load("edin_Ys.bson")[:Ys];

# Fit one differencing transform per file; each result is indexed as (transform, transformed data).
Ys_dtform = map(y -> fit_transform(OutputDifferencer, y), Ysraw);
dtforms = getindex.(Ys_dtform, 1);
Ys = getindex.(Ys_dtform, 2);

# Scalers are fitted on the data pooled over all files.
standardize_Y = fit(MyStandardScaler, reduce(vcat, Ys),  1)
standardize_U = fit(MyStandardScaler, reduce(vcat, Usraw),  1)

Ys = map(y -> scale_transform(standardize_Y, y), Ys);
# Drop the last input frame of each file so inputs align with the (differenced) outputs.
Us = map(u -> scale_transform(standardize_U, u[1:end-1,:]), Usraw);

In [6]:
# Undo the output preprocessing for file `i`: first the standardisation, then that
# file's differencing transform.
function invert_output_tform(y, i)
    unscaled = invert(standardize_Y, y)
    return invert(dtforms[i], unscaled)
end

we can reconstruct the original data via the commands:

    invert_output_tform(y, i)       # for (i,y) in enumerate(Ys)
    invert(standardize_U, u)        # for u in Us
    
in the relevant array comprehensions.

In [7]:
?mocapio.construct_inputs

In [8]:
?mocapio.construct_outputs

In [36]:
# NOTE(review): `clds_ss` is not defined anywhere in the visible notebook (and `rmse`, `cUT`,
# `cYs` are only defined in later cells) — this looks like a leftover from an earlier session.
rmse(clds_ss(cUT) - cYs)

# Initialise LDS

## Understand PC distribution

#### Single-task Model

In [7]:
# Cumulative proportion of variance explained by the principal components of output
# columns 4:63, one panel per style. Dotted guides mark the first component at which the
# curve exceeds 90%, 95% and 99%.
fig, axs = subplots(4,2,figsize=(5.5,6))

for (i, ixs) in enumerate(styles_lkp)
    # Keep the per-style temporaries local so that, in an interactive (soft-scope) session,
    # they cannot overwrite same-named notebook globals defined by other cells.
    local cYs, pc_all, varexpl, bd
    ax = axs[:][i]
    guide = ColorMap("tab10")(7)

    cYs = reduce(vcat, Ys[ixs])
    pc_all = fit(PCA, cYs[:,4:63]', pratio=0.999)

    varexpl = cumsum(principalvars(pc_all))/tvar(pc_all)
    bd = findfirst.([varexpl .> x for x in [0.9,0.95,0.99]])
    ax.plot(1:length(varexpl), varexpl)
    ax.axhline(1, linestyle=":")
    for b in bd
        ax.plot([b,b], [varexpl[1], varexpl[b]], color=guide, linestyle=":")
        ax.plot([.5, b], [varexpl[b], varexpl[b]], color=guide, linestyle=":")
        ax.text(b+0.3,varexpl[1]+0.03, b)
    end
    ax.set_xlim(0.5,34.5)
    # Set the y-limits on this panel; `gca()` would address the current axes instead,
    # which is not panel `i` inside this loop.
    ax.set_ylim(varexpl[1],1.025)
    ax.xaxis.set_ticklabels([])
    ax.set_title(styles[i])
end
tight_layout()

#### Pooled Model

In [8]:
# Same variance-explained curve as the per-style panels, but for the data pooled over all files.
allE = convert(Matrix{Float32}, reduce(vcat, Ys));

pc_all = fit(PCA, allE[:,4:63]'; pratio=0.999)

varexpl = cumsum(principalvars(pc_all)) ./ tvar(pc_all)
# First component index at which the cumulative curve exceeds each threshold.
bd = [findfirst(varexpl .> level) for level in (0.9, 0.95, 0.99)]
plot(1:length(varexpl), varexpl)
gca().axhline(1; linestyle=":")
for cut in bd
    dotted = ColorMap("tab10")(7)
    plot([cut, cut], [varexpl[1], varexpl[cut]]; color=dotted, linestyle=":")
    plot([.5, cut], [varexpl[cut], varexpl[cut]]; color=dotted, linestyle=":")
    text(cut+0.3, varexpl[1]+0.03, cut)
end
gca().set_xlim(0.5, 37.5)
gca().set_ylim(varexpl[1], 1.025);
gcf().set_size_inches(3,2)

## LDS Initialisation

\begin{align}
\mathbf{x}_t &= A \mathbf{x}_{t-1} + B \mathbf{u}_t + \mathbf{b}\\
\mathbf{y}_t &= C \mathbf{x}_{t} + D \mathbf{u}_t + \mathbf{d}
\end{align}

#### Initialisation

Note that below I use the SVD construction for PCA for convenience, but in general $Y$ is not centered ($\because$ centering is done over *all* styles simultaneously; each individual style will not be centered). Therefore, the below assumes this centering is done temporarily before the SVD.

* $C = U_{SVD}$, where $U_{SVD}$ are the principal components of $Y$.
* $X = S_{SVD}V_{SVD}^{\mathsf{T}}$, where $S_{SVD}$, $V_{SVD}$ are the other matrices from the SVD.
* $X \approx \tilde{U}\tilde{B} \Rightarrow \tilde{B} = (\tilde{U}^{\mathsf{T}} \tilde{U})^{-1} \tilde{U}^{\mathsf{T}} X$ (Regression of $X$ on $U$). The permutation of $U$ and $B$ in the first equation follows because the obs are column-wise here, not row-wise.
    * Here, $\tilde{U} = \begin{bmatrix} U & \mathbf{1} \end{bmatrix}$, and hence $\tilde{B} = \begin{bmatrix} B & \mathbf{b} \end{bmatrix}$

## Single task LDS

In [27]:
# Root-mean-square of the entries of `Δ` (typically a residual array).
function rmse(Δ)
    return sqrt(mean(abs2, Δ))
end

In [12]:
# Concatenate all sequences of one style; time runs along columns in `cYs` / `cUT`.
style_ix = 1
cUs = reduce(vcat, Us[styles_lkp[style_ix]])
cYs = permutedims(reduce(vcat, Ys[styles_lkp[style_ix]]))
cUT = permutedims(cUs);
cN = size(cYs, 2)

In [28]:
# Simple baselines
baselines = Dict()
# "copy": predict each frame as the previous one, so the residual is the first difference in time.
baselines["copy"] = rmse(diff(cYs, dims=2))

# "LR": least-squares regression of y_t on [u_{t-1}; 1] (inputs plus an intercept row).
cUs_m1 = cUs[1:end-1,:];
CDd = cYs[:,2:end] / vcat(cUs_m1', ones(1, cN-1))
baselines["LR"] = rmse(cYs[:,2:end] - CDd * vcat(cUs_m1', ones(1, cN-1)))

In [29]:
# RMSE of the spectrally-initialised LDS (before any training) for latent sizes 5, 10, …, 60.
perf_init = map(5:5:60) do k
    rmse(model.init_LDS_spectral(cYs, cUT, k)(cUT) - cYs)
end;

In [25]:
# Initialisation RMSE against latent size, with the two baselines as dotted reference lines.
plot(5:5:60, perf_init)
gcf().set_size_inches(3, 2)
gca().axhline(baselines["copy"]; linestyle=":")
gca().axhline(baselines["LR"]; linestyle=":", color="red")

In [75]:
# Latent dimension passed to the spectral initialisation.
k = 30
    
# `clds_g` is the model whose parameters are trained below; `clds` is built from it by
# `model.make_nograd` (presumably an untracked counterpart used for evaluation — confirm in models.jl).
clds_g = model.make_grad(model.init_LDS_spectral(cYs, cUT, k))
clds   = model.make_nograd(clds_g)   # MUST DO THIS SECOND, (Flux.param takes copy)

# Two ADAM optimisers with different step sizes: the training cell applies `opt` to
# `ps_observ` and `opt_hidden` to `ps_hidden`.
opt = ADAM(1e-4)
opt_hidden = ADAM(0.7e-5)

In [82]:
# Re-create the hidden-state optimiser (this discards any optimiser state held by the previous one).
opt_hidden = ADAM(0.7e-5)
# Parameter groups: `a`, `B`, `b` are updated with `opt_hidden`; `C`, `D`, `d` with `opt`.
ps_hidden = Flux.params(clds_g.a, clds_g.B, clds_g.b)
ps_observ = Flux.params(clds_g.C, clds_g.D, clds_g.d)

In [94]:
# Rebuild the two parameter groups (needed whenever `clds_g` has been re-created).
ps_hidden = Flux.params(clds_g.a, clds_g.B, clds_g.b)
ps_observ = Flux.params(clds_g.C, clds_g.D, clds_g.d)

In [102]:
# Mini-batch training of the LDS `clds_g` on the one-step-ahead objective
# (inputs at frames t predict outputs at frames t+1 within each batch).
# Returns (as `h`) the per-batch mean squared error, laid out epoch by epoch.
@time h = begin
    n_epochs = 150
    batch_size = 256
    # Number of mini-batches per epoch; presumably cld(N, batch_size) for this dataset — TODO confirm.
    n_batches = 58
    history = zeros(n_epochs*n_batches)
    N = size(cYs, 2)

    opt.eta = 5e-4 / 5
    opt_hidden.eta = 0.5e-5
    for ee in 1:n_epochs
        # Decay only the observation-parameter step size, every 100 epochs.
        if ee % 100 == 0
            opt.eta /= 1.5
            printfmtln("Changed learning rate!")
        end
        batch_order = randperm(n_batches)
        for tt in batch_order
            ixs = (batch_size*(tt-1)+1):min(batch_size*tt, N)
            _cY, _cU = cYs[:,ixs[2:end]], cUT[:, ixs[1:end-1]]
            Yhat = clds_g(_cU)
            obj = mean(x->x^2, _cY - Yhat)
            history[(ee-1)*n_batches + tt] = obj.data
            Tracker.back!(obj)
            # Separate optimisers for the state-transition and the observation parameters.
            for p in ps_hidden
                Tracker.update!(opt_hidden, p, -Tracker.grad(p))
            end
            for p in ps_observ
                Tracker.update!(opt, p, -Tracker.grad(p))
            end
        end
        # Report the RMSE over the epoch that has just finished.
        println(sqrt(mean(history[(1:n_batches) .+ n_batches*(ee-1)])))
    end
    history
end;

In [103]:
# Training RMSE smoothed with a moving average over one epoch (58 batches); the first
# 999 points and the trailing partial windows of the convolution are not shown.
let window = 58
    smoothed = conv(h, Windows.rect(window)) ./ window
    plot(sqrt.(smoothed[1000:end-(window-1)]))
end

## Initialising the Transition Matrix

The transition matrix will be parameterised as:

$$ A = D(\psi_2) Q(\psi_1) $$

where $D$ is a diagonal matrix with elements in $[0,1]$ and $Q$ is a special orthogonal matrix with determinant $+1$. Our goal will be to use the initialisation calculated above, coerced into this form.

Unfortunately, we are just as likely to have an initial $A_0$ with determinant $-1$ as $+1$. My previous arguments about measure zero $\lambda = 1 - 0i$ are not true. It appears that reflections are commonly learned. Therefore, we must deal with this issue later. But first:

Decompose:
$$ A_0 = U_0 S_0 V_0^T $$

Then $D(\psi_2) = S_0$ and $Q(\psi_1) = U_0 V_0^T$.


### Diagonal Matrix
If parameterising the diagonal of $D$ with a sigmoid nonlinearity, we must apply the inverse sigmoid to $\text{diag}(S)$, i.e. $\sigma^{-1}(y) = \ln\left(\frac{y}{1-y}\right)$. Care must be taken to avoid the endpoints $y \in \{0, 1\}$ for numerical reasons, but also because it is not sensible to initialise to a position with no gradient. I have used a minimum distance from the boundaries of $10^{-3}$ (which translates to $x \approx \pm 6.9$). 


### Orthogonal Matrix
In order to obtain the Cayley parameterisation of $Q = U_0 V_0^T$ we take the inverse Cayley transformation $S = (I - Q)(I + Q)^{-1}$ to obtain the skew-symmetric matrix $S$ which corresponds 1-to-1 to $Q$. We can then simply extract the lower triangular elements of $S$ as the unique $d(d-1)/2$ elements parameterising $Q$. If these parameters are $\psi_1$, I will write this as $\psi_1 = \text{cayley}^{-1}(Q)$. However, as we have said above, this is insufficient for obtaining a Cayley parameterisation of the estimate $A_0$ in general, since we exclude any $A_0$ s.t. $\det(A_0) = -1$. Ferrar (1950) tells us that a general orthogonal matrix can be parameterised as $J(I-S)(I+S)^{-1}$, where $J$ is a diagonal matrix with elements in $\{+1, -1\}$. Crucially we need as many negative elements ($-1$) as there are negative roots of $Q_0$ and we may choose them for convenience to precede all of the positive elements in $J$.

#### Corollary

1. $ A = D(\psi_2) Q(\psi_1) $ as before, but now with $D$ containing elements in $[-1,1]$ is sufficient to parameterise *any* orthogonal matrix. Note that we may use $\tanh$ instead of $\sigma$ to achieve this.
2. For the problem at hand, we need a *special* orthogonal matrix in order to apply the inverse Cayley transform. Now we know that $Q$ can be represented as 
    $$Q = J\tilde{Q}$$
    for $\tilde{Q} \in SO(d)$. Then clearly $\tilde{Q} = JQ$ and hence we have that 
    $$Q = J\,\text{cayley}\left(\text{cayley}^{-1}(JQ)\right).$$

Therefore we can parameterise $A_0 = J\,D(\psi_2) Q(\psi_1)$ where $\psi_1 =  \text{cayley}^{-1}(JQ)$ where $J$ is a $\{+1,-1\}$ diagonal calculated directly from the eigenvalues of $Q=U_0 V_0^T$.


## Visualise results

In [114]:
# Model outputs of `clds` for the whole concatenated input sequence of the selected style.
Yhat = clds(cUT);

In [220]:
printfmtln("RMSE = {:.3f}", rmse(cYs - Yhat)); flush(stdout)

# Overlay target and prediction for 20 output channels over a 100-frame window.
# `offset` selects the first channel shown (channels offset+1 … offset+20) and
# `offset_tt` shifts the time window.
fig, axs = subplots(5,4,figsize=(10,10))
offset = 20
offset_tt = 0
for i in 1:20
    frames = (1:100) .+ offset_tt
    # Take the targets from `cYs` — the series `Yhat` and the RMSE above refer to — rather
    # than from a fixed file, so the plot stays correct for any `style_ix`.
    axs[:][i].plot(cYs[i+offset, frames])
    axs[:][i].plot(Yhat[i+offset, frames])
end

### Animate!

In [172]:
?mocapio.construct_outputs

In [240]:
"""
    unsup_predict(lds, U, Yraw5, YsTrue, standardize_Y, standardize_U)

Roll the LDS `lds` forward over the columns of `U`, replacing entries `61:121` of the
input at every step `i > 5` with re-standardised values derived from the outputs.

From step 6 onwards, column `i-1` of `YsTrue` is un-standardised with `standardize_Y`
and accumulated onto a running vector that starts at `Yraw5` (the running sum inverts
the output differencing). Entries `4:64` of that running vector are then standardised
with entries `61:121` of `standardize_U` and written into the input.

# Arguments
- `lds`: the model; `lds.h` is used as the state preceding the first step.
- `U`: inputs, one column per time step.
- `Yraw5`: starting value of the running (un-differenced) output vector.
  NOTE(review): the name suggests the raw output at frame 5 — confirm with the caller.
- `YsTrue`: standardised outputs, one column per time step, with 64 rows.
- `standardize_Y`, `standardize_U`: the scalers for outputs and inputs.

# Returns
A `size(lds, 2) × size(U, 2)` matrix of model outputs.
"""
function unsup_predict(lds::model.MyLDS_ng{T}, U::AbstractMatrix{T},
        Yraw5::AbstractVector{T}, YsTrue::AbstractMatrix{T}, standardize_Y::MyStandardScaler,
        standardize_U::MyStandardScaler) where T <: AbstractFloat

    n = size(U, 2)
    A = model.Astable(lds)

    X = Matrix{T}(undef, size(lds, 1), n)
    Y = Matrix{T}(undef, size(lds, 2), n)

    # Running un-differenced output, and the scaling of the matching block of the inputs.
    cy   = Yraw5
    μ, σ = standardize_U.μ[61:121], standardize_U.σ[61:121]

    X[:,1] = A*lds.h + lds.B*U[:, 1] + lds.b
    Y[:,1] = lds.C * X[:,1] + lds.D * U[:,1] .+ lds.d
    for i in 2:n
        # Deliberately a copy, not a view: it is mutated below and `U` must stay untouched.
        u = U[:,i]
        if i > 5
            y_unnorm = invert(standardize_Y, reshape(YsTrue[:,(i-1)], 1, 64)) |> vec
            cy = y_unnorm + cy  # (cum)sum here inverts differencing.
            u[61:121] = (cy[4:64] - μ) ./ σ            # transform to u space
        end
        @views X[:,i] = A*X[:,i-1] + lds.B*u + lds.b
        @views Y[:,i] = lds.C * X[:,i] + lds.D * u .+ lds.d
    end
    return Y
end


In [255]:
# Inspect columns 4:64 of the first five raw output frames of file 1.
Ysraw[1][1:5,4:64]

In [256]:
# Reconstruct the first five frames from the standardised, differenced outputs, using a
# differencer fitted on the first two raw frames of file 1; show columns 4:64.
_dtform = fit(OutputDifferencer, Ysraw[1][1:2,:])
tmp = invert(_dtform, invert(standardize_Y, permutedims(cYs[:,1:5])))
tmp[:,4:64]

In [253]:
# Un-standardise the first five input frames and show columns 61:121.
invert(standardize_U, Matrix(cUT[:,1:5]'))[:,61:121]

In [None]:
# Undo the output preprocessing for a segment of file `file_ix` that begins at
# `start_frame`: un-standardise `y`, then invert the differencing with a transform
# fitted on the two raw frames starting at `start_frame`.
function invert_from_frame(file_ix, start_frame, y)
    anchor_frames = start_frame:start_frame+1
    differencer = fit(OutputDifferencer, Ysraw[file_ix][anchor_frames, :])
    return invert(differencer, invert(standardize_Y, y))
end

In [213]:
# Standardise columns 4:64 of `tmp` with entries 61:121 of the input scaler.
(tmp[:,4:64] .- standardize_U.μ[61:121]') ./ standardize_U.σ[61:121]'

In [241]:
# RMSE over the first 100 frames when part of the input is rebuilt inside `unsup_predict`.
rmse(unsup_predict(clds, cUT[:,1:100], Ysraw[1][5,:], cYs[:,1:100], standardize_Y, standardize_U) - cYs[:,1:100])

In [242]:
# Reference: RMSE over the same 100 frames when the model is driven by the given inputs.
rmse(clds(cUT[:,1:100]) - cYs[:,1:100])

In [216]:
# Inspect rows 61:121 of the first two input frames.
cUT[61:121,1:2]

In [193]:
# First output channel over the first 10 steps of a 20-step rollout. The call supplies all
# six arguments `unsup_predict` requires (start vector and true outputs included),
# matching the call used for the RMSE comparison.
plot(unsup_predict(clds, cUT[:,1:20], Ysraw[1][5,:], cYs[:,1:20], standardize_Y, standardize_U)[1,1:10])

In [123]:
# Undo standardisation and differencing for a segment of file `file_ix` starting at
# `start_frame`; the differencer is fitted on the two raw frames at the segment start.
function invert_from_frame(file_ix, start_frame, y)
    differencer = fit(OutputDifferencer, Ysraw[file_ix][start_frame:start_frame+1, :])
    unscaled = invert(standardize_Y, y)
    return invert(differencer, unscaled)
end

# Model outputs for input frames 501:1501, mapped back to the raw output space.
Yhat_r = begin
    ifr = 501
    Yr = Matrix(clds(cUT[:, ifr:ifr+1000])')
    invert_from_frame(1, ifr, Yr)
end;

In [128]:
# Create and open a MeshCat visualiser only if this session does not have one yet.
if !@isdefined(vis)
    vis = Visualizer()
    open(vis)
end
# Animate the first 500 reconstructed model frames alongside raw frames 501:1000 of file 1.
vis = mocapviz.create_animation(
    [mocapio.reconstruct_modelled(Yhat_r)[1:500,:,:],
     mocapio.reconstruct_modelled(Ysraw[1][501:1000,:])],
    "test";
    vis=vis,
    linemesh=[mocapviz.redmesh, mocapviz.yellowmesh],
    camera=:back)

# Are RNNs easier to train?

(**Answer: No... not by a long shot!**)

In [545]:
# Dimensions of the RNN experiment.
d_out, d_in = size(cYs, 1), size(cUT, 1)
d_state = 30
d_ff = 50
diffdmodel = 0  #63

rnn = RNN(d_in, d_state, elu)
ffnn = Chain(Dense(d_state, d_ff), Dense(d_ff, d_out, identity))

# One parameter per strictly-lower-triangular entry of a d_state × d_state matrix.
a = param(zeros(Float32, div(d_state*(d_state-1), 2)))

# Initialise the linear input->output map `D` and the output bias from the least-squares
# regression of y_t on [u_{t-1}; 1]; `C` gets a Glorot-uniform initialisation.
cUs_m1 = cUs[1:end-1,:];
CDd = cYs[:,2:end] / vcat(cUs_m1', ones(1, cN-1))
D = Flux.param(copy(CDd[:,1:end-1]))
C = param(Flux.glorot_uniform(d_out, d_state))
ffnn.layers[2].b.data .= CDd[:,end];

In [536]:
# opt = ADAM(1e-3)
# Trainable parameters for the RNN experiment: the whole RNN, both dense layers, and C, D, a.
pars = Flux.params(rnn, ffnn.layers[1], ffnn.layers[2], C, D, a);

In [610]:
# Clear accumulated gradients, then reset and truncate the RNN before (re)starting training.
zero_grad!(pars)
Flux.reset!(rnn)
Flux.truncate!(rnn)

In [606]:
# opt = ADAM(1e-7)
# Parameter set used by the training loop: from the RNN cell only `Wi` and `b` are listed,
# because the loop overwrites `rnn.cell.Wh` from `a` on every batch.
pars = Flux.params(rnn.cell.Wi, rnn.cell.b, ffnn, C, D, a);

In [615]:
?mocapio.construct_inputs

In [618]:
# NOTE(review): `lds_cell` is not defined anywhere in the visible notebook — stale cell?
lds_cell.B.data

In [None]:
# Mini-batch training of the RNN + feed-forward readout on the same one-step-ahead
# objective as the LDS. `history[ee]` accumulates the mean per-batch MSE of epoch `ee`.
n_epochs = 100
history = zeros(Float32, n_epochs)
N = size(cYs, 2)
# The RNN is broadcast over a vector of per-frame input vectors, so keep the inputs in that form too.
cUList = [cUT[:,i] for i in 1:N]

# Re-uses the optimiser `opt` created for the LDS, with a new step size.
opt.eta = 2e-3 / 40
for ee in 1:n_epochs
    # 58 batches of (up to) 256 frames; presumably 58 == cld(N, 256) — TODO confirm.
    batch_order = randperm(58)
    for tt in batch_order
        ixs = (256*(tt-1)+1):min(256*tt, N)
        # Inputs at frames t are paired with targets at frames t+1 within the batch.
        _cY, _cU, _cUL = cYs[:,ixs[2:end]], cUT[:, ixs[1:end-1]], cUList[ixs[1:end-1]]
        # Rebuild the recurrent weight matrix from `a` on every batch, so that it is
        # learned through `a` rather than directly.
        rnn.cell.Wh = AxUtil.Math.cayley_orthog(a/10, d_state)
        x̂ = rnn.(_cUL)
        x̂ = Tracker.collect(reduce(hcat, x̂))
        # Readout: feed-forward network plus linear terms in the state and in the input.
        ŷ = ffnn(x̂) + C*x̂ + D*_cU
        obj = mean(x->x^2, _cY - ŷ)
        Tracker.back!(obj)
        history[ee] += obj.data / length(batch_order)
        for p in pars
            Tracker.update!(opt, p, -Tracker.grad(p))
        end
        # Reset and truncate the RNN so the next batch starts afresh.
        Flux.reset!(rnn)
        Flux.truncate!(rnn)
    end
    println(sqrt(history[ee]))
end

In [None]:
# Per-epoch mean MSE recorded by the RNN training loop.
plot(history)

In [519]:
# RMSE of the linear part of the readout alone (ŷ = D*u + bias of the last dense layer),
# evaluated with the same batching as training.
for ee in 1:1
    # Explicitly local accumulator: the previous name `h` is also the global holding the
    # LDS training history, which an interactive (soft-scope) session would overwrite.
    local sq_err = 0.0
    batch_order = randperm(58)
    for tt in batch_order
        ixs = (256*(tt-1)+1):min(256*tt, N)
        _cY, _cU = cYs[:,ixs[2:end]], cUT[:, ixs[1:end-1]]
        ŷ = D*_cU .+ ffnn.layers[2].b
        obj = mean(x->x^2, _cY - ŷ)
        sq_err += obj.data
    end
    println(sqrt(sq_err/length(batch_order)))
end