# Stochastic gradient descent
- three variants
    - size-1 mini-batches (batch_size = 1), zips through all observed entries of $\mbox{cov}(y_{t+m},y_t)$ and computes gradients from one entry at a time
    - column mini-batches (batch_size = p), zips through all columns of $\mbox{cov}(y_{t+m},y_t)$ and computes gradients from the observed entries of one column at a time
    - batch-gradients (batch_size = None), computes a full gradient using all observed entries of $\mbox{cov}(y_{t+m},y_t)$ at the same time
    
- mini-batch gradients use Adam, i.e. they follow the gradients with momentum and re-normalise the gradients along each dimension. Full gradients use plain gradient descent.
- max_iter is defined as the number of 'zips' through the data set. Thus for different batch sizes, max_iter fixes the amount of information visited within the data covariance matrices, *not* the number of gradient steps 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.optimize import fmin_bfgs, check_grad
import glob, os

os.chdir('../core')
from utility import get_subpop_stats, draw_sys
from SSID_Hankel_loss import f_l2_Hankel, l2_sis_setup, g_l2_Hankel_sis, plot_outputs_l2_gradient_test
from SSID_Hankel_loss import yy_Hankel_cov_mat, l2_sis_draw, adam_zip, adam_zip_stable
os.chdir('../dev')

# Problem dimensions: further below C_0 is drawn as a (p, n) matrix and
# A_0 / B_0 as (n, n) matrices.
p,n = 30,10
# NOTE(review): k, l are only forwarded to the Hankel-loss helpers here;
# presumably the numbers of time-lag blocks of the Hankel matrix - confirm.
k,l = 3,3

# Length of ev_r below; the remaining n - nr eigenvalues are covered by
# (n - nr)//2 complex values in ev_c.
# NOTE(review): presumably the number of real eigenvalues of the true A - confirm against draw_sys.
nr = 4

batch_size = p # batch_size = 1 (size-1 mini-batches), p (column mini-batches), None (full gradients)

# Optimiser settings, forwarded unchanged to adam_zip_stable.
# NOTE(review): presumably a / a_A are step sizes, b1 / b2 the Adam decay rates
# and e the Adam epsilon - confirm against adam_zip_stable.
a, a_A, b1, b2, e = 0.0001, 0.000001, 0.9, 0.99, 1e-8
max_iter = 1000

# Log-barrier weights; one descent run is carried out per entry.
gammas = np.array([0.000000])
# NOTE(review): forwarded unchanged to adam_zip_stable; its meaning is not visible here.
tau = 0.5

# create subpopulations
# (two index ranges that share the indices p//2-1 and p//2)
sub_pops = (np.arange(0,p//2+1), np.arange(p//2-1,p))

# draw system matrices
# ev_r: nr real values spaced linearly in [0.7, 0.99]
ev_r = np.linspace(0.7, 0.99, nr)
# ev_c: (n - nr)//2 complex values with uniformly random phase ...
ev_c = np.exp(2 * 1j * np.pi * np.random.uniform(size= (n - nr)//2))
# ... and moduli spaced linearly in [0.8, 0.99]
ev_c = np.linspace(0.8, 0.99, (n - nr)//2) * ev_c

# only echo the index sets when they are short enough to read
if p < 200:
    print('sub_pops', sub_pops)
# Om, Ovw, Ovc are used further below as the masks for the 'observed',
# 'overlapping' and 'cross-overlapping' error measures.
obs_idx, idx_grp, co_obs, overlaps, overlap_grp, idx_overlap, Om, Ovw, Ovc = \
    get_subpop_stats(sub_pops, p, verbose=False)
pars_true, Qs, Qs_full = draw_sys(p,n,k,l,Om, nr, ev_r,ev_c)
# f_base is the loss without log-barrier; the second return value is not used here
f_base, _ = l2_sis_setup(k,l,n,Qs,Om,idx_grp,obs_idx)


def s(A):
    """Tell whether the log-barrier of ``A`` evaluates to a finite number.

    The barrier is ``sum_i log(1 - sigma_i**2)`` over the singular values
    ``sigma_i`` of ``A``. A singular value of one gives ``-inf`` and a larger
    one gives ``nan``, so the result is False in both cases.

    Returns a numpy boolean.
    """
    _, sing_vals, _ = np.linalg.svd(A)
    log_terms = np.log(1 - sing_vals**2)
    return np.isfinite(log_terms.sum())

#def s(A):
#    return True

# Per-gamma results: the four squared errors reported below and the sorted
# eigenvalue magnitudes of the estimated A.
err_est  = np.zeros((gammas.size, 4))
eigA_est = np.zeros((gammas.size, n))
for rep in range(gammas.size):

    gamma = gammas[rep]

    def f_i(theta):
        """Return f_base(theta) + gamma * (-log det(I - A A^T)).

        A is the (n, n) matrix stored in the first n*n entries of theta.
        """
        A = theta[:n*n].reshape(n,n)
        f_log_bar = - np.log( np.linalg.det(np.eye(n)-A.dot(A.T)) )

        # with gamma == 0 a non-finite barrier would make gamma * f_log_bar
        # nan, so the barrier value is dropped instead
        if gamma == 0 and not np.isfinite(f_log_bar):
            f_log_bar = 0

        return f_base(theta) + gamma * f_log_bar

    def g_i(theta, idx_use, idx_co):
        """Return the gradient matching f_i for the given mini-batch indices.

        The barrier part is 2 * (I - A A^T)^{-1} A on the entries of A and
        zero on all other entries of theta; it is scaled by gamma and added
        to g_l2_Hankel_sis(theta, ..., idx_use, idx_co).
        """
        A = theta[:n*n].reshape(n,n)
        # explicit inverse of I - A A^T via a solve against the identity
        inv = np.linalg.solve(np.eye(n)-A.dot(A.T), np.eye(n))
        g_log_bar = np.zeros(theta.size)
        g_log_bar[:n*n] = 2 * inv.dot(A).reshape(-1,)

        gamma_g_log_bar = gamma * g_log_bar
        if gamma == 0:
            # 0 * inf gives nan; zero those entries when the barrier is off
            gamma_g_log_bar[np.invert(np.isfinite(gamma_g_log_bar))] = 0
        return g_l2_Hankel_sis(theta,k,l,n,Qs,idx_use,idx_co) + gamma_g_log_bar

    # initial parameters: diagonal A with entries in [0.7, 0.8), B = I, random C
    A_0  = np.diag(np.random.uniform(low=0.7, high=0.8, size=n))
    B_0  = np.eye(n) #np.random.normal(size=(n,n))
    C_0  = np.random.normal(size=(p,n))
    Pi_0 = B_0.dot(B_0.T)

    # parameter vector layout: [A (n*n) | B (n*n) | C (p*n)]
    pars_0 = np.hstack((A_0.reshape(n*n,),
                        B_0.reshape(n*n,),
                        C_0.reshape(p*n,)))


    def converged(theta_old, theta, e, t):
        """Stop only when the iteration counter t reaches max_iter."""
        if t >= max_iter:
            return True
        return False
        #return np.abs(f_i(theta_old) - f_i(theta)) < e

    print('starting descent')

    pars_est_vec, fs = adam_zip_stable(f_i,g_i,s,tau,pars_0.copy(),a,a_A,b1,b2,e,max_iter,converged,Om,idx_grp,co_obs,batch_size)

    A_est = pars_est_vec[:n*n].reshape(n,n)
    B_est = pars_est_vec[n*n:2*n*n].reshape(n,n)
    Pi_est = B_est.dot(B_est.T)
    C_est = pars_est_vec[-p*n:].reshape(p,n)

    print('gamma =', gamma)

    # Sort the magnitudes themselves. Sorting the complex eigenvalues first and
    # taking abs afterwards does not give an ordering by magnitude, and the
    # summary plot compares these rows with np.sort(np.abs(...)) of the true A.
    eigA_est[rep,:] = np.sort(np.abs(np.linalg.eigvals(A_est)))
    print('|eig(A_est)|', eigA_est[rep,:])
    print('|eig(A_true)|', np.sort(np.abs(np.linalg.eigvals(pars_true['A']))))


    # same loss evaluated on four different masks (see the print labels below)
    err_est[rep,0] = f_l2_Hankel(pars_est_vec,k,l,n,Qs, Om)
    err_est[rep,1] = f_l2_Hankel(pars_est_vec,k,l,n,Qs,Ovw)
    err_est[rep,2] = f_l2_Hankel(pars_est_vec,k,l,n,Qs,Ovc)
    err_est[rep,3] = f_l2_Hankel(pars_est_vec,k,l,n,Qs_full,~Om)

    print('final squared error on observed parts:',
          err_est[rep,0])
    print('final squared error on overlapping parts:',
          err_est[rep,1])
    print('final squared error on cross-overlapping parts:',
          err_est[rep,2])
    print('final squared error on stitched parts:',
          err_est[rep,3])

    pars_init = {'A': A_0, 'C': C_0, 'Pi': Pi_0, 'B': B_0}
    pars_est  = {'A': A_est, 'C': C_est, 'Pi': Pi_est, 'B': B_est}
    #plot_outputs_l2_gradient_test(pars_true, pars_init, pars_est, k, l, Qs,
    #                                   Qs_full, Om, Ovc, Ovw, f_i, g_i, if_flip = True)

    # trace returned by the optimiser for this gamma
    plt.figure(figsize=(20,8))
    plt.plot(fs[:max_iter])
    plt.show()


# visualise overall results

# Figure 1: the four error measures (columns of err_est) against the
# log-barrier weight.
plt.figure(figsize=(20,8))
plt.hsv()
plt.plot(err_est)
plt.xticks(np.arange(gammas.size), gammas)
# raw mathtext string: '\gamma' is an invalid escape sequence and would be
# drawn as literal text without the dollar signs
plt.xlabel(r'$\gamma$')
plt.ylabel('MSE')
plt.legend(['obs.', 'overlap', 'cross-overl.', 'stitched'])
plt.title('Squared errors as function of log-barrier height')
plt.show()

# Figure 2: eigenvalue magnitudes of the estimated A, one line per gamma.
plt.figure(figsize=(20,8))
# one RGB colour per gamma: red decreases and blue increases with the index
clrs = np.zeros((gammas.size, 3))
clrs[:,2] = np.linspace(0.05, 0.99, gammas.size)
clrs[:,0] = np.linspace(0.05, 0.99, gammas.size)[::-1]
# No plt.hold(True): pyplot.hold was removed in matplotlib 3.0 and successive
# plot calls draw into the same axes anyway.
for eig_row, colour in zip(eigA_est, clrs):
    plt.plot(eig_row, color=colour)
plt.plot(np.sort(np.abs(np.linalg.eigvals(pars_true['A']))), 'k')
# dashed line at magnitude one
plt.plot([0, 1.1*n], [1, 1], 'r--')
plt.hot()
plt.xticks( np.arange(n), np.arange(n)+1)
plt.xlabel('# eigenvalue')
plt.ylabel('EV')
# legend entries: gammas rounded up to two decimals, then the two reference lines
lgnd = [np.ceil(g*100)/100 for g in gammas]
lgnd.append('true')
lgnd.append('stability')
plt.legend(lgnd)
plt.axis([0, 1.1*n, plt.ylim()[0], plt.ylim()[1]])
plt.title('Eigenvalues as function of log-barrier height')
plt.show()

    

# just one more turn...

In [None]:
# continue with a ten times smaller barrier weight (a weight of 0 stays 0)
gamma /= 10
def f_i(theta):
    """Loss with log-barrier: f_base(theta) + gamma * (-log det(I - A A^T)).

    A is read from the first n*n entries of theta. When gamma is zero a
    non-finite barrier value is replaced by 0 so that the product does not
    become nan.
    """
    dyn = theta[:n*n].reshape(n, n)
    gap = np.eye(n) - np.dot(dyn, dyn.T)
    barrier = -np.log(np.linalg.det(gap))

    barrier_switched_off = (gamma == 0)
    if barrier_switched_off and not np.isfinite(barrier):
        barrier = 0

    return f_base(theta) + gamma * barrier
def g_i(theta, idx_use, idx_co):
    """Gradient of the barrier-augmented loss for one mini-batch.

    Adds gamma * 2 * (I - A A^T)^{-1} A (on the entries of A, zero elsewhere)
    to g_l2_Hankel_sis(theta, k, l, n, Qs, idx_use, idx_co). With gamma equal
    to zero, non-finite barrier entries are set to 0.
    """
    dyn = theta[:n*n].reshape(n, n)
    identity = np.eye(n)
    # explicit inverse of I - A A^T, obtained by solving against the identity
    gap_inv = np.linalg.solve(identity - np.dot(dyn, dyn.T), identity)

    barrier_grad = np.zeros(theta.size)
    barrier_grad[:n*n] = (2 * np.dot(gap_inv, dyn)).ravel()

    weighted = gamma * barrier_grad
    if gamma == 0:
        weighted[~np.isfinite(weighted)] = 0
    return g_l2_Hankel_sis(theta,k,l,n,Qs,idx_use,idx_co) + weighted

# number of passes for the continued run
max_iter = 1000
def converged(theta_old, theta, e, t):
    """Stopping rule: True once the counter t has reached max_iter.

    theta_old, theta and e are accepted but not inspected.
    """
    return bool(t >= max_iter)

# Continue the descent from a copy of the current estimate.
pars_est_vec, fs = adam_zip_stable(
    f_i, g_i, s, tau, pars_est_vec.copy(),
    a, a_A, b1, b2, e, max_iter, converged,
    Om, idx_grp, co_obs, batch_size)

# trace returned by the optimiser
plt.figure(figsize=(20, 8))
plt.plot(fs[:max_iter])
plt.show()

# Unpack the parameter vector: A and B are the first two n*n chunks, C is the
# trailing p*n chunk.
A_est = np.reshape(pars_est_vec[:n*n], (n, n))
B_est = np.reshape(pars_est_vec[n*n:2*n*n], (n, n))
Pi_est = np.dot(B_est, B_est.T)
C_est = np.reshape(pars_est_vec[-p*n:], (p, n))

pars_init = dict(A=A_0, C=C_0, Pi=Pi_0, B=B_0)
pars_est = dict(A=A_est, C=C_est, Pi=Pi_est, B=B_est)
plot_outputs_l2_gradient_test(
    pars_true, pars_init, pars_est, k, l, Qs,
    Qs_full, Om, Ovc, Ovw, f_i, None, if_flip=True)


# alternating blocked descent

- using SGD on $C$
- after each pass over the observed parts of the covariance matrix, use the analytic solutions for $A$, $\Pi$

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.optimize import fmin_bfgs, check_grad
import glob, os

os.chdir('../core')
from utility import get_subpop_stats, draw_sys
from SSID_Hankel_loss import f_l2_Hankel, plot_outputs_l2_gradient_test
from SSID_Hankel_loss import l2_bad_sis_setup
from SSID_Hankel_loss import yy_Hankel_cov_mat, l2_sis_draw, adam_zip_bad_stable, id_A
from SSID_Hankel_loss import ssidSVD
os.chdir('../dev')

# Problem dimensions: C is drawn as a (p, n) matrix and Pi / B are set to
# (n, n) identities in the descent loop below.
p,n = 2000,3
# NOTE(review): k, l (and k_init, l_init for the initialisation) are only
# forwarded to the Hankel helpers; presumably numbers of time-lag blocks - confirm.
k,l = 3,3
k_init, l_init = 3,3

# length of ev_r; ev_c holds (n - nr)//2 complex values
nr = 1
batch_size = p # batch_size = 1 (size-1 mini-batches), p (column mini-batches), None (full gradients)

# optimiser settings, forwarded to adam_zip_bad_stable
a, b1, b2, e = 0.0001, 0.9, 0.9, 1e-8
max_iter, max_iter_init = 1000, 1000

reps = 1
if_stable = False

# create subpopulations
# (two index ranges that share the indices p//2-1 and p//2)
sub_pops = (np.arange(0,p//2+1), np.arange(p//2-1,p))

# draw system matrices
# real values in [0.7, 0.99]; complex values with random phase and moduli in [0.8, 0.99]
ev_r = np.linspace(0.7, 0.99, nr)
ev_c = np.exp(2 * 1j * np.pi * np.random.uniform(size= (n - nr)//2))
ev_c = np.linspace(0.8, 0.99, (n - nr)//2) * ev_c


if p < 200:
    print('sub_pops', sub_pops)
obs_idx, idx_grp, co_obs, overlaps, overlap_grp, idx_overlap, Om, Ovw, Ovc = \
    get_subpop_stats(sub_pops, p, verbose=False)
pars_true, Qs, Qs_full = draw_sys(p,n,k,l,Om, nr, ev_r,ev_c)
f_i, g_C, g_A = l2_bad_sis_setup(k,l,n,Qs,Om,idx_grp,obs_idx, if_stable)

print('getting initial parameter values (SSID on largest subpopulation)')
idx = sub_pops[0]
# Hankel matrix built from the true parameters, restricted to the rows of C
# that belong to the first subpopulation
H_kl = yy_Hankel_cov_mat(pars_true['C'][idx,:],pars_true['A'],pars_true['Pi'],
                         k_init,l_init,Om=None)
pars_ssid = ssidSVD(H_kl, Qs[0][np.ix_(idx,idx)], n, pi_method='proper')
# Change of basis M with M Pi M^T = I, so that the initial 'Pi' can be set to
# the identity while A is mapped to M A M^{-1}.
# With Pi = U diag(S) U^T this needs M = diag(S^-1/2) U^T. The previous code
# multiplied by U instead of U.T, which whitens Pi only if U is symmetric.
# NOTE(review): assumes pars_ssid['Pi'] is symmetric positive definite - confirm.
U,S,VT = np.linalg.svd(pars_ssid['Pi'])
M = np.diag(1/np.sqrt(S)).dot(U.T)

for rep in range(reps):

    # Initial guess: SSID estimate of A in the basis defined by M, identity
    # Pi and B, random C.
    pars_init = dict(A=M.dot(pars_ssid['A']).dot(np.linalg.inv(M)),
                     Pi=np.eye(n),
                     B=np.eye(n),
                     C=np.random.normal(size=(p, n)))

    # Stage 1: max_iter_init passes with id_A in place of g_A and a ten times
    # larger step size.
    # NOTE(review): id_A presumably leaves A untouched - confirm in SSID_Hankel_loss.
    def converged(theta_old, theta, e, t):
        """Stop once the counter t reaches max_iter_init."""
        return bool(t >= max_iter_init)
    pars_est_vec, fs = adam_zip_bad_stable(
        f_i, g_C, id_A, pars_init,
        10*a, b1, b2, e, max_iter_init, converged,
        Om, idx_grp, co_obs,
        batch_size)

    # Stage 2: max_iter passes using g_A, started from the stage-1 result.
    print('starting descent')
    def converged(theta_old, theta, e, t):
        """Stop once the counter t reaches max_iter."""
        return bool(t >= max_iter)
    pars_est_vec, fs = adam_zip_bad_stable(
        f_i, g_C, g_A, pars_est_vec,
        a, b1, b2, e, max_iter, converged,
        Om, idx_grp, co_obs,
        batch_size)

    # Unpack: A and B are the first two n*n chunks, C is the trailing p*n chunk.
    pars_est = dict(A=np.reshape(pars_est_vec[:n*n], (n, n)),
                    C=np.reshape(pars_est_vec[-p*n:], (p, n)),
                    B=np.reshape(pars_est_vec[n*n:2*n*n], (n, n)))
    pars_est['Pi'] = np.dot(pars_est['B'], pars_est['B'].T)

    plot_outputs_l2_gradient_test(
        pars_true, pars_init, pars_est, k, l, Qs,
        Qs_full, Om, Ovc, Ovw, f_i, None, if_flip=True)

    # trace returned by the second stage
    plt.figure(figsize=(20, 8))
    plt.plot(fs[:max_iter])
    plt.show()
    

getting initial parameter values (SSID on largest subpopulation)
using size-p mini-batches (coviarance columms)
f = 

# just one more turn...

In [None]:
# Rebuild the loss and gradient callables with if_stable set to False
# (same call as in the set-up cell of this section).
# NOTE(review): if_stable presumably toggles a stability constraint inside
# l2_bad_sis_setup - confirm.
if_stable = False
f_i, g_C, g_A = l2_bad_sis_setup(k,l,n,Qs,Om,idx_grp,obs_idx, if_stable)

# number of passes for the continued run
max_iter = 1000
def converged(theta_old, theta, e, t):
    """Report convergence only when the counter t has reached max_iter.

    theta_old, theta and e are accepted but not inspected.
    """
    if t >= max_iter:
        return True
    return False

# Continue the alternating descent from a copy of the current estimate.
pars_est_vec, fs = adam_zip_bad_stable(
    f_i, g_C, g_A, pars_est_vec.copy(),
    a, b1, b2, e, max_iter, converged,
    Om, idx_grp, co_obs, batch_size)

# trace returned by the optimiser
plt.figure(figsize=(20, 8))
plt.plot(fs[:max_iter])
plt.show()


# Unpack the parameter vector: A and B are the first two n*n chunks, C is the
# trailing p*n chunk.
A_est = np.reshape(pars_est_vec[:n*n], (n, n))
B_est = np.reshape(pars_est_vec[n*n:2*n*n], (n, n))
Pi_est = np.dot(B_est, B_est.T)
C_est = np.reshape(pars_est_vec[-p*n:], (p, n))
pars_est = dict(A=A_est, C=C_est, Pi=Pi_est, B=B_est)
plot_outputs_l2_gradient_test(
    pars_true, pars_init, pars_est, k, l, Qs,
    Qs_full, Om, Ovc, Ovw, f_i, None, if_flip=True)

# eigenvalues of the estimated dynamics matrix
print(np.linalg.eigvals(A_est))


In [None]:
from scipy.io import savemat # store results for comparison with Matlab code   

# Stack the true parameters in the same [A | B | C] layout as the estimate vector.
# NOTE(review): A_true, B_true, Pi_true and C_true are not assigned anywhere in
# the visible notebook (the true parameters are held in the dict pars_true), so
# this cell raises NameError unless they are defined elsewhere - confirm.
pars_true_vec = np.hstack((A_true.reshape(n*n,),
                    B_true.reshape(n*n,),
                    C_true.reshape(p*n,)))

# All files below are written relative to ../fits/; the working directory is
# not changed back within this cell.
os.chdir('../fits/')

# base name shared by the .mat and .npz outputs
save_file = 'usbad_p1000n5r2_nice_2'

# Matrices exported for the Matlab comparison.
# NOTE(review): A_0, B_0, Pi_0, C_0 (and pars_0 below) are only assigned in the
# first SGD cell, whereas the *_est values come from whichever run was executed
# last - check that they belong to the same experiment.
save_file_m = {'A_true':A_true,
               'B_true':B_true,
               'Pi_true' : Pi_true, 
               'C_true' : C_true,
               'A_0':A_0,
               'B_0':B_0,
               'Pi_0':Pi_0,
               'C_0':C_0,
               'A_est':A_est,
               'B_est':B_est,
               'Pi_est' : Pi_est, 
               'C_est' : C_est}

savemat(save_file,save_file_m) # does the actual saving

# flat parameter vectors, stored under the same base name via np.savez
np.savez(save_file, 
         pars_0_vec=pars_0,
         pars_true_vec=pars_true_vec, 
         pars_est_vec=pars_est_vec)  

# Check log-barrier values and gradients

In [None]:
# imports for the stand-alone log-barrier check
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import fmin_bfgs, check_grad
# size of the (n, n) test matrix
n = 3
def f_i(theta):
    """Log-barrier value -log det(I - A A^T) for A = theta[:n*n] as (n, n).

    Prints 'negative det!' when the determinant is negative; the returned
    value is then nan.
    """
    A = theta[:n*n].reshape(n, n)
    gap_det = np.linalg.det(np.eye(n) - np.dot(A, A.T))
    barrier = -np.log(gap_det)

    if gap_det < 0:
        print('negative det!')

    return barrier
def g_i(theta):
    """Gradient of -log det(I - A A^T) with respect to the entries of A.

    Evaluates 2 * (I - A A^T)^{-1} A for A = theta[:n*n] as (n, n) and returns
    it flattened to one dimension.
    """
    A = np.reshape(theta[:n*n], (n, n))
    identity = np.eye(n)
    # explicit inverse of I - A A^T, obtained by solving against the identity
    gap_inv = np.linalg.solve(identity - np.dot(A, A.T), identity)
    return np.ravel(2 * np.dot(gap_inv, A))
                       
%matplotlib inline
# Random basis with unit-norm columns.
V = np.random.normal(size=(n, n))
V /= np.sqrt((V**2).sum(axis=0))[np.newaxis, :]
# Test point: 0.99 * I conjugated by V, flattened to a vector.
# (alternative starting point: theta = np.diag(2 * np.random.uniform(0,1, n) - 1))
theta = V.dot(0.99 * np.eye(n)).dot(np.linalg.inv(V)).reshape(-1,)

A_start = theta.reshape(n, n)
print('A \n', A_start)
print('eig(A) \n', np.sort(np.abs(np.linalg.eigvals(A_start)))[::-1])

# compare the analytic gradient g_i with finite differences of f_i
grad_mismatch = check_grad(f_i, g_i, theta)
print('difference in gradient to finite-differencing value:', grad_mismatch)

# Plain gradient descent on the log-barrier alone, recording the mean
# eigenvalue magnitude and the barrier value after every step.
max_iter = 10000
EVs = np.zeros(max_iter)
fs  = np.zeros(max_iter)
report_every = max_iter // 5
for i in range(max_iter):

    # fixed step size, in-place update of the parameter vector
    theta -= 0.00001 * g_i(theta)

    eig_mags = np.sort(np.abs(np.linalg.eigvals(theta.reshape(n, n))))[::-1]
    EVs[i] = np.mean(eig_mags)
    fs[i]  = f_i(theta)

    # progress report five times over the run
    if i % report_every == 0:
        print('A \n', theta.reshape(n, n))
        print('eig(A) \n', np.sort(np.abs(np.linalg.eigvals(theta.reshape(n, n))))[::-1])
        print('\n f(A)', f_i(theta))
        print('\n')

# left panel: mean eigenvalue magnitude, right panel: barrier value
plt.figure(figsize=(20, 7))
for panel, trace in enumerate((EVs, fs), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(trace)
plt.show()