# Stochastic gradient descent
- three variants
    - size-1 mini-batches (batch_size = 1), zips through all observed entries of $\mbox{cov}(y_{t+m},y_t)$ and computes gradients from one entry at a time
    - column mini-batches (batch_size = p), zips through all columns of $\mbox{cov}(y_{t+m},y_t)$ and computes gradients from the observed entries of one column at a time
    - batch-gradients (batch_size = None), computes a full gradient using all observed entries of $\mbox{cov}(y_{t+m},y_t)$ at the same time

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.optimize import fmin_bfgs, check_grad
import glob, os

os.chdir('../core')
from utility import get_subpop_stats, comp_model_covariances
from SSID_Hankel_loss import l2_sis_setup, g_l2_Hankel_sis, plot_outputs_l2_gradient_test, yy_Hankel_cov_mat
from SSID_Hankel_loss import l2_sis_draw, adam_zip
os.chdir('../dev')

# Problem size: in the next cell C has shape (p, n) and A has shape (n, n).
p, n = 10, 3
# k and l are forwarded to l2_sis_setup / g_l2_Hankel_sis, and k + l is the
# lag count given to comp_model_covariances
# (presumably the Hankel block dimensions -- TODO confirm).
k, l = 3, 3

# None selects full batch gradients (see the list of variants at the top).
batch_size = None

# Settings forwarded to adam_zip (presumably Adam's step size, the two moment
# decay rates and epsilon -- verify against adam_zip).
a, b1, b2, e = 0.001, 0.9, 0.99, 1e-8
max_iter = 1000

# Two index ranges over the p observed variables that overlap around p // 2.
sub_pops = (np.arange(p // 2 + 1), np.arange(p // 2 - 1, p))
if p < 200:
    print('sub_pops', sub_pops)

# Observation-scheme bookkeeping derived from the sub-populations.
(obs_idx, idx_grp, co_obs, overlaps, overlap_grp,
 idx_overlap, Om, Ovw, Ovc) = get_subpop_stats(sub_pops, p, verbose=False)

for rep in range(1):
    # ---- ground-truth parameters -------------------------------------------
    C_true = np.random.normal(size=(p, n))

    # A_true = V diag(0.7 .. 0.95) V^{-1}, where V has unit-norm columns, so
    # the eigenvalues of A_true are the n linearly spaced values.
    V = np.random.normal(size=(n, n))
    V /= np.sqrt((V ** 2).sum(axis=0)).reshape(1, -1)
    A_true = V.dot(np.diag(np.linspace(0.7, 0.95, n))).dot(np.linalg.inv(V))

    B_true = np.random.normal(size=(n, n)) / np.sqrt(n)
    Pi_true = B_true.dot(B_true.T)

    # Covariances for k + l lags: once with the mask Om, once without it.
    Qs = comp_model_covariances(
        {'A': A_true, 'Pi': Pi_true, 'C': C_true}, k + l, Om)
    Qs_full = comp_model_covariances(
        {'A': A_true, 'Pi': Pi_true, 'C': C_true}, k + l)

    # ---- initial parameters -------------------------------------------------
    A_0 = np.diag(np.random.uniform(low=0.7, high=0.8, size=n))
    B_0 = np.eye(n)
    Pi_0 = B_0.dot(B_0.T)
    C_0 = np.random.normal(size=(p, n))
    # Flat parameter vector laid out as [A (n*n) | B (n*n) | C (p*n)].
    pars_0 = np.concatenate([A_0.ravel(), B_0.ravel(), C_0.ravel()])

    # ---- loss and gradient --------------------------------------------------
    f_i, _ = l2_sis_setup(k, l, n, Qs, Om, idx_grp, obs_idx)

    def g_i(theta, idx_use, idx_co):
        """Gradient at theta with k, l, n and Qs fixed to the current values."""
        return g_l2_Hankel_sis(theta, k, l, n, Qs, idx_use, idx_co)

    def converged(theta_old, theta, e, t):
        """Stop at max_iter, or once the loss changes by less than e."""
        if t < max_iter:
            loss_change = f_i(theta_old) - f_i(theta)
            return np.abs(loss_change) < e
        return True

    # ---- optimisation -------------------------------------------------------
    print('starting descent')
    pars_est_vec, fs = adam_zip(
        f_i, g_i, pars_0.copy(), a, b1, b2, e, max_iter,
        converged, Om, idx_grp, co_obs, batch_size)

    # Unpack the flat estimate using the same layout as pars_0.
    A_est = np.reshape(pars_est_vec[:n * n], (n, n))
    B_est = np.reshape(pars_est_vec[n * n:2 * n * n], (n, n))
    Pi_est = np.dot(B_est, B_est.T)
    C_est = np.reshape(pars_est_vec[-p * n:], (p, n))

    # ---- diagnostics --------------------------------------------------------
    pars_init = {'A': A_0, 'C': C_0, 'Pi': Pi_0, 'B': B_0}
    pars_est = {'A': A_est, 'C': C_est, 'Pi': Pi_est, 'B': B_est}
    pars_true = {'A': A_true, 'C': C_true, 'Pi': Pi_true, 'B': B_true}
    plot_outputs_l2_gradient_test(
        pars_true, pars_init, pars_est, k, l, Qs,
        Qs_full, Om, Ovc, Ovw, f_i, g_i, if_flip=True)

    # Values returned by adam_zip as fs (truncated to max_iter entries).
    plt.figure(figsize=(20, 8))
    plt.plot(fs[:max_iter])
    plt.show()
    


# Do another round (same number of steps)

In [None]:
# Second optimisation round, warm-started from the current estimate.
# The positional arguments mirror the first adam_zip call: max_iter has to sit
# between e and converged (leaving it out shifts every later argument by one
# slot), and batch_size is passed so both rounds use the same gradient variant.
# NOTE(review): no v_0 is passed because nothing in this notebook defines it;
# if adam_zip can return and accept its moment estimates, carry them over here.
pars_est_vec, fs = adam_zip(f_i, g_i, pars_est_vec.copy(), a, b1, b2, e, max_iter,
                            converged, Om, idx_grp, co_obs, batch_size)

In [None]:
# Unpack the flat estimate: A is the first n*n entries, B the next n*n,
# and C the last p*n.
A_est = np.reshape(pars_est_vec[:n * n], (n, n))
B_est = np.reshape(pars_est_vec[n * n:2 * n * n], (n, n))
Pi_est = np.dot(B_est, B_est.T)
C_est = np.reshape(pars_est_vec[-p * n:], (p, n))

pars_init = {'A': A_0, 'C': C_0, 'Pi': Pi_0, 'B': B_0}
pars_est = {'A': A_est, 'C': C_est, 'Pi': Pi_est, 'B': B_est}
pars_true = {'A': A_true, 'C': C_true, 'Pi': Pi_true, 'B': B_true}

# NOTE(review): the literal 5, 5 differs from the k, l passed to the same
# helper after the first round, while Qs was computed for k + l lags --
# confirm that this is intended.
plot_outputs_l2_gradient_test(
    pars_true, pars_init, pars_est, 5, 5, Qs,
    Qs_full, Om, Ovc, Ovw, f_i, g_i, if_flip=True)

# Values returned by the latest adam_zip call as fs.
plt.figure(figsize=(20, 8))
plt.plot(fs[:max_iter])
plt.show()


In [None]:
os.chdir('../core')
from utility import get_subpop_stats, comp_model_covariances
from SSID_Hankel_loss import l2_sis_setup, g_l2_Hankel_sis, plot_outputs_l2_gradient_test, yy_Hankel_cov_mat
os.chdir('../dev')

# Independent copies of the estimated parameters, unpacked from the flat vector.
A_est = np.reshape(pars_est_vec[:n * n], (n, n)).copy()
B_est = np.reshape(pars_est_vec[n * n:2 * n * n], (n, n)).copy()
Pi_est = np.dot(B_est, B_est.T).copy()
C_est = np.reshape(pars_est_vec[-p * n:], (p, n)).copy()

# Hankel covariance matrices of the true model: once with an all-True mask,
# once with the mask Om. Exact zeros in the masked version are replaced by
# NaN (presumably these mark the unobserved entries -- TODO confirm).
H_true = yy_Hankel_cov_mat(C_true, A_true, Pi_true, 5, 5,
                           np.ones((p, p), dtype=bool))
H_emp = yy_Hankel_cov_mat(C_true, A_true, Pi_true, 5, 5, Om)
H_emp[H_emp == 0] = np.nan

# The same two matrices for the estimated model.
H_est = yy_Hankel_cov_mat(C_est, A_est, Pi_est, 5, 5,
                          np.ones((p, p), dtype=bool))
H_obs = yy_Hankel_cov_mat(C_est, A_est, Pi_est, 5, 5, Om)
H_obs[H_obs == 0] = np.nan

fig, (ax_hidden, ax_seen) = plt.subplots(1, 2, figsize=(20, 9))

# Left panel: entries that are NaN in the masked matrices, true vs. estimated.
ax_hidden.plot(H_true[np.isnan(H_emp)], H_est[np.isnan(H_obs)], 'k.')
ax_hidden.set_xlabel('true un-observed covariances(all time lags)')
ax_hidden.set_ylabel('est. stitchted covariances(all time lags)')

# Right panel: the masked matrices against each other (NaN entries are not drawn).
ax_seen.plot(H_emp, H_obs, 'k.')
ax_seen.set_xlabel('true observed covariances(all time lags)')
ax_seen.set_ylabel('est. observed covariances(all time lags)')

plt.show()

In [None]:
from scipy.io import savemat # store results for comparison with Matlab code   

# Flat ground-truth parameter vector, laid out as [A (n*n) | B (n*n) | C (p*n)].
pars_true_vec = np.concatenate(
    [A_true.ravel(), B_true.ravel(), C_true.ravel()])

# Everything below is written relative to the fits directory; the working
# directory stays there afterwards.
os.chdir('../fits/')

# NOTE(review): the name is hard-coded and does not track the p, n chosen in
# the configuration cell.
save_file = 'p500n20r2.npz'

# Matrices exported for MATLAB: true, initial and estimated parameters.
save_file_m = {
    'A_true': A_true,
    'B_true': B_true,
    'Pi_true': Pi_true,
    'C_true': C_true,
    'A_0': A_0,
    'B_0': B_0,
    'Pi_0': Pi_0,
    'C_0': C_0,
    'A_est': A_est,
    'B_est': B_est,
    'Pi_est': Pi_est,
    'C_est': C_est,
}

# NOTE(review): save_file ends in '.npz'; savemat appends '.mat' to names that
# lack it by default, so the MATLAB file should end up as
# 'p500n20r2.npz.mat' -- confirm.
savemat(save_file, save_file_m)

# Flat parameter vectors (initial, true, estimated) in NumPy's .npz format.
np.savez(save_file,
         pars_0_vec=pars_0,
         pars_true_vec=pars_true_vec,
         pars_est_vec=pars_est_vec)