In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import random
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.distributions import Normal as norm
from termcolor import colored
from sklearn.decomposition import PCA
from scipy import linalg as LA
from scipy.stats import multivariate_normal
from scipy.stats import norm
from scipy.stats import multinomial
from scipy.stats import logistic
from Auxiliary_functions import *
from Main_functions import *
from PCAfold import preprocess
from sklearn.decomposition import PCA

In [2]:
# Label of the preprocessing variant; no later visible cell reads this value.
type_of_pre_process = "center_scaling"

In [3]:
# Input files for the hydrogen-combustion case: the 3D projection, the raw
# state space, and the basis matrix that is loaded into M_true.
path_for_projected_data, path_for_actual_data, M_path = (
    'hydrogen-combustion-3D-autoencoder-projection-of-state-space.csv',
    'RawData/hydrogen-combustion-state-space.csv',
    'Matrices/hydrogen-combustion-3D-autoencoder-basis.csv',
)

In [4]:
# path_for_projected_data = 'syngas-combustion-3D-autoencoder-projection-of-state-space.csv'
# path_for_actual_data = 'RawData/syngas-combustion-state-space.csv'
# M_path = 'Matrices/syngas-combustion-3D-autoencoder-basis.csv'

In [5]:
# Read the reference basis matrix from CSV and show its dimensions.
M_true = np.genfromtxt(M_path, delimiter=',')
M_true.shape

(9, 3)

In [6]:
# Ambient dimension = number of rows of the reference matrix.
d = len(M_true)

In [7]:
# Read the unprocessed state-space samples from CSV and show their dimensions.
X_raw = np.genfromtxt(path_for_actual_data, delimiter=',')
X_raw.shape

(30000, 9)

In [8]:
# Read the precomputed low-dimensional projection of the state space.
X_projected = np.genfromtxt(path_for_projected_data, delimiter=',')

In [9]:
# Experiment configuration shared by every section below.
n_labels = 2        # not read by any visible cell; kept in case other code relies on it
n, d = X_raw.shape  # n samples, d features

# `d` was first derived from M_true.shape[0]; rebinding it from X_raw is only
# safe if both sources agree, so fail loudly when they do not.
assert d == M_true.shape[0], 'X_raw and M_true disagree on the feature dimension d'

n_trn = 10000       # passed to Multi_run as n_trn (previously assigned twice in this cell)
tau_true = 0.18     # passed to Multi_run and create_report as tau_true
n_runs = 20         # number of independent trials per experiment

## 1. Working directly with the raw data

### Train the model

In [10]:
# Experiment 1: feed the raw, unscaled samples to Multi_run (k is set equal to d).
Model_list_1 = Multi_run(
    n_runs=n_runs,
    d=d,
    k=d,
    X=X_raw,
    X_projected=X_projected,
    n_trn=n_trn,
    tau_true=tau_true,
    n_iters=10001,
)

trial number 0 begins.
Average squared distance is 0.3415671549142704
Far labeled pairs 50.7%
trial number 1 begins.
Average squared distance is 0.34635898148610783
Far labeled pairs 50.78%
trial number 2 begins.
Average squared distance is 0.3401472173524321
Far labeled pairs 49.806666666666665%
trial number 3 begins.
Average squared distance is 0.34194498210243945
Far labeled pairs 50.593333333333334%
trial number 4 begins.
Average squared distance is 0.3422395284528314
Far labeled pairs 50.32666666666667%
trial number 5 begins.
Average squared distance is 0.3443236081082643
Far labeled pairs 50.92%
trial number 6 begins.
Average squared distance is 0.3403983953899765
Far labeled pairs 50.42666666666667%
trial number 7 begins.
Average squared distance is 0.34231677250547754
Far labeled pairs 50.36666666666667%
trial number 8 begins.
Average squared distance is 0.3436305906887292
Far labeled pairs 50.85333333333333%
trial number 9 begins.
Average squared distance is 0.3423569660592719

In [11]:
# Collect per-trial accuracies and relative norm errors for experiment 1.
(
    train_accuracy_list,
    test_accuracy_lis,
    r_s_norm_k_list,
    r_f_norm_k_list,
) = create_report(
    Model_list_1,
    n_runs=n_runs,
    tau_true=tau_true,
    M_true=M_true,
    k=None,
)

In [12]:
# Print the trial-averaged metrics of experiment 1 (one line per metric).
print('\n'.join([
    'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list)),
    'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis)),
    'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list)),
    'Mean Realative Frobenius Norm ({} trials) is {}'.format(n_runs, np.mean(r_f_norm_k_list)),
]))

Mean train accuracy (20 trials) is 0.506
Mean test accuracy (20 trials) is 0.5052700000000001
Mean Realative Spectral Norm (20 trials) is 2.8755485931259598e+22
Mean Realative Frobenius Norm (20 trials) is 2.723464250162629e+22


# 2. Working with normalized data

In [13]:
# Center and scale the raw data with PCAfold's '0to1' option
# (presumably per-column min-max scaling — confirm against the PCAfold docs).
X_processed, centers, scales = preprocess.center_scale(X_raw, scaling='0to1')
X_processed.shape

(30000, 9)

In [14]:
# Experiment 2: same settings as experiment 1, but on the center-scaled samples.
Model_list_2 = Multi_run(
    n_runs=n_runs,
    d=d,
    k=d,
    X=X_processed,
    X_projected=X_projected,
    n_trn=n_trn,
    tau_true=tau_true,
    n_iters=10001,
)

trial number 0 begins.
Average squared distance is 0.34181577114791206
Far labeled pairs 50.54666666666667%
trial number 1 begins.
Average squared distance is 0.34313320572865863
Far labeled pairs 49.88%
trial number 2 begins.
Average squared distance is 0.3416280441187976
Far labeled pairs 50.086666666666666%
trial number 3 begins.
Average squared distance is 0.34171470322816205
Far labeled pairs 50.333333333333336%
trial number 4 begins.
Average squared distance is 0.3450845063775451
Far labeled pairs 50.77333333333333%
trial number 5 begins.
Average squared distance is 0.34024117032926865
Far labeled pairs 50.92666666666667%
trial number 6 begins.
Average squared distance is 0.34192208699622995
Far labeled pairs 50.54%
trial number 7 begins.
Average squared distance is 0.3430994792053487
Far labeled pairs 50.58%
trial number 8 begins.
Average squared distance is 0.3414259851313652
Far labeled pairs 50.61333333333334%
trial number 9 begins.
Average squared distance is 0.3384797849430

In [15]:
# Collect per-trial accuracies and relative norm errors for experiment 2.
(
    train_accuracy_list,
    test_accuracy_lis,
    r_s_norm_k_list,
    r_f_norm_k_list,
) = create_report(
    Model_list_2,
    n_runs=n_runs,
    tau_true=tau_true,
    M_true=M_true,
    k=None,
)

In [16]:
# Print the trial-averaged metrics of experiment 2 under a section banner.
print('\n'.join([
    'Normalized Data:',
    'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list)),
    'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis)),
    'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list)),
    'Mean Realative Frobenius Norm ({} trials) is {}'.format(n_runs, np.mean(r_f_norm_k_list)),
]))

Normalized Data:
Mean train accuracy (20 trials) is 0.9957199999999998
Mean test accuracy (20 trials) is 0.99512
Mean Realative Spectral Norm (20 trials) is 0.7481676427661519
Mean Realative Frobenius Norm (20 trials) is 0.7416287512939705


# 3. Covariance normalization

In [17]:
# Recompute the '0to1' center-scaled data so this section starts from a known state.
X_processed, centers, scales = preprocess.center_scale(X_raw, scaling='0to1')
X_processed.shape

(30000, 9)

In [18]:
# Feature covariance: columns of X_processed are the variables, rows the observations.
C = np.cov(X_processed, rowvar=False)

In [19]:
# Factor C = U_c @ diag(D_C) @ V_c; scipy returns the right factor already
# transposed, so V_c is used below without a further transpose.
U_c, D_C, V_c = LA.svd(C, full_matrices=False)

In [20]:
# Display the singular values of C (scipy returns them in non-increasing order).
D_C

array([2.08273920e-01, 8.67260556e-02, 5.95708577e-02, 4.16718818e-02,
       7.27538067e-03, 5.14832151e-03, 2.23848554e-04, 3.98176294e-05,
       9.36067147e-18])

In [21]:
# Inverse square root of C, assembled from its SVD factors as U diag(1/sqrt(s)) V_c.
# NOTE(review): the smallest singular value displayed above is ~9.4e-18 (numerically
# zero), so its reciprocal square root is ~3e8 — confirm that amplifying this
# direction is intended rather than using a thresholded pseudo-inverse.
sqrt_C_inv = U_c @ np.diag(np.sqrt(D_C)**-1) @ V_c

In [22]:
# Square root of C from the same SVD factors: U diag(sqrt(s)) V_c.
# It is applied to M_true so that it undoes the data-side multiplication by sqrt_C_inv.
sqrt_C = U_c @ np.diag(np.sqrt(D_C)) @ V_c

In [23]:
# Covariance-normalize the samples by right-multiplying with C^(-1/2).
X_processed_normal = np.matmul(X_processed, sqrt_C_inv)

In [24]:
# Express the reference matrix in the normalized coordinates (left-multiply by C^(1/2)).
M_true_normal = np.matmul(sqrt_C, M_true)

In [25]:
# Peek at the first row of the transformed reference matrix.
M_true_normal[0]

array([-0.02230746, -0.00626123,  0.13828542])

In [26]:
# X_projected = genfromtxt(path_for_projected_data, delimiter = ',')
# X_projected_ = X_processed_normal @ M_true_normal
# K = np.abs(X_projected_ - X_projected) > 1e-6
# len(np.where(K.sum(axis =1) > 0)[0])

In [27]:
# Experiment 3: same settings, but on the covariance-normalized samples.
Model_list_3 = Multi_run(
    n_runs=n_runs,
    d=d,
    k=d,
    X=X_processed_normal,
    X_projected=X_projected,
    n_trn=n_trn,
    tau_true=tau_true,
    n_iters=10001,
)

trial number 0 begins.
Average squared distance is 0.3403486947359308
Far labeled pairs 50.233333333333334%
trial number 1 begins.
Average squared distance is 0.34419781881721506
Far labeled pairs 50.54%
trial number 2 begins.
Average squared distance is 0.3421442889721755
Far labeled pairs 50.193333333333335%
trial number 3 begins.
Average squared distance is 0.3406357584447579
Far labeled pairs 49.32%
trial number 4 begins.
Average squared distance is 0.34161996144547374
Far labeled pairs 50.72666666666667%
trial number 5 begins.
Average squared distance is 0.33783939113789274
Far labeled pairs 50.233333333333334%
trial number 6 begins.
Average squared distance is 0.34016865213252256
Far labeled pairs 50.28%
trial number 7 begins.
Average squared distance is 0.3428054661676667
Far labeled pairs 50.233333333333334%
trial number 8 begins.
Average squared distance is 0.3408244687128517
Far labeled pairs 50.43333333333333%
trial number 9 begins.
Average squared distance is 0.340769917156

In [28]:
# Collect metrics for experiment 3; the reference is the transformed M_true_normal.
(
    train_accuracy_list,
    test_accuracy_lis,
    r_s_norm_k_list,
    r_f_norm_k_list,
) = create_report(
    Model_list_3,
    n_runs=n_runs,
    tau_true=tau_true,
    M_true=M_true_normal,
    k=None,
)

In [29]:
# Print the trial-averaged metrics of experiment 3 under a section banner.
print('\n'.join([
    'Covariance normalization:',
    'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list)),
    'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis)),
    'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list)),
    'Mean Realative Frobenius Norm ({} trials) is {}'.format(n_runs, np.mean(r_f_norm_k_list)),
]))

Covariance normalization:
Mean train accuracy (20 trials) is 0.9981450000000001
Mean test accuracy (20 trials) is 0.9972800000000002
Mean Realative Spectral Norm (20 trials) is 0.04132048288016264
Mean Realative Frobenius Norm (20 trials) is 0.045988134828252474


In [61]:
# Re-evaluate the experiment-3 models once per value k = 0 .. d-1, passing k to
# create_report, and print the trial-averaged metrics for each k.
for k in range(d):
    (
        train_accuracy_list_k,
        test_accuracy_lis_k,
        r_s_norm_k_list_k,
        r_f_norm_k_list_k,
    ) = create_report(
        Model_list_3,
        n_runs=n_runs,
        tau_true=tau_true,
        M_true=M_true_normal,
        k=k,
    )
    # The last entry ends with '\n' so that consecutive reports are separated by a blank line.
    print('\n'.join([
        'Covariance normalization trancated $M_hat_{}:'.format(k),
        'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list_k)),
        'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis_k)),
        'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list_k)),
        'Mean Realative Frobenius Norm ({} trials) is {}\n'.format(n_runs, np.mean(r_f_norm_k_list_k)),
    ]))

Covariance normalization trancated $M_hat_0:
Mean train accuracy (20 trials) is 0.49749499999999997
Mean test accuracy (20 trials) is 0.49578000000000005
Mean Realative Spectral Norm (20 trials) is 1.0
Mean Realative Frobenius Norm (20 trials) is 1.0

Covariance normalization trancated $M_hat_1:
Mean train accuracy (20 trials) is 0.89774
Mean test accuracy (20 trials) is 0.89748
Mean Realative Spectral Norm (20 trials) is 0.30094735640574893
Mean Realative Frobenius Norm (20 trials) is 0.29388793865997037

Covariance normalization trancated $M_hat_2:
Mean train accuracy (20 trials) is 0.9787249999999998
Mean test accuracy (20 trials) is 0.97855
Mean Realative Spectral Norm (20 trials) is 0.052292734341700266
Mean Realative Frobenius Norm (20 trials) is 0.06363499343305776

Covariance normalization trancated $M_hat_3:
Mean train accuracy (20 trials) is 0.998105
Mean test accuracy (20 trials) is 0.9972099999999999
Mean Realative Spectral Norm (20 trials) is 0.041303981112096
Mean Realati

### Change only some directions based on covariance eigenvectors

In [30]:
# Recompute the '0to1' center-scaled data as the starting point for selective rescaling.
X_processed, centers, scales = preprocess.center_scale(X_raw, scaling='0to1')
X_processed.shape

(30000, 9)

In [31]:
# Covariance of the scaled features (columns are variables) and its SVD;
# the singular values are displayed as the cell output.
C = np.cov(X_processed, rowvar=False)
U_c, D_C, V_c = LA.svd(C, full_matrices=False)
D_C

array([2.08273920e-01, 8.67260556e-02, 5.95708577e-02, 4.16718818e-02,
       7.27538067e-03, 5.14832151e-03, 2.23848554e-04, 3.98176294e-05,
       9.36067147e-18])

In [32]:
# Start from unit scaling in every singular direction, then restore the true
# singular values for the five smallest directions only.
D_rescaled = np.ones(D_C.shape)
D_rescaled[-5:] = D_C[-5:]

# Rescaling matrix and its inverse, built from the SVD factors of C.
# NOTE(review): D_C[-1] is ~9.4e-18 in the output above, so the inverse scales
# that direction by ~3e8 — confirm this is intended.
Re_scale_matrix_inv = U_c @ np.diag(np.sqrt(D_rescaled) ** -1) @ V_c
Re_scale_matrix = U_c @ np.diag(np.sqrt(D_rescaled)) @ V_c

In [33]:
# Apply the inverse selective rescaling to the samples.
X_processed_rescaled = np.matmul(X_processed, Re_scale_matrix_inv)

In [34]:
# Apply the forward selective rescaling to the reference matrix.
M_true_rescaled = np.matmul(Re_scale_matrix, M_true)

In [35]:
# Experiment 4: same settings, on the selectively rescaled samples, with show_log disabled.
Model_list_4 = Multi_run(
    n_runs=n_runs,
    d=d,
    k=d,
    X=X_processed_rescaled,
    X_projected=X_projected,
    n_trn=n_trn,
    tau_true=tau_true,
    n_iters=10001,
    show_log=False,
)

trial number 0 begins.
Average squared distance is 0.3405529902841727
Far labeled pairs 50.36666666666667%
trial number 1 begins.
Average squared distance is 0.34134225158246884
Far labeled pairs 50.63333333333333%
trial number 2 begins.
Average squared distance is 0.3430093744215146
Far labeled pairs 50.54666666666667%
trial number 3 begins.
Average squared distance is 0.34328090156076874
Far labeled pairs 50.46%
trial number 4 begins.
Average squared distance is 0.34372235162731996
Far labeled pairs 51.153333333333336%
trial number 5 begins.
Average squared distance is 0.34246760144171984
Far labeled pairs 50.79333333333334%
trial number 6 begins.
Average squared distance is 0.3434814968603526
Far labeled pairs 50.60666666666667%
trial number 7 begins.
Average squared distance is 0.3452109052725585
Far labeled pairs 50.74666666666667%
trial number 8 begins.
Average squared distance is 0.3440731854835795
Far labeled pairs 51.42666666666667%
trial number 9 begins.
Average squared dista

In [36]:
# Collect metrics for experiment 4; the reference is the rescaled M_true_rescaled.
(
    train_accuracy_list,
    test_accuracy_lis,
    r_s_norm_k_list,
    r_f_norm_k_list,
) = create_report(
    Model_list_4,
    n_runs=n_runs,
    tau_true=tau_true,
    M_true=M_true_rescaled,
    k=None,
)

In [37]:
# r_s_norm_k_list

In [38]:
# r_f_norm_k_list

In [39]:
# Print the trial-averaged metrics of experiment 4 under a section banner.
print('\n'.join([
    'Covariance selective normalization:',
    'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list)),
    'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis)),
    'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list)),
    'Mean Realative Frobenius Norm ({} trials) is {}'.format(n_runs, np.mean(r_f_norm_k_list)),
]))

Covariance selective normalization:
Mean train accuracy (20 trials) is 0.9978200000000002
Mean test accuracy (20 trials) is 0.9969199999999999
Mean Realative Spectral Norm (20 trials) is 0.045575191324000654
Mean Realative Frobenius Norm (20 trials) is 0.04691193959682218


### Directions along which $X$ has no variance do not matter for $M$ at all

In [40]:
# Covariance of the scaled features (columns are variables) and its SVD;
# the singular values are displayed as the cell output.
C = np.cov(X_processed, rowvar=False)
U_c, D_C, V_c = LA.svd(C, full_matrices=False)
D_C

array([2.08273920e-01, 8.67260556e-02, 5.95708577e-02, 4.16718818e-02,
       7.27538067e-03, 5.14832151e-03, 2.23848554e-04, 3.98176294e-05,
       9.36067147e-18])

In [41]:
# Keep every singular direction at unit weight except the three smallest,
# which are zeroed out. Zeroing directions -4 .. -6 as well, and building the
# matching inverse matrix, were tried and are left disabled here.
D_rescaled = np.ones(D_C.shape)
D_rescaled[-3:] = 0

# Built with U_c on both sides (U_c.T rather than V_c, unlike the earlier cells).
Re_scale_matrix = U_c @ np.diag(np.sqrt(D_rescaled)) @ U_c.T

In [42]:
# Remove the zeroed directions from the reference matrix.
M_true_rescaled = np.matmul(Re_scale_matrix, M_true)

In [43]:
# Reload the stored projection and compare it with X_processed @ M_true_rescaled.
X_projected = np.genfromtxt(path_for_projected_data, delimiter=',')
X_projected_ = np.matmul(X_processed, M_true_rescaled)

# K flags entries that differ by more than 1e-3; the cell output is the number
# of rows containing at least one flagged entry.
K = np.abs(X_projected_ - X_projected) > 1e-3
int(K.any(axis=1).sum())

30000

In [44]:
# Euclidean distance between samples 500 and 14000 in the recomputed projection.
LA.norm(X_projected_[500] - X_projected_[14000])

0.9962696039381317

In [45]:
# Same pairwise distance in the stored projection, for comparison with the value above.
LA.norm(X_projected[500] - X_projected[14000])

0.9950315860354604

In [46]:
# Experiment 5: center-scaled samples again, with twice the iterations (20001) and show_log disabled.
Model_list_5 = Multi_run(
    n_runs=n_runs,
    d=d,
    k=d,
    X=X_processed,
    X_projected=X_projected,
    n_trn=n_trn,
    tau_true=tau_true,
    n_iters=20001,
    show_log=False,
)

trial number 0 begins.
Average squared distance is 0.3407682187596569
Far labeled pairs 49.99333333333333%
trial number 1 begins.
Average squared distance is 0.34302519925915187
Far labeled pairs 51.36%
trial number 2 begins.
Average squared distance is 0.3438394171108607
Far labeled pairs 50.89333333333333%
trial number 3 begins.
Average squared distance is 0.3373585851572562
Far labeled pairs 50.473333333333336%
trial number 4 begins.
Average squared distance is 0.3415589976585544
Far labeled pairs 50.4%
trial number 5 begins.
Average squared distance is 0.3422419539511573
Far labeled pairs 51.22%
trial number 6 begins.
Average squared distance is 0.34380255881260385
Far labeled pairs 50.733333333333334%
trial number 7 begins.
Average squared distance is 0.34387036440431207
Far labeled pairs 50.38666666666666%
trial number 8 begins.
Average squared distance is 0.34056730767503746
Far labeled pairs 50.766666666666666%
trial number 9 begins.
Average squared distance is 0.33833544052575

In [47]:
# Collect metrics for experiment 5; the reference is M_true with the zeroed directions removed.
(
    train_accuracy_list,
    test_accuracy_lis,
    r_s_norm_k_list,
    r_f_norm_k_list,
) = create_report(
    Model_list_5,
    n_runs=n_runs,
    tau_true=tau_true,
    M_true=M_true_rescaled,
    k=None,
)

In [48]:
# Print the trial-averaged metrics of experiment 5 under a section banner.
print('\n'.join([
    'M_true normalization:',
    'Mean train accuracy ({} trials) is {}'.format(n_runs, np.mean(train_accuracy_list)),
    'Mean test accuracy ({} trials) is {}'.format(n_runs, np.mean(test_accuracy_lis)),
    'Mean Realative Spectral Norm ({} trials) is {}'.format(n_runs, np.mean(r_s_norm_k_list)),
    'Mean Realative Frobenius Norm ({} trials) is {}'.format(n_runs, np.mean(r_f_norm_k_list)),
]))

M_true normalization:
Mean train accuracy (20 trials) is 0.9967450000000001
Mean test accuracy (20 trials) is 0.9962900000000001
Mean Realative Spectral Norm (20 trials) is 0.2507505406485447
Mean Realative Frobenius Norm (20 trials) is 0.30587536931195786
