In [1]:
import numpy as np
from sklearn.datasets import make_sparse_spd_matrix
from scipy import linalg as LA
import pandas as pd
import time, os
import matplotlib.pyplot as plt

from pyinstrument import Profiler

from infoband.band_info import InfoCorrBand
from wlpy.covariance import Covariance
from utils.adpt_correlation_threshold import AdptCorrThreshold
from wlpy.gist import heatmap

import warnings
warnings.filterwarnings("ignore")

from my_api import *

# data processing

Use the error rate as our metric.
$$ \frac{\| A - \hat A \|}{\| A \|} $$

### Results of other methods

In [2]:
df = pd.read_csv('other_methods.csv')
df.drop(columns = df.columns[0], inplace = True)
df.head(3)

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink
0,100,100,0.8,fro,R,0.437159,0.412584,0.437159,0.405599,
1,100,100,0.8,fro,S,0.468781,0.423442,0.468781,0.419056,
2,100,100,0.8,2,R,0.419901,0.411377,0.348096,0.455421,


### Results of our proposed methods

In [3]:
data_path = 'data_2023-01-17/'

In [4]:
def compute_error(row, eta):
    N, T, rho, ord, type = row['N'], row['T'], row['rho'], row['ord'], row['type']
    if type == 'R':
        cov_cor = 'cor'
    else:
        cov_cor = 'cov'
    file_name = get_title_1(ord = ord, cov_cor = cov_cor, eta = eta, N = N, T = T, rho = rho, draw_type = 'random')
    file_path = data_path + file_name + '.txt'
    try:
        data = np.loadtxt(file_path)
        return data.mean()
    except FileNotFoundError as e:
        return None

In [5]:
eta_range = [0.5, 0.8, 0.95, 1]
for eta in eta_range:
    df['Info Band eta={}'.format(eta)] = df.apply(func = compute_error, axis = 1, args = (eta, ))
df.head(3)

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink,Info Band eta=0.5,Info Band eta=0.8,Info Band eta=0.95,Info Band eta=1
0,100,100,0.8,fro,R,0.437159,0.412584,0.437159,0.405599,,0.423696,0.330569,0.225972,0.226556
1,100,100,0.8,fro,S,0.468781,0.423442,0.468781,0.419056,,0.449804,0.365138,0.276803,0.277256
2,100,100,0.8,2,R,0.419901,0.411377,0.348096,0.455421,,0.363267,0.298962,0.237871,0.238219


# find conditions with best performance

Given (N, T, rho, ord, type), find the method with lowest error rate in the list $other\_cols$, which is defined as:

In [6]:
other_cols = ['Sample', 'Soft Threshold', 'Hard Threshold', 'Linear Shrink', 'Nonlinear Shrink']

In [7]:
# type(row) = pandas.Series, with indexes ['Sample', 'Soft Threshold', ...]
get_min_key = lambda row: min(row.index, key = lambda x: row[x]) 
df['min_other_key'] = df[other_cols].apply(func = get_min_key, axis = 1)
get_min_val = lambda row: min(row)
df['min_other_val'] = df[other_cols].apply(func = get_min_val, axis = 1)
df['min_other_key'].value_counts()

Soft Threshold      39
Linear Shrink       12
Hard Threshold      12
Nonlinear Shrink     7
Sample               2
Name: min_other_key, dtype: int64

For each row, given $\eta$, use the ratio 
$$ \frac{\text{lowest error rate of other methods (i.e. }min\_other\_val)}{\text{error rate of the proposed estimator}} $$

as a rule to find the parameters (N, T, rho) with best results.

In [8]:
for eta in eta_range:
    df['ratio_eta={}'.format(eta)] = df['min_other_val'] / df['Info Band eta={}'.format(eta)]
df.head(3)

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink,Info Band eta=0.5,Info Band eta=0.8,Info Band eta=0.95,Info Band eta=1,min_other_key,min_other_val,ratio_eta=0.5,ratio_eta=0.8,ratio_eta=0.95,ratio_eta=1
0,100,100,0.8,fro,R,0.437159,0.412584,0.437159,0.405599,,0.423696,0.330569,0.225972,0.226556,Linear Shrink,0.405599,0.957286,1.226973,1.794911,1.790279
1,100,100,0.8,fro,S,0.468781,0.423442,0.468781,0.419056,,0.449804,0.365138,0.276803,0.277256,Linear Shrink,0.419056,0.931642,1.147666,1.513913,1.511442
2,100,100,0.8,2,R,0.419901,0.411377,0.348096,0.455421,,0.363267,0.298962,0.237871,0.238219,Hard Threshold,0.348096,0.958236,1.164347,1.463378,1.461242


Sort the computed ratio.

In [35]:
line_num = 2
param_cols = ['N', 'T', 'rho', 'ord', 'type']
ratio_cols = ['ratio_eta={}'.format(eta) for eta in eta_range]
for eta in eta_range:
    # ratio_col = 'ratio_eta={}'.format(eta)
    tmp = df.sort_values(by = ratio_col, ascending = False)[param_cols + ratio_cols]
    print(tmp.head(line_num))

      N    T  rho  ord type  ratio_eta=0.5  ratio_eta=0.8  ratio_eta=0.95  \
66  500  500  0.8    2    R       1.235967       1.956332        2.559690   
40  300  500  0.8  fro    R       0.947608       1.382823        2.569774   

    ratio_eta=1  
66     2.826260  
40     2.820157  
      N    T  rho  ord type  ratio_eta=0.5  ratio_eta=0.8  ratio_eta=0.95  \
66  500  500  0.8    2    R       1.235967       1.956332        2.559690   
40  300  500  0.8  fro    R       0.947608       1.382823        2.569774   

    ratio_eta=1  
66     2.826260  
40     2.820157  
      N    T  rho  ord type  ratio_eta=0.5  ratio_eta=0.8  ratio_eta=0.95  \
66  500  500  0.8    2    R       1.235967       1.956332        2.559690   
40  300  500  0.8  fro    R       0.947608       1.382823        2.569774   

    ratio_eta=1  
66     2.826260  
40     2.820157  
      N    T  rho  ord type  ratio_eta=0.5  ratio_eta=0.8  ratio_eta=0.95  \
66  500  500  0.8    2    R       1.235967       1.956332        

In [23]:
help('pandas.DataFrame.sort_values')

Help on function sort_values in pandas.DataFrame:

pandas.DataFrame.sort_values = sort_values(self, by: 'IndexLabel', axis: 'Axis' = 0, ascending: 'bool | list[bool] | tuple[bool, ...]' = True, inplace: 'bool' = False, kind: 'str' = 'quicksort', na_position: 'str' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc' = None) -> 'DataFrame | None'
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a

In [16]:
df.iloc[50]

N                                500
T                                100
rho                              0.8
ord                                2
type                               R
Sample                      1.476639
Soft Threshold               0.49666
Hard Threshold              0.883962
Linear Shrink               0.763013
Nonlinear Shrink                 NaN
Info Band eta=0.5           0.338383
Info Band eta=0.8           0.250428
Info Band eta=0.95          0.217734
Info Band eta=1             0.224883
min_other_key         Soft Threshold
min_other_val                0.49666
ratio_eta=0.5               1.467744
ratio_eta=0.8               1.983244
ratio_eta=0.95              2.281038
ratio_eta=1                 2.208524
Name: 50, dtype: object

0.4055987011878901

In [86]:
get_min_key(x)

'Linear Shrink'

In [64]:
df[['min_other_key', 'Sample']]

Unnamed: 0,min_other_key,Sample
0,0.437159,0.437159
1,0.468781,0.468781
2,0.419901,0.419901
3,0.487316,0.487316
4,0.183964,0.183964
...,...,...
67,0.584031,0.584031
68,0.221329,0.221329
69,0.233995,0.233995
70,0.236340,0.236340


In [10]:
df[df['Info Band eta=0.5'].isnull() == False].head()

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink,Info Band eta=0.5,Info Band eta=0.8,Info Band eta=0.95,Info Band eta=1
24,300,100,0.8,fro,R,0.7884,0.341773,0.880962,0.575247,,0.391015,0.299953,0.220001,0.220003
25,300,100,0.8,fro,S,0.839914,0.359178,0.884476,0.581832,,0.433204,0.357403,0.295806,0.295807
26,300,100,0.8,2,R,0.978462,0.398644,0.887738,0.635389,,0.289494,0.196713,0.198978,
27,300,100,0.8,2,S,1.118217,0.444021,0.887002,0.631918,,0.324393,0.27838,0.333266,
28,300,100,0.95,fro,R,0.345589,0.288438,0.973157,0.310277,,0.447366,0.321296,0.231754,0.204468


# Query

In [44]:
tmp = df[(df['ord'] == 'fro') & (df['type'] == 'S') & (df['N'] > df['T'])]
tmp.shape

(6, 14)

In [45]:
tmp

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink,Info Band eta=0.5,Info Band eta=0.8,Info Band eta=0.95,Info Band eta=1
25,300,100,0.8,fro,S,0.839914,0.359178,0.884476,0.581832,,0.433204,0.357403,0.295806,0.295807
29,300,100,0.95,fro,S,0.439947,0.285761,0.97401,0.349928,,0.497606,0.394539,0.327522,0.309901
49,500,100,0.8,fro,S,1.047401,0.41941,0.880104,0.679519,,0.396283,0.319619,0.262429,0.267048
53,500,100,0.95,fro,S,0.518841,0.392081,0.973919,0.450736,,0.47564,0.356634,0.274209,0.251642
57,500,300,0.8,fro,S,0.609856,0.273069,0.883377,0.508406,,0.33524,0.244207,0.17148,0.164404
61,500,300,0.95,fro,S,0.314768,0.2355,0.973713,0.297102,,0.408357,0.295583,0.202708,0.169351


In [47]:
tmp = df[(df['N'] == 500)]
tmp.head()

Unnamed: 0,N,T,rho,ord,type,Sample,Soft Threshold,Hard Threshold,Linear Shrink,Nonlinear Shrink,Info Band eta=0.5,Info Band eta=0.8,Info Band eta=0.95,Info Band eta=1
48,500,100,0.8,fro,R,1.046652,0.403661,0.877496,0.677967,,0.371348,0.285833,0.218473,0.224191
49,500,100,0.8,fro,S,1.047401,0.41941,0.880104,0.679519,,0.396283,0.319619,0.262429,0.267048
50,500,100,0.8,2,R,1.476639,0.49666,0.883962,0.763013,,0.338383,0.250428,0.217734,0.224883
51,500,100,0.8,2,S,1.469895,0.530165,0.888774,0.766108,,0.396817,0.32242,0.296515,0.299564
52,500,100,0.95,fro,R,0.499266,0.379502,0.973517,0.438361,,0.459265,0.331139,0.237467,0.21042
53,500,100,0.95,fro,S,0.518841,0.392081,0.973919,0.450736,,0.47564,0.356634,0.274209,0.251642
54,500,100,0.95,2,R,0.638519,0.726931,0.974028,0.447506,,0.320237,0.225475,0.179517,0.171598
55,500,100,0.95,2,S,0.691658,0.737535,0.974306,0.46681,,0.367239,0.282871,0.248989,0.260641
56,500,300,0.8,fro,R,0.611694,0.258052,0.882593,0.505437,,0.324812,0.22863,0.147491,0.139134
57,500,300,0.8,fro,S,0.609856,0.273069,0.883377,0.508406,,0.33524,0.244207,0.17148,0.164404
