# Heuristic Code

In [1]:
import numpy as np
from lpm_methods_python import pca as find_subset


# Examples from "Solving Large-Scale Sparse PCA to Certifiable (Near) Optimality"

The data and the optimal values of the optimization problems can be found in the GitHub repository associated with the paper "Solving Large-Scale Sparse PCA to Certifiable (Near) Optimality", at https://github.com/ryancorywright/ScalableSPCA.jl

In [2]:
import time
def max_eigenvalue(A, T):
    """Return the largest eigenvalue of the principal submatrix of ``A`` indexed by ``T``.

    Parameters
    ----------
    A : array_like, 2-D
        Square matrix. It is treated as symmetric: ``np.linalg.eigvalsh`` only
        reads one triangle of the submatrix it is given.
    T : iterable of int
        Row/column indices selecting the principal submatrix.

    Returns
    -------
    numpy floating scalar
        The largest eigenvalue of the selected submatrix.

    Raises
    ------
    ValueError
        If ``T`` is empty (there is no submatrix to decompose).
    """
    idx = list(T)
    if not idx:
        raise ValueError("T must contain at least one index")
    # Vectorized extraction of the |T| x |T| block. The transpose keeps the
    # triangle read by eigvalsh identical to the previous element-by-element
    # construction, which matters only if A is not exactly symmetric.
    sub = np.asarray(A)[np.ix_(idx, idx)].T
    return np.linalg.eigvalsh(sub).max()
def test(data, k, opt_val, redund=2, method = find_subset):
    """Run a subset-selection heuristic on ``data`` with fixed ``k`` and print a report.

    Parameters
    ----------
    data : 2-D array
        Matrix handed to ``method`` and to ``max_eigenvalue``.
    k : int
        Size of the index set requested from ``method``.
    opt_val : float
        Reference optimal value; the printed gap is
        ``(opt_val - value) / opt_val``, so it is negative whenever the found
        value exceeds ``opt_val``.
    redund : int, optional
        Forwarded to ``method`` as the keyword argument ``redund``.
    method : callable, optional
        Called as ``method(data, k, redund=redund)``; must return the selected
        indices.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    T = method(data, k, redund = redund)
    total_time = time.perf_counter() - start_time
    print("Found set: ", T)
    value = max_eigenvalue(data, T)
    print("with value ",value)
    print("With gap ", (opt_val - value)/opt_val)
    print("in {} seconds".format(total_time))
    print("")

In [69]:
def easy_heuristic(A, k, redund = None):
    """Greedy baseline: grow an index set one element at a time.

    At each of the ``k`` steps, the index whose addition yields the largest
    ``max_eigenvalue(A, T + [i])`` is appended to ``T``. Ties are broken in
    favour of the smallest index, because only a strictly better score
    replaces the current best.

    Parameters
    ----------
    A : 2-D array
        Square matrix; ``len(A)`` is the number of candidate indices.
    k : int
        Number of indices to select.
    redund : ignored
        Accepted only so the function can be passed as ``method`` to ``test``,
        which always supplies ``redund``.

    Returns
    -------
    list of int
        The selected indices, in the order they were chosen.

    Raises
    ------
    ValueError
        If ``k`` exceeds the number of available indices, or if no candidate
        has a comparable score at some step (e.g. every score is NaN).
    """
    n = len(A)
    if k > n:
        raise ValueError(
            "cannot select k = {} indices from a matrix of size {}".format(k, n)
        )
    T = []
    chosen = set()  # O(1) membership test mirroring the contents of T
    for _ in range(k):
        # -inf instead of an arbitrary finite sentinel, so that matrices whose
        # eigenvalues are all very negative are still handled correctly.
        best = float("-inf")
        best_i = -1
        for i in range(n):
            if i in chosen:
                continue
            score = max_eigenvalue(A, T + [i])
            if score > best:
                best = score
                best_i = i
        if best_i < 0:
            # Never append the placeholder index: -1 would silently alias the
            # last row/column of A.
            raise ValueError("no candidate index produced a comparable score")
        T.append(best_i)
        chosen.add(best_i)
    return T

## Wine dataset

In [5]:
# This dataset was taken directly from ScalableSPCA.jl
# 13 x 13 matrix with ones on the diagonal (presumably the correlation matrix
# of the Wine data -- confirm against the upstream repository).
normwine=[[1.0       ,  0.0943969 ,  0.211545   , -0.310235  ,  0.270798  ,  0.289101 ,   0.236815 , -0.155929 ,  0.136698    , 0.546364   ,-0.0717472 ,  0.0723432 ,   0.64372   ],
[0.0943969 ,  1.0       ,  0.164045   ,  0.2885    , -0.0545751 , -0.335167 ,  -0.411007 ,  0.292977 , -0.220746    , 0.248985   ,-0.561296  , -0.36871   ,  -0.192011  ],
[0.211545  ,  0.164045  ,  1.0        ,  0.443367  ,  0.286587  ,  0.12898  ,   0.115077 ,  0.18623  ,  0.00965194  , 0.258887   ,-0.0746669 ,  0.00391123,   0.223626  ],
[-0.310235 ,   0.2885   ,   0.443367  ,   1.0      ,  -0.0833331,  -0.321113,   -0.35137 ,   0.361922,  -0.197327   ,  0.018732  , -0.273955 ,  -0.276769 ,   -0.440597 ],
[0.270798  , -0.0545751 ,  0.286587   , -0.0833331 ,  1.0       ,  0.214401 ,   0.195784 , -0.256294 ,  0.236441    , 0.19995    , 0.0553982 ,  0.0660039 ,   0.393351  ],
[0.289101  , -0.335167  ,  0.12898    , -0.321113  ,  0.214401  ,  1.0      ,   0.864564 , -0.449935 ,  0.612413    ,-0.0551364  , 0.433681  ,  0.699949  ,   0.498115  ],
[0.236815  , -0.411007  ,  0.115077   , -0.35137   ,  0.195784  ,  0.864564 ,   1.0      , -0.5379   ,  0.652692    ,-0.172379   , 0.543479  ,  0.787194  ,   0.494193  ],
[-0.155929 ,   0.292977 ,   0.18623   ,   0.361922 ,  -0.256294 ,  -0.449935,   -0.5379  ,   1.0     ,  -0.365845   ,  0.139057  , -0.26264  ,  -0.50327  ,   -0.311385 ],
[0.136698  , -0.220746  ,  0.00965194 , -0.197327  ,  0.236441  ,  0.612413 ,   0.652692 , -0.365845 ,  1.0         ,-0.0252499  , 0.295544  ,  0.519067  ,   0.330417  ],
[0.546364  ,  0.248985  ,  0.258887   ,  0.018732  ,  0.19995   , -0.0551364,  -0.172379 ,  0.139057 , -0.0252499   , 1.0        ,-0.521813  , -0.428815  ,   0.3161    ],
[-0.0717472,  -0.561296 ,  -0.0746669 ,  -0.273955 ,   0.0553982,   0.433681,    0.543479,  -0.26264 ,   0.295544   , -0.521813  ,  1.0      ,   0.565468 ,    0.236183 ],
[0.0723432 , -0.36871   ,  0.00391123 , -0.276769  ,  0.0660039 ,  0.699949 ,   0.787194 , -0.50327  ,  0.519067    ,-0.428815   , 0.565468  ,  1.0       ,   0.312761  ],
[0.64372   , -0.192011  ,  0.223626   , -0.440597  ,  0.393351  ,  0.498115 ,   0.494193 , -0.311385 ,  0.330417    , 0.3161     , 0.236183  ,  0.312761  ,   1.0]]
# Convert the nested list to an ndarray so it supports 2-D NumPy indexing.
normwine=np.array(normwine)
# Run the default heuristic on the Wine matrix for both subset sizes. The
# second number in each pair is the reference optimum used for the gap.
for k, opt_val in ((5, 3.43978), (10, 4.59429)):
    print("Running test with Wine data and k = {}".format(k))
    test(normwine, k, opt_val)

Running test with Wine data and k = 5
Found set:  [6, 5, 11, 8, 7]
with value  3.439778719536773
With gap  3.722514890500853e-07
in 48.144259452819824 seconds

Running test with Wine data and k = 10
Found set:  [6, 5, 11, 8, 10, 7, 12, 1, 3, 0]
with value  4.59429342595467
With gap  -7.456983930438847e-07
in 190.97714018821716 seconds



In [74]:
# Same Wine experiments, but with the greedy baseline as the selection method.
for k, opt_val in ((5, 3.43978), (10, 4.59429)):
    print("Running control with Wine data and k = {}".format(k))
    test(normwine, k, opt_val, method=easy_heuristic)

Running control with Wine data and k = 5
Found set:  [0, 12, 9, 4, 5]
with value  2.4083398570127583
With gap  0.2998564277329485
in 0.0031309127807617188 seconds

Running control with Wine data and k = 10
Found set:  [0, 12, 9, 4, 5, 6, 11, 8, 7, 10]
with value  4.2555610653666704
With gap  0.07372824411026067
in 0.009728431701660156 seconds



## Pitprops dataset

In [42]:
# This dataset was taken directly from ScalableSPCA.jl
# 13 x 13 matrix with ones on the diagonal (presumably the correlation matrix
# of the Pitprops data -- confirm against the upstream repository).
pitprops=[[1,0.954,0.364,0.342,-0.129,0.313,0.496,0.424,0.592,0.545,0.084,-0.019,0.134],
       [0.954,1,0.297,0.284,-0.118,0.291,0.503,0.419,0.648,0.569,0.076,-0.036,0.144],
       [0.364,0.297,1,0.882,-0.148,0.153,-0.029,-0.054,0.125,-0.081,0.162,0.22,0.126],
       [0.342,0.284,0.882,1,0.22,0.381,0.174,-0.059,0.137,-0.014,0.097,0.169,0.015],
       [-0.129,-0.118,-0.148,0.22,1,0.364,0.296,0.004,-0.039,0.037,-0.091,-0.145,-0.208],
       [0.313,0.291,0.153,0.381,0.364,1,0.813,0.09,0.211,0.274,-0.036,0.024,-0.329],
       [0.496,0.503,-0.029,0.174,0.296,0.813,1,0.372,0.465,0.679,-0.113,-0.232,-0.424],
       [0.424,0.419,-0.054,-0.059,0.004,0.09,0.372,1,0.482,0.557,0.061,-0.357,-0.202],
       [0.592,0.648,0.125,0.137,-0.039,0.211,0.465,0.482,1,0.526,0.085,-0.127,-0.076],
       [0.545,0.569,-0.081,-0.014,0.037,0.274,0.679,0.557,0.526,1,-0.319,-0.368,-0.291],
       [0.084,0.076,0.162,0.097,-0.091,-0.036,-0.113,0.061,0.085,-0.319,1,0.029,0.007],
       [-0.019,-0.036,0.22,0.169,-0.145,0.024,-0.232,-0.357,-0.127,-0.368,0.029,1,0.184],
       [0.134,0.144,0.126,0.015,-0.208,-0.329,-0.424,-0.202,-0.076,-0.291,0.007,0.184,1]]
# Convert the nested list to an ndarray so it supports 2-D NumPy indexing.
pitprops=np.array(pitprops)
# Run the default heuristic on the Pitprops matrix for both subset sizes. The
# second number in each pair is the reference optimum used for the gap.
for k, opt_val in ((5, 3.406), (10, 4.173)):
    print("Running test with Pitprops data and k = {}".format(k))
    test(pitprops, k, opt_val)

Running test with Pitprops data and k = 5
0.9999999999999999
Found set:  [1, 0, 8, 9, 6]
with value  3.406154946789761
With gap  -4.5492304686148996e-05
in 0.24170374870300293 seconds
Running test with Pitprops data and k = 10
0.9999999999999999
Found set:  [1, 0, 6, 9, 8, 7, 5, 3, 2, 11]
with value  4.172637661588199
With gap  8.682923838983411e-05
in 1.2246873378753662 seconds


In [75]:
# Baseline (greedy) runs on the Pitprops matrix. They are labelled "control",
# like the other baseline cells, so their output can be told apart from the
# runs of the default heuristic on the same data.
print("Running control with Pitprops data and k = 5")
test(pitprops, 5, 3.406, method=easy_heuristic)
print("Running control with Pitprops data and k = 10")
test(pitprops, 10, 4.173, method=easy_heuristic)

Running test with Pitprops data and k = 5
Found set:  [0, 1, 8, 9, 6]
with value  3.406154946789761
With gap  -4.5492304686148996e-05
in 0.0031507015228271484 seconds

Running test with Pitprops data and k = 10
Found set:  [0, 1, 8, 9, 6, 7, 5, 3, 2, 11]
with value  4.172637661588203
With gap  8.682923838898275e-05
in 0.015695810317993164 seconds



## MiniBooNE dataset

In [6]:
import h5py
# Read the matrix into memory inside a context manager so the HDF5 file handle
# is closed as soon as the data has been copied into an ndarray.
with h5py.File("data/miniBoone.jld", "r") as f:
    miniboone = np.array(f['normMiniBooNE'])
# NOTE: these runs are very slow with the default heuristic; expect hours.
print("Running test with MiniBooNE data and k = 5")
test(miniboone, 5, 5.0000)
print("Running test with MiniBooNE data and k = 10")
test(miniboone, 10, 9.9999999)

Running test with MiniBooNE data and k = 5
Found set:  [14, 16, 38, 7, 5]
with value  4.9998734705325365
With gap  2.5305893492699738e-05
in 29087.641620874405 seconds

Running test with MiniBooNE data and k = 10


KeyboardInterrupt: 

In [141]:
# Sanity check: display the dimensions of the loaded MiniBooNE matrix.
np.shape(miniboone)

(50, 50)

In [76]:
# Same MiniBooNE experiments, but with the greedy baseline as the selection method.
for k, opt_val in ((5, 5.0000), (10, 9.9999999)):
    print("Running control with MiniBooNE data and k = {}".format(k))
    test(miniboone, k, opt_val, method=easy_heuristic)

Running control with MiniBooNE data and k = 5
Found set:  [0, 27, 8, 45, 3]
with value  4.9997141848049536
With gap  5.716303900928921e-05
in 0.00829005241394043 seconds

Running control with MiniBooNE data and k = 10
Found set:  [0, 27, 8, 45, 3, 44, 39, 28, 4, 18]
with value  9.999651361951768
With gap  3.485380517178079e-05
in 0.043898582458496094 seconds



## Communities dataset

In [142]:
import h5py
# Read the matrix into memory inside a context manager so the HDF5 file handle
# is closed as soon as the data has been copied into an ndarray.
with h5py.File("data/communities.jld", "r") as f:
    communities = np.array(f['normCommunities'])
print("Running test with Communities data and k = 5")
test(communities, 5, 4.86051)
print("Running test with Communities data and k = 10")
# The k = 10 run uses a larger redund than the default of 2.
test(communities, 10, 8.8236, redund = 30)

Running test with Communities data and k = 5
Found set:  [19, 12, 20, 21, 84]
with value  4.510362131817622
With gap  0.07203932677483996
in 12.135841369628906 seconds

Running test with Communities data and k = 10
Found set:  [12, 19, 20, 21, 84, 83, 85, 82, 79, 80]
with value  8.710459769953587
With gap  0.01282245682560559
in 720.5305912494659 seconds



In [77]:
# Greedy-baseline runs on the Communities matrix, spelled out with keyword
# arguments. Note that easy_heuristic accepts redund but never uses it.
print("Running control with Communities data and k = 5")
test(data=communities, k=5, opt_val=4.86051, method=easy_heuristic)
print("Running control with Communities data and k = 10")
test(data=communities, k=10, opt_val=8.8236, redund=10, method=easy_heuristic)

Running control with Communities data and k = 5
Found set:  [0, 10, 27, 49, 71]
with value  4.62910997419597
With gap  0.04760817811382548
in 0.0324246883392334 seconds

Running control with Communities data and k = 10
Found set:  [0, 10, 27, 49, 71, 89, 51, 90, 96, 99]
with value  7.186827179044622
With gap  0.18549943571279057
in 0.08469438552856445 seconds



## Arrhythmia dataset

In [139]:
import h5py
# Read the matrix into memory inside a context manager so the HDF5 file handle
# is closed as soon as the data has been copied into an ndarray.
with h5py.File("data/arrhythmia.jld", "r") as f:
    arrythmia = np.array(f['normArrhythmia'])
# Replace every NaN entry with 0 in one vectorized step (boolean-mask
# assignment) instead of visiting each element in a Python double loop.
arrythmia[np.isnan(arrythmia)] = 0
print("Running test with Arrythmia data and k = 5")
test(arrythmia, 5, 4.2321)

Running test with Arrythmia data and k = 5
Found set:  [212, 182, 202, 183, 213]
with value  4.181882024673721
With gap  0.011865970871737259
in 173.6758096218109 seconds



In [140]:
print("Running test with Arrythmia data and k = 10")
# redund is raised well above its default of 2 here: according to the original
# author's note, numerical errors in the interpolation step make additional
# sampling points necessary for this dataset.
test(data=arrythmia, k=10, opt_val=7.53938, redund=30)

Running test with Arrythmia data and k = 10
failed
failed
failed
failed


  x -= np.polyval(p, x) / np.polyval(dpdx, x)
  y = y * x + p[i]


failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
failed
Found set:  [171, 122, 72, 271, 191, 261, 161, 273, 263, 173]
with value  5.735772460467693
With gap  0.23922491498403145
in 8272.326264858246 seconds



In [105]:

# Greedy-baseline run for k = 5. It is labelled "control", like the other
# baseline cells, so its output can be told apart from the default heuristic's.
print("Running control with Arrythmia data and k = 5")
test(arrythmia, 5, 4.2321, method=easy_heuristic)

Running test with Arrythmia data and k = 5
Found set:  [0, 206, 176, 166, 212]
with value  3.6261694555215427
With gap  0.14317491185899608
in 0.05595827102661133 seconds



In [110]:
# Greedy-baseline run for k = 10. The label now states the k that is actually
# passed to test() and marks the run as a control.
print("Running control with Arrythmia data and k = 10")
test(arrythmia, 10, 7.53938, method=easy_heuristic)

Running test with Arrythmia data and k = 5
Found set:  [0, 206, 176, 166, 212, 182, 213, 183, 172, 173]
with value  7.06365571839418
With gap  0.0630985945271123
in 0.14673352241516113 seconds

