In [1]:
import knockpy
import pandas as pd
import numpy as np
import maldImportance

from os import path

rng: np.random.Generator = np.random.default_rng()
    
# Parameters
n: int = 1024 # Sample Size
p_numeric: int = 16 # Number of Variables
p_categorical: int = 16 # Number of Categorical Variables
rho: float = 0.5 # AR(1) Correlation


In [2]:
# -- Data Generation
# Use knockpy's Data Generating Process (dgp) to make the covariance matrix
# Read from disk if it is already there
X: pd.DataFrame
X_path: str = path.join(
    'data','examples','X_0.csv'
)

if not path.isfile( X_path ):
    Sigma: np.ndarray = knockpy.dgp.AR1( p = p_numeric, rho = rho)
    mu: np.ndarray = np.zeros( (p_numeric,), dtype = float ) # Mean 0 Data

    # Create Gaussian X Data
    X_numeric = pd.DataFrame(
        rng.multivariate_normal(
            mean = mu,
            cov = Sigma,
            size = (n,)
        )
    )
    
    # TODO: Categorical Data
    # Make a series of categorical data; each is the logit of two numeric X:
    #  logit( X_categorical[:,j] ) = 0.5*( X[:,j] + X[:,j+1 mod p])
    def _make_categorical( row ):
        assert len( row ) == p_numeric
        row_categorical: list[ int ] = [ 0 ] * p_categorical
        
        for j in range( p_categorical ):
            log_odds: float = 0.5*( row[j] + row[ j%p_numeric ] )
            odds: float = np.exp( log_odds )
            _p: float = odds/(odds + 1)
            _q: float = 1/( odds + 1 )
            row_categorical[ j ] = rng.choice(
                [0,1], p = ( _q, _p )
            )
        #
        return row_categorical
    #
    
    X_categorical_rows = [
        _make_categorical( X_numeric.iloc[i,:] ) for i in range( X_numeric.shape[0] )
    ]
    X_categorical: pd.DataFrame = pd.DataFrame(
        X_categorical_rows, dtype = 'category'
    )
        
    X: pd.DataFrame = pd.concat(
        [X_numeric, X_categorical],
        axis = 1, ignore_index = True
    )

    
    X.to_csv( X_path, index = False )
    
    del X
#/if not path.isfile( X_path )


X: pd.DataFrame = pd.read_csv(
    X_path
)
# Leave X as numeric, since it's already encoded one-hot for categories

Xk_path: str = path.join(
    'data','examples','Xk_0.csv'
)

if not path.isfile( Xk_path ):
    # Create Knockoff X Data with the second order method
    knockoffSampler: knockpy.knockoffs.KnockoffSampler = knockpy.knockoffs.GaussianSampler(
        X = X.to_numpy(),
        choldate_warning = False
    )

    Xk: pd.DataFrame = pd.DataFrame(
        knockoffSampler.sample_knockoffs()
    )

    # Quantize the categories
    for j in range( p_numeric, p_numeric + p_categorical ):
        Xk.iloc[:,j] = (Xk.iloc[:,j] >= 0.5).astype( int )
    #
    Xk = Xk.astype(
        {
            Xk.columns[j]: float  for j in range( p_numeric )
        } | {
            Xk.columns[j]: int for j in range( p_numeric, p_numeric + p_categorical )
        }
    )
    
    Xk.to_csv( Xk_path, index = False )
    del Xk
#

Xk: pd.DataFrame = pd.read_csv( Xk_path )


# Create y data with variables 0,4,8,12
# We have coefficients on the front of each term to make their effects on E[y] similar
# Every term is an even function of each X[:,j], which is marginally Normal(0,1),
#   so the net linear effect should be 0

y_path: str = path.join(
    'data', 'examples', 'y_0.csv'
)
relevant_indices: list[ int ]
first_layer_width: int
beta: float
if not path.isfile( y_path ):
    from math import pi
    beta = 1.0 # = 32/sqrt(n)
    if p_numeric + p_categorical == 16:
        y: pd.Series = \
            beta*(
                X.iloc[:,0]\
                + 1.42*np.cos( X.iloc[:,3]*2*pi )\
                - 2.86*np.sqrt( np.absolute(X.iloc[:,6]) )\
                + X.iloc[:,8]\
                - X.iloc[:,12]
            )\
            + rng.normal(
                loc = 0.0,
                scale = 1.0,
                size = (n,)
            )
        relevant_indices = [ 0,3,6,8,12 ]
        first_layer_width = 12
    #
    elif p_numeric + p_categorical == 32:
        y: pd.Series = \
            beta*(
                X.iloc[:,0]\
                + 1.42*np.cos( X.iloc[:,4]*2*pi )\
                - 2.86*np.sqrt( np.absolute(X.iloc[:,8]) )\
                + 0.7*X.iloc[:,12]**2\
                - X.iloc[:,16]\
                + X.iloc[:,22]\
                - X.iloc[:,28]
            )\
            + rng.normal(
                loc = 0.0,
                scale = 1.0,
                size = (n,)
            )
        relevant_indices = [ 0,4,8,12,16,22,28 ]
        first_layer_width = 24
    #
    else:
        raise Exception(
            "Unrecognized p_numeric={}, p_categorical={}".format(p_numeric, p_categorical)
        )
    #
    y.to_csv( y_path, index = False )
    del y
else:
    relevant_indices = [ 0,4,8,12,16,22,28 ]
    first_layer_width = 24
#/if not path.isfile( y_path )

print("# relevant_indices:")
print( relevant_indices )

y: pd.Series = pd.read_csv( y_path )
print("# y:")
print( y )

# Convert X, Xk to categories so we can appropriately use MALD importance
X = X.astype(
    {
        X.columns[j]: float  for j in range( p_numeric )
    } | {
        X.columns[j]: 'category' for j in range( p_numeric, p_numeric + p_categorical )
    }
)
Xk = Xk.astype(
    {
        Xk.columns[j]: float  for j in range( p_numeric )
    } | {
        Xk.columns[j]: 'category' for j in range( p_numeric, p_numeric + p_categorical )
    }
)

# relevant_indices:
[0, 4, 8, 12, 16, 22, 28]
# y:
             0
0    -3.504477
1    -4.214169
2    -4.810606
3    -6.091500
4    -1.421410
...        ...
1019 -5.893012
1020 -3.233596
1021 -4.391070
1022 -2.879617
1023 -2.845034

[1024 rows x 1 columns]


In [3]:
# Calculate MALD Importances
importances: np.ndarray = maldImportance.nnImportance.importances(
    X = X,
    Xk = Xk,
    y = y,
    local_grad_method = 'auto_diff',
    exponent = 2.0,
    epochs = 200,
    dense_activation = 'relu',
    first_layer_width = first_layer_width,
    layers = 2,
    layer_shrink_factor = 0.75,
    learning_rate = 0.05
)

2025-04-29 21:13:47.885029: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 933us/step - loss: 7.6424
Epoch 2/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 842us/step - loss: 3.9848
Epoch 3/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 811us/step - loss: 3.4818
Epoch 4/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 741us/step - loss: 3.6207
Epoch 5/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - loss: 2.7686
Epoch 6/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 679us/step - loss: 2.9041
Epoch 7/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 754us/step - loss: 2.4176
Epoch 8/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752us/step - loss: 2.1115
Epoch 9/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 628us/step - loss: 2.2280
Epoch 10/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 766us/step - loss: 0.8295
Epoch 81/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 641us/step - loss: 0.8104
Epoch 82/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 709us/step - loss: 0.9151
Epoch 83/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 707us/step - loss: 0.8984
Epoch 84/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 642us/step - loss: 0.9500
Epoch 85/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - loss: 0.8503
Epoch 86/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 711us/step - loss: 0.8570
Epoch 87/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 719us/step - loss: 0.9506
Epoch 88/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 645us/step - loss: 0.9135
Epoch 89/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 641us/ste

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 694us/step - loss: 0.8255
Epoch 160/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 666us/step - loss: 0.7418
Epoch 161/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667us/step - loss: 0.6977
Epoch 162/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735us/step - loss: 0.7070
Epoch 163/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - loss: 0.7192
Epoch 164/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 630us/step - loss: 0.6299
Epoch 165/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 649us/step - loss: 0.7492
Epoch 166/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - loss: 0.7021
Epoch 167/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 644us/step - loss: 0.6712
Epoch 168/200
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 611us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 593us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 579us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 574us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 593us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 620us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 595us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 579us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 613us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 591us/step
[1m32/32[0m [32m━━━━━━

In [6]:
# Calculate W Statistics and perform knockoff filter
W: np.ndarray = maldImportance.importance.wFromImportances(
    importances
)

print("MALD (X,Xk) => y:")
print( W )

# Perform variable selection

fdr_target: float = 0.2
threshold: float = knockpy.knockoff_stats.data_dependent_threshhold(
    W = W,
    fdr = fdr_target
)
    
selected: np.ndarray = ( W >= threshold )

power: float = np.sum( selected[ relevant_indices] )/len( relevant_indices )
fdr: float
if np.sum( selected ) <= 0:
    fdr = 0.0
#
else:
    fdr = ( np.sum( selected ) - np.sum( selected[relevant_indices] ) )/np.sum( selected )
#

print(
    "Selected = {}\nPower = {}\nfdr = {} (fdr_target = {})".format(
        int( np.sum( selected ) ),
        power,
        fdr,
        fdr_target
    )
)

MALD (X,Xk) => y:
[ 1.51510243  0.13195295 -0.09713495  0.16261446 -0.1203826   0.03651478
  0.21597201 -0.27521882  2.97319607 -0.25843681 -0.07061121  0.1854294
  1.47783754  0.11737907 -0.28900017  0.11593868  1.38019011 -0.13774325
  0.50665448 -0.16988017  0.00944868 -0.33528335  0.8973365  -0.01619484
 -0.02928708  0.1446139  -0.16588513  0.22647415  0.88356189 -0.28881973
  0.17967259 -0.48294741]
Selected = 7
Power = 0.8571428571428571
fdr = 0.14285714285714285 (fdr_target = 0.2)


In [5]:
# Compare with LASSO
from sklearn.linear_model import LassoCV

lasso_model: LassoCV = LassoCV( max_iter = 1000 )
lasso_model.fit(
    X = pd.concat( [X, Xk], axis = 1, ignore_index=True ),
    y = y.to_numpy().reshape( (n,) )
)

if True:
    lasso_coefficients: np.ndarray = lasso_model.coef_
    W_lasso: np.ndarray = np.abs(
        lasso_coefficients[:p_numeric+p_categorical]
    ) - np.abs(
        lasso_coefficients[p_numeric+p_categorical:]
    )

    print( "coefficient method:")
    print( W_lasso )
    del lasso_coefficients
    del W_lasso
#

lasso_coefficients: np.ndarray = maldImportance.importance.importancesFromModel(
    model = lasso_model,
    X = X,
    Xk = Xk,
    y = y.to_numpy(),
    local_grad_method = 'bandwidth',
    bandwidth = 0.5,
    exponent = 1
)
W_lasso: np.ndarray = np.abs(
    lasso_coefficients[:p_numeric+p_categorical]
) - np.abs(
    lasso_coefficients[p_numeric+p_categorical:]
)

    
threshold_lasso: float = knockpy.knockoff_stats.data_dependent_threshhold(
    W = W_lasso,
    fdr = fdr_target
)
    
selected_lasso: np.ndarray = ( W_lasso >= threshold )
power_lasso: float = np.sum( selected_lasso[ relevant_indices] )/len( relevant_indices )
fdr_lasso: float
if np.sum( selected_lasso ) <= 0:
    fdr_lasso = 0.0
#
else:
    fdr_lasso = ( np.sum( selected_lasso ) - np.sum( selected_lasso[relevant_indices] ) )/np.sum( selected_lasso )
#

print( "mald method")
print( W_lasso )
print(
    "Selected = {}, Power = {}, fdr = {} (fdr_target={})".format(
        int( np.sum( selected_lasso ) ),
        power_lasso,
        fdr_lasso,
        fdr_target
    )
)

coefficient method:
[ 0.81810977  0.          0.00273593  0.         -0.04396628  0.
 -0.00200645  0.          0.          0.01079832  0.          0.
  0.03149096  0.          0.01570555  0.03038764  0.69021387  0.
 -0.0170775   0.          0.          0.          0.97487757  0.
  0.          0.00912649  0.         -0.05963511  0.66833067  0.
  0.         -0.05588189]
mald method
[ 0.81810977  0.          0.00273593  0.         -0.04396628  0.
 -0.00200645  0.          0.          0.01079832  0.          0.
  0.03149096  0.          0.01570555  0.03038764  0.69021387  0.
 -0.0170775   0.          0.          0.          0.97487757  0.
  0.          0.00912649  0.         -0.05963511  0.66833067  0.
  0.         -0.05588189]
Selected = 4, Power = 0.5714285714285714, fdr = 0.0 (fdr_target=0.2)
