In [1]:
import numpy as np
import pyGPs
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel, laplacian_kernel

In [80]:
def compute_gp_regression(X_train, y_train, X_test):
    """Fit a pyGPs GP regressor on the training data and predict at X_test.

    The model uses a constant mean and an RBF covariance as prior.  Its
    hyperparameters are optimized on ``(X_train, y_train)`` and the resulting
    negative log marginal likelihood (``nlZ``), rounded to three decimals,
    is printed.

    Returns:
        The first of the five values produced by ``predict`` (presumably the
        predictive mean -- confirm against the pyGPs documentation).
    """
    gp = pyGPs.GPR()
    gp.setPrior(mean=pyGPs.mean.Const(), kernel=pyGPs.cov.RBF())
    gp.optimize(X_train, y_train)
    print('Optimized negative log marginal likelihood:', round(gp.nlZ, 3))
    # predict() yields five values; only the first one is handed back.
    predicted, _, _, _, _ = gp.predict(X_test)
    return predicted

def HSIC_d(X, Y, kernel='exponential'):
    """Empirical HSIC dependence measure between two 1-D samples.

    Computes ``trace(K H L H) / (n - 1)**2`` where ``K`` and ``L`` are the
    kernel (Gram) matrices of ``X`` and ``Y`` and ``H = I - (1/n) 11^T`` is
    the centering matrix.

    Args:
        X, Y: array-likes holding the same number of scalar observations.
            Each is flattened into a column vector of shape ``(n, 1)``.
        kernel: ``'exponential'`` (uses sklearn's ``rbf_kernel``),
            ``'laplacian'`` (uses sklearn's ``laplacian_kernel``), or a
            callable mapping an ``(n, 1)`` array to an ``(n, n)`` Gram matrix.

    Returns:
        The HSIC estimate as a float.

    Raises:
        ValueError: if ``kernel`` is not recognised, if ``X`` and ``Y`` hold
            a different number of observations, or if there are fewer than
            two observations (the ``(n - 1)**-2`` factor is undefined).
    """
    X = np.asarray(X).reshape(-1, 1)
    Y = np.asarray(Y).reshape(-1, 1)

    # Take n from the reshaped data so it always matches the Gram matrix size.
    n = X.shape[0]
    if Y.shape[0] != n:
        raise ValueError(
            "X and Y must hold the same number of observations "
            "(got %d and %d)" % (n, Y.shape[0]))
    if n < 2:
        raise ValueError("HSIC needs at least two observations (got %d)" % n)

    if callable(kernel):
        apply_kernel = kernel
    elif kernel == 'exponential':
        apply_kernel = rbf_kernel
    elif kernel == 'laplacian':
        apply_kernel = laplacian_kernel
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `apply_kernel`.
        raise ValueError(
            "Unknown kernel %r: expected 'exponential', 'laplacian' "
            "or a callable" % (kernel,))

    K = apply_kernel(X)
    L = apply_kernel(Y)

    # Centering matrix.
    H = np.eye(n) - np.ones((n, n)) * (1.0 / n)
    return (float(n - 1) ** -2) * np.trace(np.dot(np.dot(np.dot(K, H), L), H))


def ANM_algorithm(X, y, verbose=True):
    """Score every column of X against y with an additive-noise-model test.

    The data are split once into train/test parts (33% test, fixed seed 42).
    For each column ``x`` of ``X`` a GP regression is fitted in both
    directions (``x -> y`` and ``y -> x``) on the training part, residuals
    are computed on the test part, and the HSIC dependence between each
    regressor and the residuals of its direction is measured.  The score of
    a column is ``HSIC(x, y - f(x)) - HSIC(y, x - g(y))``.

    The columns are then printed in decreasing order of score.  Columns with
    identical scores are all reported, in column order.

    Args:
        X: feature matrix of shape ``(n_samples, n_features)``; a 1-D array
            is treated as a single feature.
        y: target vector of length ``n_samples``.
        verbose: when True (default) also print the intermediate array
            shapes for each column.

    Returns:
        None; the ranking is printed.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    y = np.asarray(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    # (column index, HSIC difference) pairs.  A list is used rather than a
    # dict keyed by the score so that equal scores cannot overwrite each other.
    scores = []

    for col in range(X_train.shape[1]):
        x_train_column = X_train[:, col]
        x_test_column = X_test[:, col]

        if verbose:
            print('%s %s %s' % (x_train_column.shape, y_train.shape,
                                x_test_column.shape))

        # Fit both directions: x -> y and y -> x.
        y_pred = compute_gp_regression(x_train_column, y_train, x_test_column)
        x_pred = compute_gp_regression(y_train, x_train_column, y_test)
        if verbose:
            print('y_pred shape %s' % (np.shape(y_pred),))
            print('x_pred shape %s' % (np.shape(x_pred),))

        # NOTE: open question from the original author -- should these
        # residuals be taken in absolute value?
        y_residuals = y_test - np.ravel(y_pred)
        x_residuals = x_test_column - np.ravel(x_pred)

        if verbose:
            print(y_residuals.shape)
            print(x_test_column.shape)
            print(x_residuals.shape)
            print(y_test.shape)

        HSIC_x_to_y = HSIC_d(x_test_column, y_residuals)
        HSIC_y_to_x = HSIC_d(y_test, x_residuals)

        scores.append((col, HSIC_x_to_y - HSIC_y_to_x))

    # sorted() is stable, so tied columns keep their original column order.
    ranked = sorted(scores, key=lambda item: item[1], reverse=True)
    for col, diff_HSIC in ranked:
        # The value printed is the HSIC difference, not a calibrated
        # probability; the message text is kept for output compatibility.
        print("The probability of column: " + str(col) + " is: " + str(diff_HSIC))
        
    
    

In [60]:
# Load one (X, Y) pair from a space-separated text file that has no header
# row, then label its two columns 'X' and 'Y'.
import pandas as pd
pairs = pd.read_csv('data/pair0039.txt', sep=' ', header=None)
pairs.columns = ['X', 'Y']

In [68]:
# Column 0 becomes a single-feature 2-D matrix of shape (n, 1), because
# ANM_algorithm iterates over the columns of its first argument;
# column 1 stays a flat 1-D vector.
x = np.array(pairs)[:,0].reshape(-1,1)
y = np.array(pairs)[:,1]

In [69]:
# Run the ANM scoring on the single-feature pair data.
ANM_algorithm(x,y)

Number of line searches 40
('Optimized negative log marginal likelihood:', 2435.659)
Number of line searches 40
('Optimized negative log marginal likelihood:', 2435.659)
y_pred shape (131, 1)
x_pred shape (131, 1)
(131,)
(131,)
(131,)
(131,)
(131,)
The probability of column: 0 is: -0.00114255125667


In [51]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release -- confirm the pinned
# version before re-running.
from sklearn.datasets import load_boston
boston = load_boston()

In [91]:
# Names of the feature columns in the dataset.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='|S7')

In [90]:
# Description text attached to the dataset object.
boston.DESCR



In [74]:
# Feature matrix and target vector used for the multi-column ANM run.
X = boston.data
y = boston.target

In [75]:
# Sanity check: dimensions of the feature matrix.
X.shape

(506, 13)

In [76]:
# Sanity check: length of the target vector.
y.shape

(506,)

In [81]:
# Score every feature column of X against the target y.
ANM_algorithm(X, y)

(339,) (339,) (167,)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1208.048)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1111.66)
y_pred shape (167, 1)
x_pred shape (167, 1)
(167,)
(167,)
(167,)
(167,)
(339,) (339,) (167,)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1227.439)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1510.418)
y_pred shape (167, 1)
x_pred shape (167, 1)
(167,)
(167,)
(167,)
(167,)
(339,) (339,) (167,)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1189.604)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1067.432)
y_pred shape (167, 1)
x_pred shape (167, 1)
(167,)
(167,)
(167,)
(167,)
(339,) (339,) (167,)
Number of line searches 40
('Optimized negative log marginal likelihood:', 1236.236)
Number of line searches 40
('Optimized negative log marginal likelihood:', 25.543)
y_pred shape (167, 1)
x

In [85]:
# Inspect the raw values of feature column 12 (the last of the 13 columns).
X[:,12]

array([  4.98,   9.14,   4.03,   2.94,   5.33,   5.21,  12.43,  19.15,
        29.93,  17.1 ,  20.45,  13.27,  15.71,   8.26,  10.26,   8.47,
         6.58,  14.67,  11.69,  11.28,  21.02,  13.83,  18.72,  19.88,
        16.3 ,  16.51,  14.81,  17.28,  12.8 ,  11.98,  22.6 ,  13.04,
        27.71,  18.35,  20.34,   9.68,  11.41,   8.77,  10.13,   4.32,
         1.98,   4.84,   5.81,   7.44,   9.55,  10.21,  14.15,  18.8 ,
        30.81,  16.2 ,  13.45,   9.43,   5.28,   8.43,  14.8 ,   4.81,
         5.77,   3.95,   6.86,   9.22,  13.15,  14.44,   6.73,   9.5 ,
         8.05,   4.67,  10.24,   8.1 ,  13.09,   8.79,   6.72,   9.88,
         5.52,   7.54,   6.78,   8.94,  11.97,  10.27,  12.34,   9.1 ,
         5.29,   7.22,   6.72,   7.51,   9.62,   6.53,  12.86,   8.44,
         5.5 ,   5.7 ,   8.81,   8.2 ,   8.16,   6.21,  10.59,   6.65,
        11.34,   4.21,   3.57,   6.19,   9.42,   7.67,  10.63,  13.44,
        12.33,  16.47,  18.66,  14.09,  12.27,  15.55,  13.  ,  10.16,
      