In [1]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from dcorr import DCorr
from power import power
from mgcpy.benchmarks.simulations import *
from mgc import MGC
from rv_corr import RVCorr
import pickle
from joblib import Parallel, delayed
import multiprocessing as mp
from hhg import HHG
#from mgcpy.independence_tests.kendall_spearman import KendallSpearman

In [2]:
def compute_distance_matrix(data_matrix_X, data_matrix_Y):
    '''
    Build the pairwise Euclidean distance matrices for X and for Y.

    :param data_matrix_X: observations for X, handed to scipy's pdist
    :param data_matrix_Y: observations for Y, handed to scipy's pdist
    :return: tuple (dist_mtx_X, dist_mtx_Y) of square distance matrices
    '''
    # pdist returns the condensed distances; squareform expands them to a full matrix
    return tuple(
        squareform(pdist(samples, metric='euclidean'))
        for samples in (data_matrix_X, data_matrix_Y)
    )

In [3]:
def power_vs_sample_size(independence_test, simulation_type, sim_name):
    '''
    Estimate the power of one independence test on one simulation for the
    sample sizes 5, 10, 15, ..., 95, 100.

    Every call to power uses theta=0, num_dimensions=2 and noise=1.

    :param independence_test: an independence_test object, forwarded to power
    :param simulation_type: a simulation function from mgcpy.benchmarks.simulations
    :param sim_name: name of the simulation; accepted but not used in this function
    :return: numpy array holding one estimated power per sample size
    '''
    sizes = range(5, 101, 5)
    powers = np.zeros(len(sizes))

    for slot, n_samples in enumerate(sizes):
        powers[slot] = power(
            independence_test,
            simulation_type,
            num_samples=n_samples,
            theta=0,
            num_dimensions=2,
            noise=1,
        )

    return powers

In [4]:
def compute_all_power_vs_sample_size(simulation_type, sim_name):
    '''
    Compute the power-vs-sample-size curve of each independence test for one simulation.

    The test objects are created with their data matrices set to np.nan, because
    in power computation each repeat involves generating new samples.

    :param simulation_type: a simulation function from mgcpy.benchmarks.simulations
    :param sim_name: name of the simulation, forwarded to power_vs_sample_size
    :return: dict mapping the test name ('MGC', 'MCorr', 'DCorr', 'Mantel') to the
             array returned by power_vs_sample_size for that test
    '''
    # Constructor arguments shared by every test object.
    placeholder_args = dict(data_matrix_X=np.nan, data_matrix_Y=np.nan,
                            compute_distance_matrix=compute_distance_matrix)

    # Only the tests that are actually evaluated are instantiated.
    independence_tests = {
        'MGC': MGC(**placeholder_args),
        'MCorr': DCorr(corr_type='mcorr', **placeholder_args),
        'DCorr': DCorr(corr_type='dcorr', **placeholder_args),
        'Mantel': DCorr(corr_type='mantel', **placeholder_args),
    }

    power_results = {}
    for name, test in independence_tests.items():
        # named test_power so the imported power() function is not shadowed
        test_power = power_vs_sample_size(test, simulation_type, sim_name)
        power_results[name] = test_power
        print('{} finished'.format(name))

    return power_results

In [5]:
# Simulation name -> generator function. The insertion order is the order in
# which the driver loops below process (and report) the simulations.
# NOTE(review): 'sine_4pi'/'sine_16pi' both map to sin_sim and 'circle'/'ellipse'
# both map to circle_sim, and nothing visible in this notebook passes the name or
# a distinguishing argument on to power() -- confirm that these pairs really
# produce different simulations.
simulations = {
    'linear': linear_sim,
    'exponential': exp_sim,
    'cubic': cub_sim,
    'joint_normal': joint_sim,
    'sine_4pi': sin_sim,
    'sine_16pi': sin_sim,
    'multi_noise': multi_noise_sim,
    'step': step_sim,
    'spiral': spiral_sim,
    'circle': circle_sim,
    'ellipse': circle_sim,
    'diamond': square_sim,
    'log': log_sim,
    'quadratic': quad_sim,
    'w_shape': w_sim,
    'two_parabolas': two_parab_sim,
    'fourth_root': root_sim,
    'multi_indept': multi_indep_sim,
}

In [6]:
# Estimate the sample-size power curves and persist them, one pickle file per simulation.
for sim_name, sim_func in simulations.items():
    power_results = compute_all_power_vs_sample_size(sim_func, sim_name)
    # the context manager closes the file even if pickling raises
    with open('power_curve_sample_size_{}.pkl'.format(sim_name), 'wb') as out_file:
        pickle.dump(power_results, out_file)
    print('{} finished'.format(sim_name))

MGC finished
MCorr finished
DCorr finished
Mantel finished
linear finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
exponential finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
cubic finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
joint_normal finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
sine_4pi finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
sine_16pi finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
multi_noise finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
step finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
spiral finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
circle finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
ellipse finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
diamond finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
log finished
MGC fini

In [7]:
def power_vs_angle(independence_test, simulation_type, test_range):
    '''
    Generate power of an independence test given a simulation for a range of angles

    The angle theta passed to power takes the test_range + 1 evenly spaced values
    0, 180/test_range, ..., 180; every call uses num_samples=100.

    :param independence_test: an independence_test object
    :param simulation_type: a simulation function from mgcpy.benchmarks.simulations
    :param test_range: number of equal steps the interval [0, 180] is split into
    :return: numpy array of length test_range + 1 with the power for each angle
    '''
    # With test_range == 0 the only angle is 0; guard the division so that this
    # single-angle case does not raise ZeroDivisionError.
    step = 180 / test_range if test_range else 0.0

    estimated_power = np.zeros(test_range + 1)
    for t in range(test_range + 1):
        estimated_power[t] = power(independence_test, simulation_type, num_samples=100, theta=t * step)
    return estimated_power

In [8]:
def compute_all_power_vs_angle(simulation_type, sim_name, test_range=20):
    '''
    Compute the power-vs-angle curve of each independence test for one simulation.

    The test objects are created with their data matrices set to np.nan, because
    in power computation each repeat involves generating new samples.

    :param simulation_type: a simulation function from mgcpy.benchmarks.simulations
    :param sim_name: name of the simulation; accepted but not used in this function
    :param test_range: number of angle steps forwarded to power_vs_angle (default 20)
    :return: dict mapping the test name ('MGC', 'MCorr', 'DCorr', 'Mantel',
             'RV Corr', 'CCA') to the array returned by power_vs_angle for that test
    '''
    # Constructor arguments shared by every test object.
    placeholder_args = dict(data_matrix_X=np.nan, data_matrix_Y=np.nan,
                            compute_distance_matrix=compute_distance_matrix)

    # Only the tests that are actually evaluated are instantiated.
    independence_tests = {
        'MGC': MGC(**placeholder_args),
        'MCorr': DCorr(corr_type='mcorr', **placeholder_args),
        'DCorr': DCorr(corr_type='dcorr', **placeholder_args),
        'Mantel': DCorr(corr_type='mantel', **placeholder_args),
        'RV Corr': RVCorr(**placeholder_args),
        'CCA': RVCorr(which_test='cca', **placeholder_args),
    }

    power_results = {}
    for name, test in independence_tests.items():
        # named test_power so the imported power() function is not shadowed
        test_power = power_vs_angle(test, simulation_type, test_range)
        power_results[name] = test_power
        print('{} finished'.format(name))

    return power_results

In [9]:
# Estimate the angle power curves and persist them, one pickle file per simulation.
for sim_name, sim_func in simulations.items():
    power_results = compute_all_power_vs_angle(sim_func, sim_name)
    # the context manager guarantees the file is flushed and closed
    with open('power_curve_angle_{}.pkl'.format(sim_name), 'wb') as out_file:
        pickle.dump(power_results, out_file)
    print('{} finished'.format(sim_name))

MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
linear finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
exponential finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
cubic finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
joint_normal finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
sine_4pi finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
sine_16pi finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
multi_noise finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
step finished
MGC finished
MCorr finished
DCorr finished
Mantel finished
RV Corr finished
CCA finished
spiral finished
MGC finished
MCorr finished
DCorr fin

In [10]:
def compute_all_power(simulation_type, test_range, param):
    '''
    Compute the power curve of each independence test for one simulation,
    sweeping either the angle or the sample size.

    The test objects are created with their data matrices set to np.nan, because
    in power computation each repeat involves generating new samples.

    :param simulation_type: a simulation function from mgcpy.benchmarks.simulations
    :param test_range: number of angle steps forwarded to power_vs_angle; ignored
                       when param == 'size', because power_vs_sample_size uses a
                       fixed grid of sample sizes
    :param param: 'angle' to use power_vs_angle, 'size' to use power_vs_sample_size
    :return: dict mapping the test name ('MGC', 'MCorr', 'DCorr', 'Mantel',
             'RV Corr') to the power array of that test
    :raises ValueError: if param is neither 'angle' nor 'size'
    '''
    # Fail loudly instead of silently returning an empty dict for an unknown mode.
    if param not in ('angle', 'size'):
        raise ValueError("param must be 'angle' or 'size', got {!r}".format(param))

    # Constructor arguments shared by every test object.
    placeholder_args = dict(data_matrix_X=np.nan, data_matrix_Y=np.nan,
                            compute_distance_matrix=compute_distance_matrix)

    independence_tests = {
        'MGC': MGC(**placeholder_args),
        'MCorr': DCorr(corr_type='mcorr', **placeholder_args),
        'DCorr': DCorr(corr_type='dcorr', **placeholder_args),
        'Mantel': DCorr(corr_type='mantel', **placeholder_args),
        'RV Corr': RVCorr(**placeholder_args),
    }

    power_results = {}
    for name, test in independence_tests.items():
        if param == 'angle':
            test_power = power_vs_angle(test, simulation_type, test_range)
        else:
            # The function defined in this notebook is power_vs_sample_size (the
            # previous spelling power_vs_samplesize raised NameError). Its third
            # argument is a simulation name that it does not use, so None is passed.
            test_power = power_vs_sample_size(test, simulation_type, None)
        power_results[name] = test_power
        print('{} finished'.format(name))

    return power_results

In [11]:
def plot_power_curve_angle(power_results, simulation_name):
    '''
    Plot the power of each independence test relative to mgc, one point per angle step

    :param power_results: dict mapping test name to an array of powers; must
                          contain the key 'MGC', whose curve is subtracted from
                          every curve
    :param simulation_name: simulation name inserted into the plot title
    '''
    mgc_power = power_results['MGC']
    # Explicit 1-based x positions that match the ticks and x-limits set below.
    # Without them plt.plot places the points at 0..n-1, so the first point falls
    # outside the axis range and every other point sits one tick off.
    x_axis = np.arange(1, mgc_power.shape[0] + 1)

    for name, test_power in power_results.items():
        plt.plot(x_axis, test_power - mgc_power, label=name)

    plt.xlabel('Angle')
    plt.ylabel('Power')
    plt.legend()
    plt.gca().set_xlim(1, mgc_power.shape[0] + 1)
    plt.xticks(x_axis)
    plt.title('Power \n Data: {} Simulation, 100 samples, noise=0'.format(simulation_name))
    plt.show()

In [12]:
def plot_power_curve_size(power_results, simulation_name):
    '''
    Plot the power of each independence test relative to mgc, one point per sample-size step

    The x axis shows the 1-based index of the step, not the sample size itself.

    :param power_results: dict mapping test name to an array of powers; must
                          contain the key 'MGC', whose curve is subtracted from
                          every curve
    :param simulation_name: simulation name inserted into the plot title
    '''
    mgc_power = power_results['MGC']
    # Explicit 1-based x positions that match the ticks and x-limits set below.
    # Without them plt.plot places the points at 0..n-1, so the first point falls
    # outside the axis range and every other point sits one tick off.
    x_axis = np.arange(1, mgc_power.shape[0] + 1)

    for name, test_power in power_results.items():
        plt.plot(x_axis, test_power - mgc_power, label=name)

    plt.xlabel('Sample Size')
    plt.ylabel('Power')
    plt.legend()
    plt.gca().set_xlim(1, mgc_power.shape[0] + 1)
    plt.xticks(x_axis)
    plt.title('Power \n Data: {} Simulation, theta=0, noise=0'.format(simulation_name))
    plt.show()

In [13]:
def plot_power_curve_sample_size(power_results, simulation_name):
    '''
    Draw, on a new figure, the power of every independence test minus the power
    of MGC against the sample sizes 5, 10, ..., 100.

    The figure is created and decorated but plt.show() is not called here.

    :param power_results: dict mapping test name to an array of powers; must
                          contain the key 'MGC'
    :param simulation_name: simulation name inserted into the plot title
    '''
    baseline = power_results['MGC']
    sizes = list(range(5, 101, 5))
    _, axes = plt.subplots()

    for test_name in power_results:
        axes.plot(sizes, power_results[test_name] - baseline, label=test_name)

    axes.set_xlabel('Sample Size')
    axes.set_ylabel('Power Relative to MGC')
    axes.legend()
    axes.set_ylim(-1, 1)
    axes.set_title('Power \n Data: {} Simulation'.format(simulation_name))

In [14]:
def plot_all_curves(which_type):
    '''
    Draw a 3x6 grid of relative power curves (test power minus MGC power), one
    panel per simulation, and save the figure to disk.

    :param which_type: 'angle' reads the power_curve_angle_<name>.pkl files and
                       saves the figure as 'all_curves_angle'; any other value
                       reads the power_curve_sample_size_<name>.pkl files and
                       saves the figure as 'all_curves_sample_size'
    '''
    simulation_names = ['linear', 'exponential', 'cubic', 'joint_normal', 'step', 'quadratic',
                        'w_shape', 'spiral', 'log', 'fourth_root', 'sine_4pi', 'sine_16pi',
                        'two_parabolas', 'circle', 'ellipse', 'diamond', 'multi_noise',
                        'multi_indept']
    is_angle = which_type == 'angle'
    file_pattern = 'power_curve_angle_{}.pkl' if is_angle else 'power_curve_sample_size_{}.pkl'

    fig, ax = plt.subplots(nrows=3, ncols=6, figsize=(14, 12))
    # ax.flat walks the grid row by row, i.e. in the order of simulation_names
    for index, (sim_name, panel) in enumerate(zip(simulation_names, ax.flat)):
        # per-panel formatting is applied once, not once per plotted curve
        panel.set_ylim(-1, 1)
        panel.set_yticks([])
        panel.set_title(sim_name)

        # in angle mode the first three panels are titled but left without curves
        if is_angle and index < 3:
            continue

        # NOTE: pickle.load can execute arbitrary code; only load files written
        # by this notebook. The context manager closes the file even on error.
        with open(file_pattern.format(sim_name), 'rb') as in_file:
            power_results = pickle.load(in_file)

        mgc_power = power_results['MGC']
        if is_angle:
            x_axis = np.arange(1, mgc_power.shape[0] + 1)
        else:
            x_axis = list(range(5, 101, 5))

        for name, test_power in power_results.items():
            panel.plot(x_axis, test_power - mgc_power, label=name)

    # plt.legend() acts on pyplot's current axes only, not on every panel
    plt.legend()
    plt.subplots_adjust(hspace=.75)
    if is_angle:
        fig.suptitle('Power Relative to MGC of Varying Angles')
        plt.savefig('all_curves_angle')
    else:
        # The count is taken from the list so the title matches the panels drawn.
        # NOTE(review): power_vs_sample_size in this notebook calls power with
        # num_dimensions=2 -- confirm that "1-Dimensional" is still accurate.
        fig.suptitle('Power Relative to MGC for {} Simulated 1-Dimensional Settings'.format(len(simulation_names)))
        plt.savefig('all_curves_sample_size')

In [15]:
# Render and save the grid of angle power curves (reads the power_curve_angle_*.pkl files).
plot_all_curves('angle')