# t-Test using Samples drawn from the Error Sampler

In [1]:
import numpy as np
import pandas as pd
import sys
from scipy.spatial import distance

sys.path.append('/home/nico/VSCodeRepos/SigMA')
from NoiseRemoval.xd_special import XDSingleCluster
from miscellaneous.covariance_trafo_sky2gal import transform_covariance_shper2gal
from miscellaneous.error_sampler import ErrorSampler

# Data Generation

In [2]:
# Provided by Sebastian Ratzenböck
data_gaia = pd.read_csv('simulated_data/data_orion_focus.csv')

# Simulate a Gaussian cluster in 6D, centred on the per-column median of the reference data
cols = ['X', 'Y', 'Z', 'U', 'V', 'W']
mu_true = data_gaia[cols].median().values  # is this supposed to be median or mean?
C_true = np.diag([25, 25, 25, 4, 4, 4])
N = 1000

data = np.random.multivariate_normal(mu_true, C_true, N)
# Pass the flat list of names: `columns=[cols]` (a list inside a list) makes pandas
# build a one-level MultiIndex, so every single-column selection comes back 2-D.
df = pd.DataFrame(data, columns=cols)

# Error and correlation columns copied over from the reference data set
cols2match = [
    'ra_error', 'dec_error', 'parallax_error', 'pmra_error', 'pmdec_error', 'radial_velocity_error',
    'ra_dec_corr', 'ra_parallax_corr', 'ra_pmra_corr', 'ra_pmdec_corr', 'dec_parallax_corr', 'dec_pmra_corr', 'dec_pmdec_corr',
    'parallax_pmra_corr', 'parallax_pmdec_corr', 'pmra_pmdec_corr'
]

# Convert the simulated Cartesian phase-space coordinates to the six spherical observables
ra, dec, plx, pmra, pmdec, rv = ErrorSampler().cart2spher(df[cols].values)
df['ra'] = ra
df['dec'] = dec
df['parallax'] = plx
df['pmra'] = pmra
df['pmdec'] = pmdec
df['radial_velocity'] = rv

# Attach error/correlation rows drawn with replacement from the reference data
df[cols2match] = data_gaia[cols2match].sample(n=N, replace=True).values
# Missing radial-velocity errors are replaced by 1e3
# (presumably a deliberately huge uncertainty -- TODO confirm the intended value)
df['radial_velocity_error'] = df['radial_velocity_error'].fillna(1e3)

In [3]:
# Randomly permute the rows, then cut the shuffled frame at its midpoint.
# Note the naming: the first half is `df_right`, the remainder is `df_left`.
df = df.sample(frac=1).reset_index(drop=True)
df_right, df_left = df.iloc[:len(df) // 2], df.iloc[len(df) // 2:]

# Perform Test

In [8]:
# Build the error sampler for a data set and fit the XD model to its resampled velocities
def get_xd(data_):
    """Resample ``data_`` from its errors and fit ``XDSingleCluster`` to the velocities.

    Relies on the module-level ``cols`` list for the Cartesian column names.

    Parameters
    ----------
    data_
        Passed unchanged to ``ErrorSampler``; presumably a DataFrame holding the
        spherical observables with their error and correlation columns
        -- TODO confirm against ``ErrorSampler``.

    Returns
    -------
    tuple
        ``(xd, err_sampler)``: the result of ``XDSingleCluster(...).fit`` on the
        resampled ``U, V, W`` values, and the ``ErrorSampler`` built from
        ``data_`` (after ``build_covariance_matrix()`` has been called on it).
    """
    err_sampler = ErrorSampler(data_)
    err_sampler.build_covariance_matrix()
    # Create sample from errors
    data_new = pd.DataFrame(err_sampler.spher2cart(err_sampler.new_sample()), columns=cols)
    c_vel = ['U', 'V', 'W']
    X = data_new[c_vel]
    # Keep only the lower-right block (indices 3 and up on the last two axes) of every
    # covariance matrix -- presumably the proper-motion / radial-velocity part; TODO confirm
    C = err_sampler.C[:, 3:, 3:]
    # Sky position and parallax of the resampled points, needed by the transformation below
    ra, dec, plx, _, _, _ = ErrorSampler().cart2spher(data_new[cols].values)
    # Compute covariance matrix in Galactic coordinates
    C_uvw = transform_covariance_shper2gal(ra, dec, plx, C)
    xd = XDSingleCluster(max_iter=200, tol=1e-3).fit(X.values, C_uvw)
    return xd, err_sampler

In [12]:
def perform_test(data1, data2):
    """Draw one error-resampled velocity sample from each of two data sets.

    ``get_xd`` is run on ``data1`` and then on ``data2``; only the returned error
    samplers are kept. Each sampler then produces a fresh sample, which is
    converted with ``spher2cart`` and reduced to columns 3, 4 and 5.

    Returns a two-element list: the sample for ``data1`` followed by the one
    for ``data2``.
    """
    # The fitted XD models are discarded; only the samplers (second tuple item) are used
    samplers = [get_xd(subset)[1] for subset in (data1, data2)]

    velocity_columns = [3, 4, 5]
    return [
        sampler.spher2cart(sampler.new_sample())[:, velocity_columns]
        for sampler in samplers
    ]


In [13]:
# One error-resampled velocity sample per half:
# samples[0] is drawn for df_left, samples[1] for df_right
samples = perform_test(df_left, df_right)

In [14]:
from scipy.stats import ttest_ind

# Two-sample t-test between the two velocity samples, evaluated column by column
# (axis=0 is the default and is spelled out here only for readability)
ttest_ind(a=samples[0], b=samples[1], axis=0)

Ttest_indResult(statistic=array([-1.63596107, -1.55386979, -1.64634709]), pvalue=array([0.10216304, 0.12053252, 0.10000716]))