# Velocity Test using the Bootstrap

This testing procedure calculates the optimal velocity and then runs a bootstrap comparison on them using a range based test.

In [2]:
import pandas as pd
import numpy as np
import pickle
import sys
from astropy.coordinates import SkyCoord, Galactic
import astropy.units as u
from scipy.stats import mode
from scipy.stats import ttest_ind
from calculate_velocity import calculate_velocity


sys.path.append('/home/nico/VSCodeRepos/SigMA')
from SigMA.SigMA import SigMA

## Simulated Data Loading

In [3]:
# load data, generated for visual data analysis; this also includes the true labels and the weights generated by SigMA
df = pd.read_csv('simulated_data/df.csv', index_col=0)

# load the splits; each key/value pair contains the iteration when the split happens (key) and the cluster labels (values)
splits = pickle.load(open('simulated_data/splits.pkl', 'rb'))

# contains the labels after each split of the data
split_labels = pd.read_csv('simulated_data/labels_iteration.csv', index_col=0)

## Testing Procedure

In [7]:
correct_splits = 0
uvw = []
for iteration, cluster_labels in splits.items():
    print(f'--- Split of clusters {cluster_labels[0], cluster_labels[1]}---')
    # print(iteration, cluster_labels)
    # we need the data of the iteration and the weights
    uvw_cluster = []
    for cluster in cluster_labels[:2]:
        cluster_points = df[split_labels[iteration] == cluster]
        weights = df[split_labels[iteration] == cluster]['weights']
        
        uvw_cluster.append(calculate_velocity(cluster_points, weights))
    uvw.append(uvw_cluster)

    # perform bootstrap testing: 
    # get a bootstrap sample from both clusters
    # then calculate the difference
    # finally, check if the confidence interval contains 0

    velo_difference = []
    for i in range(100):
        bootstrap_sample = []
        for velo in uvw_cluster:
            samples_count = len(velo)
            # get sample
            velo_sample = velo[np.random.choice(np.arange(samples_count), size=50)]
            velo_sample_mean = np.mean(velo_sample, axis=0)
            bootstrap_sample.append(velo_sample_mean)
        velo_difference.append(bootstrap_sample[0] - bootstrap_sample[1])

    confidence_interval = np.percentile(velo_difference, [2.5, 97.5], axis=0)
    # if all three confidence intervals contain 0, we have a splits
    same_velo = True
    for i in range(len(confidence_interval[0])):
        if not confidence_interval[0][i] < 0 and confidence_interval[1][i] > 0:
            same_velo = False

    print(f'Should split: {"True" if cluster_labels[2] else "False"}')
    if same_velo != cluster_labels[2]:
        correct_splits += 1
    print(f'Test indicates split: {same_velo}\n')

print(f'{correct_splits} out of {len(splits)} were performed correctly')

--- Split of clusters (4137, 753)---
Should split: True
Test indicates split: False

--- Split of clusters (753, 445)---
Should split: True
Test indicates split: True

--- Split of clusters (4137, 8447)---
Should split: True
Test indicates split: True

--- Split of clusters (8447, 36243)---
Should split: True
Test indicates split: False

--- Split of clusters (4137, 22941)---
Should split: True
Test indicates split: False

--- Split of clusters (4137, 32730)---
Should split: True
Test indicates split: False

--- Split of clusters (753, 48293)---
Should split: True
Test indicates split: False

--- Split of clusters (4137, 48034)---
Should split: True
Test indicates split: False

--- Split of clusters (8540, 52908)---
Should split: True
Test indicates split: True

--- Split of clusters (4137, 1777)---
Should split: False
Test indicates split: True

--- Split of clusters (4137, 58479)---
Should split: True
Test indicates split: False

--- Split of clusters (8447, 51699)---
Should split: T