In [2]:
import numpy as np
import pandas as pd
from scipy.stats import mode
import sys

sys.path.append('/home/nico/VSCodeRepos/SigMA/')
from SigMA.SigMA import SigMA
from NoiseRemoval.xd_special import XDSingleCluster
from miscellaneous.covariance_trafo_sky2gal import transform_covariance_shper2gal
from miscellaneous.error_sampler import ErrorSampler
from generate_data import generate_data

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plt_colors = [
    '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3',
    '#FF6692', '#B6E880', '#FF97FF', '#FECB52', '#B82E2E', '#316395'
]

def plot_3D_data(data, xyz_titles=['X', 'Y', 'Z'], ax_range=[-40, 40], labels=None):
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"type": "scatter3d"}]],
        column_widths=[1],
        subplot_titles=[
            '3D', 
        ],
    )
    # --------------- 3D scatter plot -------------------
    if labels is None:
        trace_3d = go.Scatter3d(
            x=data[:, 0], y=data[:, 1], z=data[:, 2],
            mode='markers',
            marker=dict(size=5, color='red'),
            hoverinfo='none',
            showlegend=False,
        )
        fig.add_trace(trace_3d, row=1, col=1)
    else:
        for l_i in np.unique(labels):
            if l_i != -1:
                trace_3d = go.Scatter3d(
                    x=data.loc[labels==l_i, xyz_titles[0]], y=data.loc[labels==l_i, xyz_titles[1]], z=data.loc[labels==l_i, xyz_titles[2]],
                    mode='markers',
                    marker=dict(size=5, color=plt_colors[l_i%len(plt_colors)]),
                    hoverinfo='none',
                    showlegend=True,
                    name=f'Cluster {l_i}'
                )
                fig.add_trace(trace_3d, row=1, col=1)
    
    # 3d position
    plt_kwargs = dict(showbackground=False, showline=False, zeroline=True, zerolinecolor='grey', zerolinewidth=2, 
                      showgrid=True, showticklabels=True, color='black',
                      linecolor='black', linewidth=1,  gridcolor='rgba(100,100,100,0.5)')

    xaxis=dict(**plt_kwargs, title=xyz_titles[0], range=ax_range)
    yaxis=dict(**plt_kwargs, title=xyz_titles[1], range=ax_range)
    zaxis=dict(**plt_kwargs, title=xyz_titles[2], range=ax_range)

    # Finalize layout
    fig.update_layout(
        title="",
        #width=800,
        #height=800,
        showlegend=True,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        legend= dict(itemsizing='constant', font_color='black'),
        # 3D plot
        scene=dict(
            xaxis=dict(xaxis),
            yaxis=dict(yaxis),
            zaxis=dict(zaxis)
        )
    )
    fig.write_html(f"/home/nico/Desktop/simulated_cluster.html")

In [4]:
data_new, labels_true, df = generate_data()

Negative parallax values encountered, fixing values...


In [13]:
cols2fit = ['X', 'Y', 'Z', 'U', 'V', 'W']
sigma_kwargs = dict(
    cluster_features=cols2fit,
    scale_factors={'pos': {'features': ['U', 'V', 'W'], 'factor': [1, 5, 7]}},
    # These are the default values and should be kept for now
    nb_resampling=0, max_knn_density=101,
    beta=0.99, knn_initcluster_graph=40,
    transform_function=None
)
clusterer = SigMA(data=data_new, **sigma_kwargs).fit(alpha=0.01, knn=15, bh_correction=True)

def produce_baseline(clusterer):
    _, p_values = clusterer.run_sigma(
        alpha=-np.inf, knn=15, return_pvalues=True
    )
    p_values = np.array(p_values)
    pv_sorted = np.sort(p_values[p_values < 0.05])
    # compute mid point between consecutive p-values
    mid_points = (pv_sorted[1:] + pv_sorted[:-1]) / 2
    # Compute vhat for first step
    clusterer.fit(alpha=0.05, knn=15, bh_correction=True)
    return mid_points, clusterer.labels_

mid_points, l0 = produce_baseline(clusterer)

Performing gradient ascend using a 15-NN density estimation.
Updated significance threshold: 7.01e-04


' def produce_baseline(clusterer):\n    _, p_values = clusterer.run_sigma(\n        alpha=-np.inf, knn=15, return_pvalues=True\n    )\n    p_values = np.array(p_values)\n    pv_sorted = np.sort(p_values[p_values < 0.05])\n    # compute mid point between consecutive p-values\n    mid_points = (pv_sorted[1:] + pv_sorted[:-1]) / 2\n    # Compute vhat for first step\n    clusterer.fit(alpha=0.05, knn=15, bh_correction=True)\n    return mid_points, clusterer.labels_\n\nmid_points, l0 = produce_baseline(clusterer) '

In [8]:
clusterer_labels = pd.DataFrame(l0, columns=['iteration_0'])
splits = {}

In [14]:
plot_3D_data(data_new, xyz_titles=['X', 'Y', 'Z'], ax_range=[-500, 500], labels=clusterer.labels_)

In [9]:
iteration = 1
for alpha_i in mid_points:
    # fit clusterer to new alpha
    clusterer.fit(alpha=alpha_i, knn=15, bh_correction=False)
    l_i = clusterer.labels_ 
    new_clusters_id = list(set(l_i) - set(l0))
    clusterer_labels[f'iteration_{iteration}'] = l_i

    if len(new_clusters_id) == 1: 
        # one new cluster was generated
        nc_id = new_clusters_id[0]
        part_of_old_cluster = mode(l0[l_i==nc_id], keepdims=False).mode
        print(f'New cluster: {nc_id} coming from {part_of_old_cluster}')
        splits[f'iteration_{iteration}'] = {
            'new_cluster': nc_id,
            'old_cluster': part_of_old_cluster,
            'alpha': alpha_i
        }
        
    elif len(new_clusters_id) > 1:
        print('More than one new cluster')
    else:
        print('No new cluster')

    l0 = np.copy(l_i)
    iteration += 1
    print(iteration)
    print('-------------------\n')

No new cluster
2
-------------------

No new cluster
3
-------------------

New cluster: 41202 coming from 80
4
-------------------

New cluster: 31949 coming from 41202
5
-------------------

New cluster: 16979 coming from 80
6
-------------------

New cluster: 40497 coming from 16979
7
-------------------

New cluster: 46624 coming from 31949
8
-------------------

New cluster: 10796 coming from 80
9
-------------------

New cluster: 16162 coming from 80
10
-------------------

New cluster: 31710 coming from 41202
11
-------------------

New cluster: 46344 coming from 80
12
-------------------

New cluster: 46336 coming from 80
13
-------------------

New cluster: 40495 coming from 41202
14
-------------------

New cluster: 40978 coming from 41202
15
-------------------

New cluster: 45243 coming from 16979
16
-------------------

New cluster: 50087 coming from 16979
17
-------------------

New cluster: 34157 coming from 40978
18
-------------------

New cluster: 21380 coming from 40

# Data Plotting

In [13]:
labels_true

array([ 0,  0,  0, ..., -1, -1, -1])

In [27]:
splits

{'iteration_3': {'new_cluster': 41202,
  'old_cluster': 80,
  'alpha': 5.11329811825334e-10},
 'iteration_4': {'new_cluster': 31949,
  'old_cluster': 41202,
  'alpha': 4.643816626698216e-05},
 'iteration_5': {'new_cluster': 16979,
  'old_cluster': 80,
  'alpha': 0.0001987753436394346},
 'iteration_6': {'new_cluster': 40497,
  'old_cluster': 16979,
  'alpha': 0.0030160988117030607},
 'iteration_7': {'new_cluster': 46624,
  'old_cluster': 31949,
  'alpha': 0.005852683999629238},
 'iteration_8': {'new_cluster': 10796,
  'old_cluster': 80,
  'alpha': 0.006586604070697222},
 'iteration_9': {'new_cluster': 16162,
  'old_cluster': 80,
  'alpha': 0.00729363315550835},
 'iteration_10': {'new_cluster': 31710,
  'old_cluster': 41202,
  'alpha': 0.007579220372674111},
 'iteration_11': {'new_cluster': 46344,
  'old_cluster': 80,
  'alpha': 0.008155052579305933},
 'iteration_12': {'new_cluster': 46336,
  'old_cluster': 80,
  'alpha': 0.010267512827059955},
 'iteration_13': {'new_cluster': 40495,
  '

In [33]:
import matplotlib.pyplot as plt

def dense_sample(rho):
    """Extract the densest points from the density distribution."""
    mad = np.median(np.abs(rho - np.median(rho)))
    threshold = np.median(rho) * 0.995 + 3 * mad * 1.1
    if np.sum(rho > threshold) < 20:
        threshold = np.percentile(rho, 93)
    return rho > threshold

# plotting the clustered labels according to their true cluster
for split in splits:
    old_cluster = splits[split]['old_cluster']
    new_cluster = splits[split]['new_cluster']

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    for i, cluster in enumerate([old_cluster, new_cluster]):
        # get the dense core
        rho = clusterer.weights_[clusterer_labels[split] == cluster]
        dense_core = dense_sample(rho)
        # plot the dense core
        labels, counts = np.unique(labels_true[clusterer_labels[split] == cluster][dense_core], return_counts=True)
        ax[i].bar(labels, counts)

    plt.close()
    fig.savefig(f'/home/nico/Desktop/split_{split}.png')

# Get UVW uncertainties

In [11]:
# Get UVW of densest component
c_vel = ['U', 'V', 'W']
X = data_new[c_vel]
C = err_sampler.C[:, 3:, 3:]
C.shape

(1000, 3, 3)

In [12]:
ra, dec, plx, _, _, _ = ErrorSampler().cart2spher(data_new[cols].values)
# Compute covariance matrix in Galactic coordinates
C_uvw = transform_covariance_shper2gal(ra, dec, plx, C) 

In [33]:
%%time
xd = XDSingleCluster(max_iter=200, tol=1e-3).fit(X.values, C_uvw)

CPU times: user 1.11 s, sys: 8.55 ms, total: 1.12 s
Wall time: 386 ms


In [34]:
xd.V

(array([[[ 4.09500036, -0.74012688,  0.12818122],
         [-0.74012688,  4.57595199,  0.34765216],
         [ 0.12818122,  0.34765216,  4.16556263]]]),
 array([[[ 4.06372913, -0.75302132,  0.11959895],
         [-0.75302132,  4.57064204,  0.34411419],
         [ 0.11959895,  0.34411419,  4.16333885]]]),
 array([[4, 0, 0],
        [0, 4, 0],
        [0, 0, 4]]))