# Virgo Demo 3 - Advanced pipeline

In [None]:
from virgo.cluster import VirgoCluster
from virgo.kernel import VirgoKernel, VirgoSimpleKernel
from virgo.mixture import VirgoMixture, VirgoClustering
from virgo.cleaner import LowDensityCleaner

%load_ext autoreload
%autoreload 2

%matplotlib notebook

### Data class

VirgoCluster is meant to be the base class for data handling. It stores the raw data, the rescaled data set, and the final cluster and cluster_label arrays separately. The rescaled data set is created when the scale_data() method is called. print_datastats() prints some helpful information about the stored datasets.

In [None]:
# Load the raw data file into a VirgoCluster, the data-handling container.
# NOTE(review): absolute, machine-specific path — adjust to your local setup.
file_name = "/home/max/Software/virgo/data/data.txt"
virgo_cluster = VirgoCluster(file_name=file_name)
# Create the rescaled data set, then print summary info about the stored data.
virgo_cluster.scale_data()
virgo_cluster.print_datastats()

### Kernel

Virgo uses a covariance function to create additional feature-space dimensions by leveraging correlations in the dataset itself. For the time being this is a very simple LinearKernel. VirgoKernel needs to be instantiated with the corresponding VirgoCluster object and then simply called. The new feature dimensions are added to the rescaled data set automatically, as can be seen from the stats output.

Currently, only the spatial dimensions are used for the kernel. The dimensions to use can be passed as a list.

In [None]:
# Alternative kernel configurations, kept for reference:
# virgo_kernel = VirgoKernel(virgo_cluster, spatial_dim=[0, 1, 2, 3, 4, 5], add_dim_back=6)
# virgo_kernel = VirgoKernel(virgo_cluster, spatial_dim=[0, 1, 2, 3, 4, 5, 6])
# virgo_kernel = VirgoKernel(virgo_cluster, add_dim_back=6)
# virgo_kernel = VirgoKernel(virgo_cluster)
# Instantiate the kernel with the VirgoCluster object and call it, then print
# the data stats again to inspect the rescaled data set afterwards.
virgo_kernel = VirgoSimpleKernel(virgo_cluster)
virgo_kernel()
virgo_cluster.print_datastats()

In [None]:
# Alternative mixture configurations, kept for reference:
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=12, mixture_type="bayesian_gaussian")
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=12, fit_dim_ind=[0, 1, 2, 3, 4, 5])
# Fit a 12-component mixture with the default mixture type; fit() returns the
# goodness-of-fit value that is reported below as ELBO.
virgo_mixture = VirgoMixture(virgo_cluster, n_comp=12)
elbo = virgo_mixture.fit()

print(f"ELBO: {elbo}")
print(f"Mixture weights {virgo_mixture.model.weights_}")

# Set the labels on virgo_cluster with uncertain labels removed, then print
# the labels together with their counts.
virgo_mixture.predict(remove_uncertain_labels=True)
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot the result twice: once with plot_kernel_space=True, once with
# store_gif=True.
# NOTE(review): n_step presumably subsamples the plotted points — confirm.
virgo_cluster.plot_cluster(n_step=25, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=25, store_gif=True)

In [None]:
# Print the member count of every cluster and plot each cluster on its own.
# The labels and counts are fetched once up front instead of once per
# iteration, and each label is paired with its count by position rather than
# via `label + 1` index arithmetic, which is only correct when the labels are
# contiguous and start at -1.
# NOTE(review): the first entry is skipped, as in the original loop; this
# presumably is the "uncertain" label -1 — confirm it is always present.
labels = virgo_cluster.get_labels()
counts = virgo_cluster.get_labels(return_counts=True)[1]
for label, count in zip(labels[1:], counts[1:]):
    print(f"Label {label}, Counts {count}")
    virgo_cluster.plot_cluster(n_step=10, cluster_label=[label])

In [None]:
# Clean the clustering with the low-density cleaner.
# NOTE(review): the second argument (1e-8) is presumably the density
# threshold below which clusters are removed — confirm against LowDensityCleaner.
d_cleaner = LowDensityCleaner(virgo_cluster, 1e-8)
d_cleaner.clean()
# Print the labels with their counts after cleaning and re-plot the clusters.
print(virgo_cluster.get_labels(return_counts=True))
virgo_cluster.plot_cluster(n_step=25)

In [None]:
# Plot only the clusters labelled 0 through 4.
virgo_cluster.plot_cluster(n_step=50, cluster_label=[0, 1, 2, 3, 4])

In [None]:
# virgo_cluster.export_cluster("vc_cleaned", remove_uncertain=True, remove_evno=True)

### After removing low density, start from the top

Re-running the pipeline after the low-density cleaning should help to get less spread and a less noisy covariance.

In [None]:
# Start over from a fresh VirgoCluster. The commented-out path points to a
# previously exported cleaned cluster file; the original data file is used here.
# NOTE(review): absolute, machine-specific paths — adjust to your local setup.
# file_name = "/home/max/Software/virgo/vc_cleaned_cluster.txt"
file_name = "/home/max/Software/virgo/data/data.txt"
# NOTE(review): n_max_data presumably caps the number of data points that are
# loaded — confirm against VirgoCluster.
virgo_cluster = VirgoCluster(file_name=file_name, n_max_data=200000)
# Create the rescaled data set, then print summary info about the stored data.
virgo_cluster.scale_data()
virgo_cluster.print_datastats()

In [None]:
# Alternative kernel configurations, kept for reference:
# virgo_kernel = VirgoKernel(virgo_cluster, spatial_dim=[0, 1, 2, 3, 4, 5], add_dim_back=6)
# virgo_kernel = VirgoKernel(virgo_cluster, spatial_dim=[0, 1, 2])
# virgo_kernel = VirgoKernel(virgo_cluster, add_dim_back=6)
# This pass uses VirgoKernel with its default arguments (the first pass used
# VirgoSimpleKernel). Call it, then print the data stats again.
virgo_kernel = VirgoKernel(virgo_cluster)
virgo_kernel()
virgo_cluster.print_datastats()

In [None]:
# Alternative mixture configurations, kept for reference:
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=25, mixture_type="bayesian_gaussian")
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=10, fit_dim_ind=[0, 1, 2, 3, 4, 5])
# Fit a 12-component mixture with the default mixture type; fit() returns the
# goodness-of-fit value that is reported below as ELBO.
virgo_mixture = VirgoMixture(virgo_cluster, n_comp=12)
elbo = virgo_mixture.fit()

print(f"ELBO: {elbo}")
print(f"Mixture weights {virgo_mixture.model.weights_}")

# Set the labels on virgo_cluster with uncertain labels removed, then print
# the labels together with their counts.
virgo_mixture.predict(remove_uncertain_labels=True)
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# First plot: plot_kernel_space=True and store_gif=True; second plot: defaults.
# NOTE(review): n_step presumably subsamples the plotted points — confirm.
virgo_cluster.plot_cluster(n_step=20, plot_kernel_space=True, store_gif=True)
virgo_cluster.plot_cluster(n_step=20)

In [None]:
# Print the member count of every cluster and plot each cluster on its own.
# The labels and counts are fetched once up front instead of once per
# iteration, and each label is paired with its count by position rather than
# via `label + 1` index arithmetic, which is only correct when the labels are
# contiguous and start at -1.
# NOTE(review): the first entry is skipped, as in the original loop; this
# presumably is the "uncertain" label -1 — confirm it is always present.
labels = virgo_cluster.get_labels()
counts = virgo_cluster.get_labels(return_counts=True)[1]
for label, count in zip(labels[1:], counts[1:]):
    print(f"Label {label}, Counts {count}")
    virgo_cluster.plot_cluster(n_step=10, cluster_label=[label])

In [None]:
# Mixture-model alternatives, kept for reference:
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=10, mixture_type="bayesian_gaussian")  # verbose
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=10, fit_dim_ind=[0, 1, 2, 3, 4, 5])
# Cluster with VirgoClustering using clustering_type="optics", then print the
# resulting labels with their counts.
virgo_clustering = VirgoClustering(virgo_cluster, n_clusters=12, clustering_type="optics")
virgo_clustering.predict()
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot once with plot_kernel_space=True and once with the default view.
virgo_cluster.plot_cluster(n_step=20, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=20)

In [None]:
# Cluster with VirgoClustering using its default clustering_type and
# min_samples=100, then print the resulting labels with their counts.
virgo_clustering = VirgoClustering(virgo_cluster, min_samples=100)
virgo_clustering.predict()
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot once with plot_kernel_space=True and once with the default view.
virgo_cluster.plot_cluster(n_step=5, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=5)

In [None]:
# Cluster with clustering_type="agglo" and 8 clusters, then print the
# resulting labels with their counts.
virgo_clustering = VirgoClustering(virgo_cluster, n_clusters=8, clustering_type="agglo")
virgo_clustering.predict()
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot once with plot_kernel_space=True and once with the default view.
virgo_cluster.plot_cluster(n_step=1, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=1)

In [None]:
# Cluster with clustering_type="spectral" and 10 clusters, then print the
# resulting labels with their counts.
virgo_clustering = VirgoClustering(virgo_cluster, n_clusters=10, clustering_type="spectral")
virgo_clustering.predict()
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot once with plot_kernel_space=True and once with the default view.
virgo_cluster.plot_cluster(n_step=5, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=5)

In [None]:
# Cluster with clustering_type="dbscan" and min_samples=10, then print the
# resulting labels with their counts.
virgo_clustering = VirgoClustering(virgo_cluster, min_samples=10, clustering_type="dbscan")
virgo_clustering.predict()
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

# Plot once with plot_kernel_space=True and once with the default view.
virgo_cluster.plot_cluster(n_step=5, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=5)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scan the number of mixture components (2, 5, ..., 44) and record, for each
# fitted "gaussian" mixture, the value returned by fit() and the model's BIC
# evaluated on the scaled data.
elbos = []
bics = []
for i in range(2, 45, 3):
    virgo_mixture = VirgoMixture(virgo_cluster, n_comp=i, mixture_type="gaussian")
    elbo = virgo_mixture.fit()
    elbos.append(elbo)
    bic = virgo_mixture.model.bic(virgo_cluster.scaled_data)
    bics.append(bic)
    print(i, elbo, bic)

# After the loop, virgo_mixture refers to the last model that was fitted.
print(elbos)
print(bics)

In [None]:
# Plot the recorded ELBO values. The x-axis is the list index of each scan
# step, not the number of components.
plt.plot(elbos)
plt.show()   

In [None]:
# Plot the recorded BIC values. The x-axis is the list index of each scan
# step, not the number of components.
plt.plot(bics)
plt.show()  

In [None]:
from sklearn.cluster import OPTICS, DBSCAN
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Run scikit-learn's OPTICS directly on every 30th row of the scaled data.
X = virgo_cluster.scaled_data[::30]
clustering = OPTICS(min_samples=50)
# DBSCAN alternative, kept for reference:
# clustering = DBSCAN(eps=0.4, min_samples=50).fit(X)
pred_y1 = clustering.fit_predict(X)


# Earlier copy of the plotting code below, kept commented out:
# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(projection='3d')
# # ax.scatter(sub_data.T[0], sub_data.T[1], sub_data.T[2])
# ax.scatter(X.T[0], X.T[1], X.T[2], c=pred_y1, marker=".", cmap="plasma")
# plt.show()

# 3D scatter plot of the first three columns of X, coloured by predicted label.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')
# ax.scatter(sub_data.T[0], sub_data.T[1], sub_data.T[2])
ax.scatter(X.T[0], X.T[1], X.T[2], c=pred_y1, marker=".", cmap="plasma")
plt.show()

In [None]:
# Show the distinct predicted labels and how often each one occurs.
np.unique(pred_y1, return_counts=True)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scan the number of mixture components and record, for each fitted
# "gaussian" mixture, the value returned by fit() and the model's BIC
# evaluated on the scaled data.
n_comp_values = list(range(2, 25, 3))
elbos = []
bics = []
for n_comp in n_comp_values:
    virgo_mixture = VirgoMixture(virgo_cluster, n_comp=n_comp, mixture_type="gaussian")
    elbo = virgo_mixture.fit()
    elbos.append(elbo)
    bic = virgo_mixture.model.bic(virgo_cluster.scaled_data)
    bics.append(bic)
    print(n_comp, elbo, bic)

# After the loop, virgo_mixture refers to the last model that was fitted.
print(elbos)
print(bics)

# Plot each measure against the actual number of components (not the list
# index) and give each curve its own figure, so that the two quantities,
# which live on very different scales, never end up on the same axes.
plt.figure()
plt.plot(n_comp_values, elbos, marker="o")
plt.xlabel("Number of mixture components")
plt.ylabel("ELBO")
plt.show()

plt.figure()
plt.plot(n_comp_values, bics, marker="o")
plt.xlabel("Number of mixture components")
plt.ylabel("BIC")
plt.show()

In [None]:
# Set the labels using whatever model virgo_mixture currently refers to (if
# the cells are run in order, the last model fitted in the scan above), with
# uncertain labels removed, then print the labels with their counts.
virgo_mixture.predict(remove_uncertain_labels=True)
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

### Gaussian mixture fit model

We are using a Gaussian mixture model to classify the data. The VirgoMixture class currently has a GaussianMixture model with a fixed number of components and a BayesianGaussianMixture model with a Dirichlet process prior to downweight unneeded components. We currently employ the former as the default.

The evidence lower bound is returned as goodness-of-fit measure and the component weights can be called from the model as attribute.

Calling the predict() method without any data as input automatically sets the labels for the entire dataset in the VirgoCluster. The option to remove labels with a probability below 95% is also available, but it is not enabled by default. The threshold can be changed via an input parameter as well.

In [None]:
# Fit a "bayesian_gaussian" mixture with up to 25 components; the
# commented-out line is the fixed 12-component alternative.
virgo_mixture = VirgoMixture(virgo_cluster, n_comp=25, mixture_type="bayesian_gaussian")
# virgo_mixture = VirgoMixture(virgo_cluster, n_comp=12)
# fit() returns the goodness-of-fit value that is reported below as ELBO.
elbo = virgo_mixture.fit()

print(f"ELBO: {elbo}")
print(f"Mixture weights {virgo_mixture.model.weights_}")

# Set the labels on virgo_cluster with uncertain labels removed, then print
# the labels together with their counts.
virgo_mixture.predict(remove_uncertain_labels=True)
labels_removed = virgo_cluster.get_labels(return_counts=True)
print(labels_removed)

### Visualization 

VirgoCluster has a general plotting method plot_cluster() to visualize the fitted data. Specific labels can be selected via a list input. "Removed" uncertain labels are not shown by default, but can be switched on again. Marker size is also an input parameter.

In [None]:
# Plot the fitted data once with plot_kernel_space=True and once with the
# default view.
# NOTE(review): n_step presumably subsamples the plotted points — confirm.
virgo_cluster.plot_cluster(n_step=50, plot_kernel_space=True)
virgo_cluster.plot_cluster(n_step=50)

In [None]:
# Plot only the clusters labelled 0 through 3.
virgo_cluster.plot_cluster(n_step=25, cluster_label=[0, 1, 2, 3])

In [None]:
# Switch the display of removed labels back on and plot only label -1.
# NOTE(review): -1 is presumably the label given to removed/uncertain
# points — confirm against VirgoCluster.
virgo_cluster.plot_cluster(n_step=10, remove_uncertain=False, cluster_label=[-1])

### Cleaning

We can further clean the resulting clusters either by further separating a cluster, checked with a two-component GaussianMixture fit, or by removing low-density clusters, which are of low interest to our problem. The latter is more stable for the time being: both rely on an empirical parameter, but the density cut is physically motivated and easier to verify.

Relabeling according to cluster size is performed by default, but can be set to False.

In [None]:
# Plot the current clustering and show the labels with their counts as the
# cell output, for comparison with the cleaned result.
virgo_cluster.plot_cluster(n_step=50)
virgo_cluster.get_labels(return_counts=True)

In [None]:
# Clean the clustering with the low-density cleaner.
# NOTE(review): the second argument (1e-10) is presumably the density
# threshold below which clusters are removed — confirm against LowDensityCleaner.
d_cleaner = LowDensityCleaner(virgo_cluster, 1e-10)
d_cleaner.clean()
# Print the labels with their counts after cleaning and re-plot the clusters.
print(virgo_cluster.get_labels(return_counts=True))
virgo_cluster.plot_cluster(n_step=50)

In [None]:
# Plot only the clusters labelled 0 through 4.
virgo_cluster.plot_cluster(n_step=25, cluster_label=[0, 1, 2, 3, 4])