In [None]:
# Select matplotlib's inline backend so figures are embedded in the cell output.
%matplotlib inline

# Gaia data visualization using voila and vaex
This notebook shows how to create a simple interactive dashboard showing a Hertzsprung-Russell (HR) diagram, which [can be rendered as a dashboard using Voila](https://voila.vaex.io/voila/render/gde-examples/20-voila-vaex-hr-diagram.ipynb).

For this, we use the following Data/Software:

 * <a href="https://gea.esac.esa.int/archive/">Gaia</a> EDR3 dataset, crossmatched to the <a href="https://panstarrs.stsci.edu/">Pan-STARRS</a> dataset (crossmatch done by Sergey Koposov).
 * [Voila](https://voila.readthedocs.io/) to render the notebook as a dashboard.
 * [Vaex](https://github.com/vaexio/vaex) to do the data aggregation.
 * [ipywidgets](https://ipywidgets.readthedocs.io/) to make this interactive.

In [None]:
import ipywidgets as widgets
import matplotlib.pylab as plt
import vaex
import vaex.jupyter
import numpy as np
import getpass


In [None]:
# Remote Gaia EDR3 x Pan-STARRS1 dataframe, opened over a websocket URL.
DATASET_URL = 'ws://dataframe-dev.vaex.io/gaia-edr3-x-ps1'
# Only the first MAX_ROWS rows are kept (presumably to keep the dashboard
# responsive -- TODO confirm the intent of this cap).
MAX_ROWS = 10_000_000

df = vaex.open(DATASET_URL)[:MAX_ROWS]

In [None]:
# `df` is a slice of the remote dataframe (see the cell that opens it), so the
# count reported here is the number of loaded rows, not the size of the full
# catalogue.
print(f'The loaded dataset contains {len(df):,} stars')

In [None]:
# Every column name the dataframe exposes.
columns_all = df.get_column_names()

# Keep only the mean-magnitude (photometry) columns.
columns_colors = []
for name in columns_all:
    if "mean_mag" in name:
        columns_colors.append(name)

header = "We found the following photometry columns in the Gaia dataset:\n\t"
print(header + "\n\t".join(columns_colors))

In [None]:
# Distance as the inverse parallax. NOTE(review): the -10 offset below matches
# the distance-modulus formula for a distance in kpc, i.e. a parallax in
# milliarcseconds -- confirm the unit of the `parallax` column.
parallax = df.parallax
df['distance'] = 1 / parallax

# Absolute G magnitude: apparent magnitude minus the distance modulus.
log_distance_term = 5 * np.log10(df.distance)
df['M_g'] = df.phot_g_mean_mag - log_distance_term - 10

# Filtering the data
We take only a subset of the data, where ${\varpi \over \sigma_\varpi} > 5$, meaning we cut out negative distances, and very uncertain distances.

In [None]:
# Keep only stars whose parallax is at least 5x its uncertainty; this drops
# negative parallaxes (hence negative distances) and very uncertain ones.
has_reliable_parallax = df.parallax_over_error > 5
df = df[has_reliable_parallax]

n_filtered = len(df)
print(f'The filtered dataset contains {n_filtered:,} stars')

# Interactive Hertzsprung-Russell diagram
Below, we show a [Hertzsprung-Russell diagram](https://en.wikipedia.org/wiki/Hertzsprung%E2%80%93Russell_diagram) where you can choose the two magnitudes that define the color, add an extra expression to filter by, and choose whether to plot the log of the density or the density itself.

In [None]:
# Default extra selection, written as an expression string over the dataframe's
# columns (`distance` is the virtual column defined as 1/parallax).
extra_filter = "distance < 10"
# Apply it as the dataframe's active selection.
df.select(extra_filter)

In [None]:
# Axis ranges of the diagram: color on x, absolute magnitude on y.
limits_color = [-2, 4]
# Deliberately descending, so small magnitudes end up at the top of the plot.
limits_magnitude = [20, -7]
# Selection expression currently applied to `df`; lets plot() skip redundant
# df.select() calls.
last_filter = extra_filter

# `manual_name` is an option of the interact factory, not a widget abbreviation.
# Passed as a plain keyword next to magnitude1/magnitude2 it is silently ignored
# (plot() has no such parameter), so the label is configured via .options().
interact_hr_diagram = widgets.interact_manual.options(manual_name="Draw HR diagram")


@interact_hr_diagram(magnitude1=columns_colors, magnitude2=columns_colors)
def plot(magnitude1=columns_colors[1], magnitude2=columns_colors[0], log=True, extra_filter=extra_filter):
    """Draw a binned Hertzsprung-Russell diagram of the current selection.

    Parameters
    ----------
    magnitude1, magnitude2 : str
        Names of two photometry columns; the x axis shows
        ``magnitude1 - magnitude2``.
    log : bool
        If true, show ``log1p`` of the counts instead of the raw counts.
    extra_filter : str
        Expression used as the dataframe selection. An invalid expression is
        reported with a printed message and nothing is drawn.
    """
    # step1: validate/setup
    global last_filter
    try:
        df.validate_expression(extra_filter)
    except Exception as e:
        # Best effort on purpose: a typo in the text box should show a message
        # in the dashboard rather than a traceback.
        print("oops, invalid expression: " + str(e))
        return

    # a performance trick, if we do not change the selection
    # vaex will use the selection cache
    if extra_filter != last_filter:
        df.select(extra_filter)
        last_filter = extra_filter

    # step2: bin the data
    color = df[magnitude1] - df[magnitude2]
    # instead of calling plot, we call count manually, so we can count the number
    # of stars in the plot.
    counts = df.count(
        binby=[color, df.M_g],
        limits=[limits_color, limits_magnitude],
        shape=128,
        selection=True,
    )

    # step3: draw the data using matplotlib
    # counts is indexed [color_bin, magnitude_bin]; imshow expects rows on the
    # y axis, hence the transpose.
    image = np.log1p(counts.T) if log else counts.T
    extent = [*limits_color, *limits_magnitude]
    N = counts.sum()

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(image, origin='lower', extent=extent, aspect='auto')
    ax.set_xlabel(str(color))
    ax.set_ylabel("$M_g$")
    ax.set_title(f'This plot contains {N:,} stars')
    plt.show()

In [None]:
# The second-to-last child of the interact container is used as the manual
# "run" button (NOTE(review): relies on ipywidgets' child ordering -- confirm).
draw_button = plot.widget.children[-2]

# Trigger the first draw programmatically, then give the button its final label.
draw_button.click()
draw_button.description = "Draw HR diagram"