In [1]:
from astrosite_dataset import AstrositeDataset

# Directory holding the Astrosite recordings. Point this to your local copy of
# the Astrosite dataset that's available on the NAS. The path is relative, so
# it is resolved against the directory the notebook kernel was started in.
dataset_path = 'data/astrosite/recordings'
# Iterable dataset object; the statistics cell below loops over it and reads
# the 'events', 'labelled_events', 'target_id' and 'recording_data' entries of
# each sample.
dataset = AstrositeDataset(dataset_path)

In [6]:
from itertools import islice
import pandas as pd

# Collect one row of summary statistics per recording into `data`, then build
# the `df` DataFrame that the plotting cell consumes.

# Upper bound on the number of recordings scanned (keeps the cell fast).
MAX_SAMPLES = 5000

# A sample whose smallest label is below this value is excluded and counted
# in `skipped_samples`.
MIN_VALID_LABEL = -1


def _time_span(timestamps):
    """Return last-minus-first timestamp, or 0 when there are no timestamps.

    Guards the `[-1]` / `[0]` indexing, which raises IndexError on an empty
    sequence.
    """
    if len(timestamps) == 0:
        return 0
    return timestamps[-1] - timestamps[0]


def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    NaN (rather than inf or an exception) marks the value as undefined so
    pandas aggregations skip it.
    """
    if denominator == 0:
        return float('nan')
    return numerator / denominator


data = []
skipped_samples = 0

for sample in islice(dataset, MAX_SAMPLES):
    events = sample['events']
    labelled_events = sample['labelled_events']
    n_events = events.shape[0]
    n_labelled = labelled_events.shape[0]

    # Only the smallest and largest label are used, so min/max replace the
    # former dedupe-and-sort. A recording without labelled events has no
    # labels at all; NaN compares False against the threshold, so it is kept.
    if n_labelled:
        label_values = labelled_events['label']
        lowest_label, highest_label = min(label_values), max(label_values)
    else:
        lowest_label = highest_label = float('nan')

    if lowest_label < MIN_VALID_LABEL:
        skipped_samples += 1
        continue

    # Time spans in raw timestamp units; the 1e6 factors below convert to
    # seconds (presumably timestamps are microseconds - TODO confirm).
    events_span = _time_span(events['t'])
    labelled_span = _time_span(labelled_events['t'])

    data.append({
        'Events': n_events,
        'Events Labelled': n_labelled,
        # 'Events Labelled On/Off Ratio': sample['labelled_events']['on'].astype(float).mean(),  (currently disabled)
        # Labelled events per second; NaN when the labelled span is zero
        # (no labelled events, or all of them share one timestamp).
        'Events Labelled Density': _ratio_or_nan(n_labelled, labelled_span) * 1e6,
        # Fraction of all events that carry a label; NaN for an empty recording.
        'Label Ratios': _ratio_or_nan(n_labelled, n_events),
        'Duration (s)': events_span / 1e6,
        'Duration Labelled (s)': labelled_span / 1e6,
        'Target id': sample['target_id'],
        # Largest label value present in the recording.
        'Labels': highest_label,
        'Intrinsic magnitude': sample['recording_data']['object']['intrinsic_magnitude'],
        'Maximum magnitude': sample['recording_data']['object']['maximum_magnitude'],
    })

df = pd.DataFrame(data)

In [27]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Build a 2x2 grid of summary plots from the per-sample statistics in `df`.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Histogram of Events Labelled",
        "Durations vs Label Ratios",
        "Intrinsic vs Max Magnitude",
        "Number of samples per target IDs"
    )
)

# Top-left: distribution of the number of labelled events per sample.
fig.add_trace(
    go.Histogram(x=df['Events Labelled'], name='Events Labelled'),
    row=1, col=1
)
fig.update_xaxes(title_text='Events labelled', row=1, col=1)
fig.update_yaxes(title_text='Samples', row=1, col=1)

# Top-right: labelled duration against the fraction of events that are labelled.
fig.add_trace(
    go.Scatter(x=df['Duration Labelled (s)'], y=df['Label Ratios'], mode='markers', name='Durations (s) vs Label Ratios'),
    row=1, col=2
)
fig.update_xaxes(title_text='Duration labelled (s)', row=1, col=2)
fig.update_yaxes(title_text='Label ratio', row=1, col=2)

# Bottom-left: intrinsic magnitude against maximum magnitude of the target.
fig.add_trace(
    go.Scatter(x=df['Intrinsic magnitude'], y=df['Maximum magnitude'], mode='markers', name='Intrinsic vs Max Magnitude'),
    row=2, col=1
)
fig.update_xaxes(title_text='Intrinsic magnitude', row=2, col=1)
fig.update_yaxes(title_text='Maximum magnitude', row=2, col=1)

# Count samples per target once; the index holds the target IDs and the
# values hold how many samples each target has.
target_id_tally = df['Target id'].value_counts()
target_id_counts = target_id_tally.to_numpy()
target_id_labels = target_id_tally.index.to_numpy()

# Bottom-right: number of samples per target ID.
fig.add_trace(
    go.Bar(x=target_id_labels, y=target_id_counts, name='Target IDs'),
    row=2, col=2
)
# Target IDs are identifiers, not quantities: force a categorical axis so that
# numeric IDs are drawn as evenly spaced bars instead of being placed on a
# continuous number line.
fig.update_xaxes(title_text='Target ID', type='category', row=2, col=2)
fig.update_yaxes(title_text='Samples', row=2, col=2)

# Overall figure size and title.
fig.update_layout(height=800, width=1200, title_text="Dataset Statistics")

# Render the figure.
fig.show()