# Source clustering

This notebook clusters sources from nightly validation to generate object light curves.

In [None]:
import astropy.units as u
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lsdb

import lsst.daf.butler as dafButler

from astropy.coordinates import SkyCoord
from lsdb import ConeSearch
from pathlib import Path
from tqdm import tqdm

pd.set_option('display.max_rows', 100)

In [None]:
# Install lsdb from the development branch named in the URL below.
# NOTE(review): `lsdb` is already imported in an earlier cell, so a kernel
# restart is presumably needed before this install takes effect — confirm.
%pip install git+https://github.com/astronomy-commons/lsdb.git@sean/nested-crossmatch

### Querying for recent day_obs

First, let's get the visit IDs along with each exposure's midpoint MJD.

In [None]:
# Make sure the ".consdb" host suffix is listed in no_proxy before the client
# is created. os.environ.get() avoids a KeyError when no_proxy is not set at
# all, and the membership check keeps re-runs of this cell from appending
# duplicate entries.
no_proxy_hosts = [host for host in os.environ.get("no_proxy", "").split(",") if host]
if ".consdb" not in no_proxy_hosts:
    no_proxy_hosts.append(".consdb")
os.environ["no_proxy"] = ",".join(no_proxy_hosts)

from lsst.summit.utils import ConsDbClient

client = ConsDbClient("http://consdb-pq.consdb:8080/consdb")

# All visits of science program BLOCK-365 with day_obs >= 20250415.
visit_query = (
    "SELECT * FROM cdb_lsstcam.visit1 "
    "WHERE day_obs >= 20250415 and science_program = 'BLOCK-365'"
)
visits = client.query(visit_query).to_pandas()

In [None]:
# Keep only the visit columns used in the rest of the notebook.
wanted_columns = ["visit_id", "day_obs", "exp_midpt_mjd", "dimm_seeing"]
visits = visits[wanted_columns]
visits

In [None]:
# Restrict the visit table to day_obs 20250420.
on_chosen_day = visits["day_obs"] == 20250420
single_day_visit = visits[on_chosen_day]
single_day_visit

In [None]:
# Pick the row of single_day_visit with the smallest dimm_seeing value.
best_row_position = np.argmin(single_day_visit["dimm_seeing"])
best_dim_seeing_visit = single_day_visit.iloc[best_row_position]
best_dim_seeing_visit

### Initializing the Butler

In [None]:
# Butler configuration: repository, instrument and the nightly validation collection.
instrument = "LSSTCam"
repo = "embargo"
collection_all = "LSSTCam/runs/nightlyValidation"
butler = dafButler.Butler(
    repo,
    collections=collection_all,
    instrument=instrument,
)

### Getting sources for 2025_04_20

In [None]:
# List the collection names matching the pattern used to locate the 2025_04_20 run.
collection_pattern = "LSSTCam/runs/nightlyValidation/202504*7"
matching_collections = butler.registry.queryCollections(collection_pattern)
all_collections = list(matching_collections)
all_collections

In [None]:
# TODO: Give all collections(all_collections instead of collection_single_day)
# Re-create the Butler so that it reads from the single-day collection only.
collection_single_day = "LSSTCam/runs/nightlyValidation/20250420/d_2025_04_19/DM-50157"
butler = dafButler.Butler(
    repo,
    instrument=instrument,
    collections=collection_single_day,
)

In [None]:
# Load the filtered source table of every visit of the chosen day into visit_dfs.
visit_dfs = []

visit_ids = single_day_visit["visit_id"].to_numpy()
visit_mjds = single_day_visit["exp_midpt_mjd"].to_numpy()
visit_dimseeings = single_day_visit["dimm_seeing"].to_numpy()

# Columns read from each per-visit table, and the column order of the output
# (only a few columns are kept, for efficiency).
source_columns = ["ra", "dec", "sourceId", "band", "psfFlux", "psfFluxErr"]
output_columns = ["ra", "dec", "sourceId", "band", "mjd", "psfFlux", "psfFluxErr", "visit_id", "dim_seeing"]

# zip() has no length, so pass the number of visits to tqdm explicitly;
# otherwise the progress bar cannot show a completed fraction.
visit_rows = zip(visit_ids, visit_mjds, visit_dimseeings)
for visit, mjd, dim_seeing in tqdm(visit_rows, total=len(visit_ids)):
    # Best effort: a visit that fails to load or lacks a column is reported and skipped.
    try:
        # Get all sources for visit
        df = butler.get('single_visit_star', visit=visit, instrument=instrument).to_pandas()

        # Keep primary detections with a valid coord_ra that are not sky sources
        keep = (
            (df['detect_isPrimary'] == True)
            & df["coord_ra"].notna()
            & (df['sky_source'] == False)
        )
        df = df[keep]

        # Skip if DataFrame is now empty
        if df.empty:
            print(f"Skipping visit {visit} — no valid rows left.")
            continue

        # assign() builds a new frame instead of writing into the filtered
        # slice, which avoids pandas' SettingWithCopyWarning.
        df = df[source_columns].assign(visit_id=visit, mjd=mjd, dim_seeing=dim_seeing)
        visit_dfs.append(df[output_columns])
    except Exception as e:
        print(f"Skipping visit {visit} due to error: {e}")

print(f"Loaded {len(visit_dfs)} dataframes.")

In [None]:
# TODO: Select all ra dec around 217 -17
# TODO: Add cut: only i-band
# Aggregate all visits for this day: stack the per-visit tables collected in
# visit_dfs into a single source table.
final_df = pd.concat(visit_dfs)
final_df

In [None]:
# Import with lsdb: build a catalog from the concatenated source table.
source_cat = lsdb.from_dataframe(final_df)
source_cat

In [None]:
# Display the catalog again.
source_cat

In [None]:
# Plot the pixels of the source catalog.
source_cat.plot_pixels()

In [None]:
# Distinct dim_seeing values present in the catalog (computes the column).
np.unique(source_cat["dim_seeing"].compute())

In [None]:
# Overlay a cone centred on (220 deg, -20 deg) on the catalog's pixel plot.
center = SkyCoord(220 * u.deg, -20 * u.deg)
fig, ax = source_cat.plot_pixels()

# Third ConeSearch argument (the cone radius).
cone_radius = 200 * 60
cone_search = ConeSearch(center.ra.deg, center.dec.deg, cone_radius)
cone_search.plot(fc="#00000000", ec="red")

In [None]:
# Base directory under which the notebook's catalogs are stored.
# (Plain string literal: the original f-string had no placeholders.)
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/tmp")
#source_cat.to_hats(base_output_dir / "source")

In [None]:
# Reload the source catalog from the "source" directory under base_output_dir.
# NOTE(review): the matching to_hats() write is commented out in the previous
# cell, so this presumably reads a catalog saved by an earlier run — confirm
# it is up to date.
source_cat = lsdb.read_hats(base_output_dir / "source")
source_cat

In [None]:
# Which visit inside the cone has the lowest dim_seeing?
cone_cat = source_cat.cone_search(center.ra.deg, center.dec.deg, 200 * 60)
cone_cat_df = cone_cat.compute()

# Row with the minimum dim_seeing, then its visit_id.
best_seeing_row = cone_cat_df.iloc[np.argmin(cone_cat_df["dim_seeing"])]
visit_min_dimseeing = best_seeing_row["visit_id"]
visit_min_dimseeing

In [None]:
# TODO: Get all sources that have good dimseeing in the whole catalog
# Sources detected in the best-seeing visit serve as the object list.
from_best_visit = cone_cat_df["visit_id"] == visit_min_dimseeing
object_df = cone_cat_df[from_best_visit]
object_df

In [None]:
# Build a catalog from the object table and plot its pixels.
object_cat = lsdb.from_dataframe(object_df)
object_cat.plot_pixels()

In [None]:
# Get light curves for this pixel: crossmatch the objects against all sources
# in the cone, with n_neighbors=20 per object.
# TODO: Add radius_arcsec=0.2
# TODO: n_neighbors = num_visits
result = object_cat.crossmatch_nested(cone_cat, n_neighbors=20).compute()
result

In [None]:
# Take the "from_lsdb_dataframe" entry of the second row of the crossmatch
# result (presumably the sources matched to that object — confirm) and sort
# it by mjd.
lc = result["from_lsdb_dataframe"].iloc[1].sort_values("mjd")
lc

In [None]:
# Plot colour used for each band label.
COLORS = {
    "u": "#56b4e9",
    "g": "#009e73",
    "r": "#f0e442",
    "i": "#cc79a7",
    "z": "#d55e00",
    "y": "#0072b2",
}


def _plot_rubin_lc(ax, lc, mag_col, magerr_col, x_name, x_label):
    """Draw one errorbar series per band of a light curve on ``ax``.

    Every band listed in ``COLORS`` gets its own series (and legend entry),
    even when ``lc`` has no rows for that band. The y-axis is labelled
    "Magnitude (AB)" and inverted.

    Parameters
    ----------
    ax
        Axes-like object providing ``errorbar``, ``set_xlabel``,
        ``set_ylabel``, ``invert_yaxis`` and ``legend``.
    lc
        Table with a ``band`` column plus the columns named by ``mag_col``,
        ``magerr_col`` and ``x_name``; it must support ``.query``.
    mag_col, magerr_col
        Names of the columns plotted as y values and y error bars.
    x_name
        Name of the column plotted on the x-axis.
    x_label
        Text of the x-axis label.
    """
    for band, color in COLORS.items():
        band_lc = lc.query(f"band == '{band}'")
        ax.errorbar(
            band_lc[x_name],
            band_lc[mag_col],
            band_lc[magerr_col],
            fmt="o",
            label=band,
            color=color,
            alpha=1,
            markersize=5,
            capsize=3,
            elinewidth=1,
        )
    ax.set_xlabel(x_label)
    ax.set_ylabel("Magnitude (AB)")
    ax.invert_yaxis()
    ax.legend(loc="lower right", fontsize=12)

fig, ax = plt.subplots()
_plot_rubin_lc(ax, lc, "psfFlux", "psfFluxErr", "mjd", "mjd")
# _plot_rubin_lc labels the y-axis "Magnitude (AB)" and inverts it, but the
# columns plotted here are fluxes: relabel the axis and flip it back so that
# larger fluxes are drawn higher up.
ax.set_ylabel("psfFlux")
ax.invert_yaxis()

In [None]:
# Quick look: psfFlux against mjd for the selected light curve, without error bars.
plt.scatter(lc["mjd"], lc["psfFlux"])