# Source clustering

This notebook clusters sources from nightly validation to generate object light curves.

In [1]:
import lsdb
import matplotlib.pyplot as plt
import pandas as pd
import tempfile

import lsst.daf.butler as dafButler
from lsst.summit.utils import ConsDbClient

from dask.distributed import Client
from lsdb.core.search import ConeSearch
from pathlib import Path
from tqdm import tqdm

import astropy.units as u


# Let pandas display up to 100 rows before truncating a frame.
pd.set_option('display.max_rows', 100)

In [2]:
#%pip install git+https://github.com/astronomy-commons/lsdb.git@sean/nested-crossmatch
# NOTE(review): presumably the `crossmatch_nested` call used later needs the lsdb branch above — confirm.
# Directory under which the intermediate "object" and "source" HATS catalogs are written and read back.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/tmp")

In [3]:
def flux_to_mag(sciFlux):
    """Convert a flux expressed in nJy into an AB magnitude.

    Parameters
    ----------
    sciFlux : `float`
        Science flux, interpreted as nJy.

    Returns
    -------
    mag : `float`
        AB magnitude corresponding to ``sciFlux``.
    """
    return u.nJy.to(u.ABmag, sciFlux)


### Query for all recent visits

First let's get all the visits from April 18 to April 23, 2025 (the inclusive `day_obs` range set in the next cell).

In [4]:
# First and last day_obs of the visit query; both ends are included by the `>=` / `<=` filter below.
start_day_obs, end_day_obs = 20250418, 20250423

In [5]:
# Read the ConsDB access token. Surrounding whitespace is stripped so that a
# trailing newline in the token file does not end up inside the URL below.
with open("/sdf/home/n/ncaplar/token-file", "r") as f:
    token = f.read().strip()
client = ConsDbClient(f"https://user:{token}@usdf-rsp.slac.stanford.edu/consdb")

# All BLOCK-365 visits whose day_obs lies in the inclusive range chosen above.
visits_query = (
    "SELECT * FROM cdb_lsstcam.visit1 "
    f"WHERE day_obs >= {start_day_obs} AND day_obs <= {end_day_obs} "
    "and science_program = 'BLOCK-365'"
)
visits = client.query(visits_query).to_pandas()

In [6]:
# Row count of the visit table returned by the query
num_visits = visits.shape[0]
print(f"Found {num_visits} visits from {start_day_obs} to {end_day_obs}")

### Initialize the Butler

In [7]:
butler = dafButler.Butler("embargo")  # or your Butler repo path

# Every nightly-validation collection whose name starts with the 2025 prefix
all_collections = list(butler.registry.queryCollections("LSSTCam/runs/nightlyValidation/2025*"))

# Keep the collections whose whitespace-stripped name has "7" as last character
filtered_collections = list(
    filter(lambda name: str(name).strip()[-1] == "7", all_collections)
)

# Optional: list the selection in sorted order
for coll in sorted(filtered_collections):
    print(coll)

In [8]:
# Butler repository and instrument names reused by the cells below.
repo = "embargo"
instrument = "LSSTCam"
# NOTE(review): `collection_all` is not referenced by any other cell shown in this notebook.
collection_all = "LSSTCam/runs/nightlyValidation"
# Re-create the Butler, this time restricted to the collections selected in the previous cell.
butler = dafButler.Butler(repo, collections=filtered_collections, instrument=instrument)

### Create object table

In [9]:
# Pick the visit with the smallest dimm_seeing value.
# Note that `visits` itself is replaced: from here on it is ordered by
# dimm_seeing and no longer contains the visits without a dimm_seeing value.
visits = visits.sort_values(by="dimm_seeing")
has_dimm_seeing = visits["dimm_seeing"].notna()
visits = visits[has_dimm_seeing]
visit_best_dimm_seeing = visits.iloc[0]
visit_best_dimm_seeing

In [10]:
# Collection of the observing day on which the best-seeing visit was taken:
# the first registry match for that day_obs whose name ends in "7".
day_obs = visit_best_dimm_seeing["day_obs"]
matching_collections = butler.registry.queryCollections(f"LSSTCam/runs/nightlyValidation/{day_obs}*7")
day_collection = matching_collections[0]
day_collection

In [11]:
# Butler limited to the collection of the best-seeing day
butler = dafButler.Butler(repo, collections=day_collection, instrument=instrument)

# The stars of the best-seeing visit become the object table
best_visit_id = visit_best_dimm_seeing["visit_id"]
object_df = butler.get('single_visit_star', visit=best_visit_id, instrument=instrument).to_pandas()
object_df

Let's transform this object dataframe into a HATS catalog:

In [12]:
object_cat = lsdb.from_dataframe(object_df)
# The from_dataframe output has a bug when used directly, so as a workaround
# the catalog is written to transient storage here and loaded back afterwards.
object_hats_dir = base_output_dir / "object"
object_cat.to_hats(object_hats_dir, overwrite=True)


In [13]:
# Load the object catalog back from the HATS directory written above
object_cat = lsdb.read_hats(base_output_dir.joinpath("object"))
object_cat

In [14]:
# Plot the pixel layout of the object catalog (lsdb `plot_pixels`).
object_cat.plot_pixels()

### Query for all sources

Let's query the Butler to get the sources for all the visits.

In [15]:
def _get_butler_for_day(day_obs):
    """Create a Butler restricted to the nightly-validation collection of one day.

    Parameters
    ----------
    day_obs : `int`
        Observing day used to build the collection search pattern.

    Returns
    -------
    day_butler : `lsst.daf.butler.Butler`
        Butler whose only collection is the first registry match for
        ``LSSTCam/runs/nightlyValidation/{day_obs}*7``.

    Notes
    -----
    The ``[0]`` lookup fails when the registry returns no match; callers are
    expected to handle that error.
    """
    day_collection = butler.registry.queryCollections(f"LSSTCam/runs/nightlyValidation/{day_obs}*7")[0]
    # Use the collection that was just looked up (it was previously computed
    # and then ignored in favour of the full `filtered_collections` list).
    return dafButler.Butler(repo, collections=day_collection, instrument=instrument)

def _filter_source_df(df):
    # Filter non-primary detections
    df = df[df['detect_isPrimary']]
    # Those with invalid coord_ra
    df = df.dropna(subset=["coord_ra"])
    # Or the fake detections 
    df = df[df['sky_source'] == False]
    # Cut only to "i" band
    df = df[df["band"] == "i"]
    # Reduce number of columns (for efficiency)
    return df[["ra","dec","sourceId","band","psfFlux","psfFluxErr"]]

def get_sources_for_day(day_visits):
    """Load the filtered sources of every visit of one observing day.

    Parameters
    ----------
    day_visits : `pandas.DataFrame`
        Visits sharing one ``day_obs``; the columns ``day_obs``, ``visit_id``
        and ``exp_midpt_mjd`` are read.

    Returns
    -------
    sources : `pandas.DataFrame` or `None`
        Concatenation of the filtered per-visit source tables, each extended
        with ``visit_id`` and ``mjd`` columns. `None` when the day could not
        be set up or when no visit yielded any source.
    """
    try:
        # Initialize butler for current day
        day_obs = day_visits["day_obs"].iloc[0]
        day_butler = _get_butler_for_day(day_obs)
        ids, mjds = day_visits["visit_id"], day_visits["exp_midpt_mjd"]
    except Exception as e:
        # Best effort: report the problem and let the caller skip this day
        print(f"Skipping day due to error: {e}")
        return None

    day_dfs = []
    # Get the sources for each visit
    for visit_id, visit_mjd in tqdm(zip(ids, mjds), total=len(day_visits)):
        try:
            df = day_butler.get(
                'single_visit_star', visit=visit_id, instrument=instrument
            ).to_pandas()
            df = _filter_source_df(df)
        except Exception as e:
            print(f"Skipping visit {visit_id} due to error: {e}")
            continue
        if not df.empty:
            # assign() builds a new frame, so nothing is written into a slice
            day_dfs.append(df.assign(visit_id=visit_id, mjd=visit_mjd))

    print(f"Loaded {len(day_dfs)} dataframes for day_obs {day_obs}")
    if not day_dfs:
        # pd.concat raises on an empty list; signal "nothing loaded" instead
        return None
    return pd.concat(day_dfs, ignore_index=True)

In [None]:
# Load the sources day by day and keep only the non-empty results
per_day_frames = (
    get_sources_for_day(group) for _, group in visits.groupby("day_obs")
)
all_dfs = [
    frame for frame in per_day_frames if frame is not None and not frame.empty
]

# Concatenate when something was loaded; otherwise fall back to an empty frame
sources_df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()

sources_df

In [None]:
# Build the lsdb catalog of sources
source_cat = lsdb.from_dataframe(sources_df)
# The from_dataframe output has a bug when used directly, so as a workaround
# the catalog is written to transient storage; the next cell loads it back.
source_hats_dir = base_output_dir / "source"
source_cat.to_hats(source_hats_dir, overwrite=True)

In [15]:
# Load the source catalog from its HATS directory
source_cat = lsdb.read_hats(base_output_dir.joinpath("source"))
source_cat

Let's remove the few sources that are distant from the main cluster:

In [16]:
# Plot the pixel layout of the source catalog.
source_cat.plot_pixels()
# Cone centered on (ra, dec) = (218, -15) with a radius of 12*3600 arcsec, i.e. 12 degrees.
cone = ConeSearch(ra=218, dec=-15, radius_arcsec=12*3600)
# Draw the cone with a red edge and a fully transparent face color ("#00000000").
cone.plot(fc="#00000000", ec="red")

In [17]:
# Restrict the source catalog to the cone defined in the previous cell.
source_cat = source_cat.cone_search(ra=cone.ra, dec=cone.dec, radius_arcsec=cone.radius_arcsec)
source_cat

### Construct light curves

In [18]:
# Temporary directory handed to Dask as `local_directory`; it is removed by `tmp_path.cleanup()` in a later cell.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name
# Dask client configured with 16 workers of one thread each.
# NOTE: this rebinds `client`, which until here referred to the ConsDbClient created earlier.
client = Client(n_workers=16, threads_per_worker=1, local_directory=tmp_dir)
client

In [19]:
# Match the sources to the objects within 0.2 arcsec; the matches of each
# object are stored under the nested column name "lc"
lc_cat = object_cat.crossmatch_nested(
    source_cat,
    radius_arcsec=0.2,
    n_neighbors=num_visits,
    nested_column_name="lc",
)
lc_cat

In [20]:
# Took roughly 30sec
def _count_observations(mjd):
    """Return the number of entries of one light curve as ``{"nobs": ...}``."""
    return {"nobs": mjd.size}


# Count the observations per object, keep the objects with more than 10 of
# them and materialize the result
object_lc = (
    lc_cat.reduce(_count_observations, "lc.mjd", meta={"nobs": int}, append_columns=True)
    .query("nobs > 10")
    .compute()
)
object_lc.head()

In [21]:
# Shut down the Dask client, then delete its temporary working directory.
client.close()
tmp_path.cleanup()

### Plot light curves

In [22]:
# Take the light curve of the object at position 10 and order it by mjd
example_object = object_lc.iloc[10]
lc = example_object["lc"].sort_values(by="mjd")
lc

In [23]:
# Plot color (hex string) per band name; `plot_rubin_lc` iterates over this mapping.
COLORS = {
    "u": "#56b4e9",
    "g": "#009e73",
    "r": "#f0e442",
    "i": "#cc79a7",
    "z": "#d55e00",
    "y": "#0072b2",
}

def plot_rubin_lc(lc, flux_col, fluxerr_col, invert_yaxis=False):
    """Plot a light curve with error bars, one color per band.

    Parameters
    ----------
    lc : `pandas.DataFrame`
        Light curve with ``mjd`` and ``band`` columns plus the two columns
        named by ``flux_col`` and ``fluxerr_col``.
    flux_col : `str`
        Name of the column plotted on the y axis.
    fluxerr_col : `str`
        Name of the column used for the error bars.
    invert_yaxis : `bool`, optional
        Invert the y axis. Off by default because the axis is labelled as a
        flux, for which larger values are brighter; enable it when the
        plotted column holds magnitudes.
    """
    _, ax = plt.subplots()
    has_data = False
    for band, color in COLORS.items():
        band_lc = lc.query(f"band == '{band}'")
        if band_lc.empty:
            # Skip bands without points so the legend lists plotted bands only
            continue
        has_data = True
        ax.errorbar(
            band_lc["mjd"],
            band_lc[flux_col],
            band_lc[fluxerr_col],
            fmt="o",
            label=band,
            color=color,
            alpha=1,
            markersize=5,
            capsize=3,
            elinewidth=1,
        )
    ax.set_xlabel("MJD")
    ax.set_ylabel("Flux")
    if invert_yaxis:
        ax.invert_yaxis()
    if has_data:
        # A legend without any labelled artist only triggers a warning
        ax.legend(loc="lower right", fontsize=12)

# Plot the PSF flux of the light curve selected in the previous cell.
plot_rubin_lc(lc, "psfFlux", "psfFluxErr")

# Find RR Lyrae stars from Gaia

In [24]:
import pandas as pd
import re
from io import StringIO

file_path = "/sdf/home/n/ncaplar/asu-txt.txt"

# Read lines excluding comments and blank lines
with open(file_path, "r") as f:
    lines = [line.rstrip("\n") for line in f if not line.startswith("#") and line.strip()]

# Locate the table header: the first line containing both "RAJ2000" and "("
header_index = next(
    (i for i, line in enumerate(lines) if "RAJ2000" in line and "(" in line),
    None,
)
if header_index is None:
    # Fail with a clear message instead of a NameError further down
    raise ValueError(f"No header line containing 'RAJ2000' found in {file_path}")

# The data rows start three lines after the matched header line
data_text = "\n".join(lines[header_index + 3:])

# The column names are set by hand rather than parsed from the header.
# NOTE(review): 'lass' looks like a truncated 'Class' — confirm against the file header.
column_names = ['RA', 'DEC', 'Harm', 'Source', 'SolId', 'PF', 'P10', 'gavg', 'agavg', 'agavg2', 'M/H', 'R21G', 'R31G', 'i21G', 'phi31G', 'FundFreq1', 'FundFreq2', 'lass']

# Read as fixed-width with the column names above
df_RR_Lyrae = pd.read_fwf(StringIO(data_text), header=None, names=column_names)

print("Column names:", df_RR_Lyrae.columns.tolist())


In [25]:
# NOTE(review): `RR_Lyrae_nightly` is not used by any later cell shown here; the crossmatch whose result
# is used is the one stored in `RR_Lyrae_nightly_computed` below. This call also runs before the
# `RA`/`DEC` columns are added to `object_lc` — confirm whether this cell is still needed.
RR_Lyrae_nightly = lsdb.crossmatch(df_RR_Lyrae[['RA','DEC']],object_lc)

In [26]:
# Duplicate the object coordinates under the names `RA`/`DEC`, the coordinate column names of `df_RR_Lyrae`.
object_lc['RA'] = object_lc['coord_ra']
object_lc['DEC'] = object_lc['coord_dec']

In [27]:
# Show the first rows to check the `RA`/`DEC` columns added above.
object_lc.head(5)

In [28]:
# Crossmatch the RR Lyrae table (positions and period columns) against the
# objects (positions and sourceId) and materialize the result
rr_lyrae_columns = ['RA', 'DEC', 'PF', 'P10']
object_columns = ['RA', 'DEC', 'sourceId']
RR_Lyrae_nightly_computed = lsdb.crossmatch(
    df_RR_Lyrae[rr_lyrae_columns],
    object_lc[object_columns],
).compute()

In [29]:
# sourceId values of the matched objects; the objects were the second crossmatch input,
# and their columns carry the `_right` suffix in the result.
id_RRLyrae_in_nightly = RR_Lyrae_nightly_computed['sourceId_right'].values

In [30]:
# Keep the objects (with their light curves) that matched an RR Lyrae
is_rr_lyrae = object_lc['sourceId'].isin(id_RRLyrae_in_nightly)
RRLyrae_lc = object_lc[is_rr_lyrae]

In [31]:
# RR_Lyrae_nightly_computed

In [32]:
# sourceId of the matched object at position 10.
RRLyrae_lc.iloc[10]['sourceId']

In [33]:
# Period columns of the crossmatch rows that belong to the object at position 10
example_source_id = RRLyrae_lc.iloc[10]['sourceId']
matches_example = RR_Lyrae_nightly_computed['sourceId_right'] == example_source_id
period_par_for_single_lc = RR_Lyrae_nightly_computed[matches_example][['PF_left','P10_left']]

In [34]:
def create_mag_errors(sciFlux, sciFluxErr):
    """Convert a flux in nJy to an AB magnitude and estimate the magnitude error.

    The error is half the magnitude difference between the flux lowered and
    the flux raised by its uncertainty.

    Parameters
    ----------
    sciFlux : `float`
        Science flux, interpreted as nJy.
    sciFluxErr : `float`
        Uncertainty of the science flux, in the same unit.

    Returns
    -------
    mag, magErr : `float`, `float`
        AB magnitude and magnitude error.
    """
    to_abmag = u.nJy.to

    mag = to_abmag(u.ABmag, sciFlux)
    # Magnitudes at flux + error and at flux - error
    mag_at_flux_plus_err = to_abmag(u.ABmag, sciFlux + sciFluxErr)
    mag_at_flux_minus_err = to_abmag(u.ABmag, sciFlux - sciFluxErr)
    magErr = (mag_at_flux_minus_err - mag_at_flux_plus_err) / 2

    return mag, magErr

In [35]:
# Magnitude errors of the current `lc`; index [1] selects `magErr` from the (mag, magErr) pair.
create_mag_errors(lc['psfFlux'].values, lc['psfFluxErr'].values)[1]

In [36]:
# Display the computed crossmatch between the RR Lyrae table and the objects.
RR_Lyrae_nightly_computed

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Define sinusoid model: phase in [0, 1]
def sinusoid(phase, A, phi0, mean_mag):
    """Return ``mean_mag + A * sin(2*pi*phase + phi0)``."""
    return mean_mag + A * np.sin(2 * np.pi * phase + phi0)

# Fit and plot every matched RR Lyrae light curve.
# `row` and `lc` are deliberately left as notebook-level names: they keep the
# values of the last processed object after the loop.
for i in range(len(RRLyrae_lc)):
    row = RRLyrae_lc.iloc[i]
    lc = row.lc
    source_id = row['sourceId']

    # Retrieve period: PF when available, otherwise P10
    period_row = RR_Lyrae_nightly_computed[RR_Lyrae_nightly_computed['sourceId_right'] == source_id]
    if period_row.empty:
        continue

    period = period_row['PF_left'].values[0]
    if pd.isna(period) and 'P10_left' in period_row.columns:
        period = period_row['P10_left'].values[0]
    if pd.isna(period):
        continue

    # Convert flux to magnitude (value and error in one call)
    mag, mag_err = create_mag_errors(lc['psfFlux'].values, lc['psfFluxErr'].values)

    # Time and phase
    time = lc['mjd'].values
    phase = (time % period) / period

    # RA/Dec for title
    ra = row['coord_ra']
    dec = row['coord_dec']

    # Fit sinusoid on phase-folded data. Points whose magnitude error is not
    # finite or not positive are excluded as well, because the errors are
    # passed to the fit as `sigma`.
    valid = (
        np.isfinite(phase)
        & np.isfinite(mag)
        & np.isfinite(mag_err)
        & (mag_err > 0)
    )
    try:
        popt, _ = curve_fit(sinusoid, phase[valid], mag[valid],
                            p0=[0.5, 0.0, np.nanmean(mag)],
                            sigma=mag_err[valid],
                            absolute_sigma=True)
        A_fit, phi0_fit, mean_mag_fit = popt

        # For phase-folded panel
        phase_model = np.linspace(0, 1, 500)
        mag_model_phase = sinusoid(phase_model, A_fit, phi0_fit, mean_mag_fit)

        # For time-domain panel: evaluate the folded model over the time range
        t_model = np.linspace(np.min(time), np.max(time), 1000)
        phase_t_model = (t_model % period) / period
        mag_model_time = sinusoid(phase_t_model, A_fit, phi0_fit, mean_mag_fit)
    except Exception as e:
        print(f"Fit failed for sourceId {source_id} — {e}")
        continue

    # Plotting
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

    # Time-domain plot
    axes[0].errorbar(time, mag, yerr=mag_err, fmt='o', markersize=3, alpha=0.7, capsize=3,color='black', label='Observed, i-band')
    axes[0].plot(t_model, mag_model_time, 'r--', lw=1, label='Fit')
    axes[0].invert_yaxis()
    axes[0].set_xlabel("MJD")
    axes[0].set_ylabel("Magnitude")
    axes[0].legend(loc='upper right')
    # Add space for legend (ymin/ymax are read after the inversion above)
    ymin, ymax = axes[0].get_ylim()
    axes[0].set_ylim(ymin - 0.2 * (ymax - ymin), ymax + 0.2 * (ymax - ymin))

    # Phase-folded plot. The y axis is shared, so it is not inverted a second
    # time here: a second invert_yaxis() would undo the inversion. The explicit
    # limits below keep both panels inverted and padded identically.
    axes[1].errorbar(phase, mag, yerr=mag_err, fmt='o', markersize=3, alpha=0.7,capsize=3,color='black', label='Observed, i-band')
    axes[1].plot(phase_model, mag_model_phase, 'r--', lw=1, label='Fit')
    axes[1].set_xlabel("Phase")
    axes[1].set_xlim(0, 1)
    axes[1].legend(loc='upper right')
    axes[1].set_ylim(ymin - 0.2 * (ymax - ymin), ymax + 0.2 * (ymax - ymin))

    fig.suptitle(f"RA={ra:.5f}, Dec={dec:.5f} / Period={period:.4f} days")
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# `row` still holds the value assigned in the last iteration of the loop above.
row

In [None]:
# Plot the PSF-flux light curve of the matched object at position 10.
plot_rubin_lc(RRLyrae_lc.iloc[10].lc, "psfFlux", "psfFluxErr")

In [None]:
        # NOTE(review): this cell does not match the rest of the notebook — confirm whether it is still needed:
        #   * the light curves built above carry an `mjd` column; no cell shown here creates `midpointMjdTai`;
        #   * no cell shown here defines `Period_nice_obj` (periods are read from `PF_left` / `P10_left`).
        # Compute phase
        # Phase = (time minus the time of the maximum psfFlux) modulo the period, divided by the period.
        lc = lc.assign(phase=(lc.midpointMjdTai - lc.midpointMjdTai.loc[lc.psfFlux.idxmax()]) 
                       % row.Period_nice_obj / row.Period_nice_obj)