In [34]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from nested_pandas.utils import count_nested

import os
from numba import njit
import lsdb
import hats

from upath import UPath

import matplotlib.pyplot as plt

In [35]:
def calcSF_fixed_bins(tclip, fclip, bin_edges):
    """
    Compute the structure function (SF) of one light curve in fixed time-lag bins.

    Every unordered pair of observations ``(i, j)`` with ``i < j`` contributes
    a time lag ``|t_i - t_j|`` and a squared difference ``(f_i - f_j)**2``.
    The pairs are then grouped by ``bin_edges`` in `_compute_binned_sf`.

    Parameters
    ----------
    tclip : array-like, 1-D
        Observation times.
    fclip : array-like, 1-D
        Measured values, one per entry of ``tclip``.
    bin_edges : array-like, 1-D
        Time-lag bin edges, forwarded unchanged to `_compute_binned_sf`.

    Returns
    -------
    dict
        The dictionary returned by `_compute_binned_sf`.
    """
    # Accept anything array-like (e.g. a pandas Series) by working on ndarrays.
    times = np.asarray(tclip)
    values = np.asarray(fclip)

    # Index the upper triangle (k=1 skips the diagonal) directly instead of
    # materialising two full N x N matrices and discarding more than half of each.
    i_upper, j_upper = np.triu_indices(len(times), k=1)
    all_dt = np.abs(times[i_upper] - times[j_upper])
    all_df2 = (values[i_upper] - values[j_upper]) ** 2

    return _compute_binned_sf(all_dt, all_df2, bin_edges)


def _compute_binned_sf(all_dt, all_df2, bin_edges):
    n_bins = len(bin_edges) - 1
    tau_median = np.empty(n_bins)
    SF = np.empty(n_bins)
    N = np.empty(n_bins, dtype=np.int64)

    for i in range(n_bins):
        lo = bin_edges[i]
        hi = bin_edges[i + 1]

        # Count how many points are in the bin first
        count = 0
        for j in range(len(all_dt)):
            if lo <= all_dt[j] < hi:
                count += 1

        N[i] = count

        if count > 0:
            dt_bin = np.empty(count)
            df2_bin = np.empty(count)

            idx = 0
            for j in range(len(all_dt)):
                if lo <= all_dt[j] < hi:
                    dt_bin[idx] = all_dt[j]
                    df2_bin[idx] = all_df2[j]
                    idx += 1

            tau_median[i] = np.median(dt_bin)
            SF[i] = np.sqrt(np.mean(df2_bin)) / np.sqrt(2)
        else:
            tau_median[i] = np.nan
            SF[i] = np.nan

    return {'lc.tau_median':tau_median, 'lc.SF':SF, 'lc.N':N, 'lc.time_step':np.arange(len(bin_edges) - 1)}

In [36]:
# Shell out to the AWS CLI to list what is stored under hats/w_2025_18/ in the rubin-lincc-hats bucket.
!aws s3 ls s3://rubin-lincc-hats/hats/w_2025_18/ 


In [37]:
# Same listing for hats/w_2025_19/, the prefix the catalogs below are read from.
!aws s3 ls s3://rubin-lincc-hats/hats/w_2025_19/ 

In [38]:
# List entries under hats/w_2025_19/ whose key starts with "dia_object".
# NOTE(review): there is no trailing slash, so this presumably matches by prefix
# (e.g. dia_object_collection as well) - confirm against the AWS CLI output.
!aws s3 ls s3://rubin-lincc-hats/hats/w_2025_19/dia_object 

In [39]:

# If AWS_ENDPOINT_URL is not set, pass the endpoint to UPath explicitly, e.g.
# endpoint_url=os.environ["LSST_RESOURCES_S3_PROFILE_embargo"]
# Open the dia_object_collection catalog with lsdb, restricted to the listed columns
# (`.compute()` is only called on it in a later cell).
dia_object_lc = lsdb.read_hats(UPath("s3://rubin-lincc-hats/hats/w_2025_19/dia_object_collection"), 
                               columns = ["diaObjectId", "ra", "dec", 'tract', "diaObjectForcedSource"])
# Open the same collection with `hats` and keep its `main_catalog`; it is used further down for `plot_moc()`.
dia_object_lc_hats = hats.read_hats(UPath("s3://rubin-lincc-hats/hats/w_2025_19/dia_object_collection")).main_catalog

dia_object_lc

In [40]:
from lsdb.core.search import ConeSearch
# Re-open the same collection, this time restricted by a cone search centred on
# (ra=182, dec=7.0) with a radius of 3 * 3600 arcsec, i.e. 3 degrees.
# This rebinds `dia_object_lc`, replacing the unrestricted catalog opened in the previous cell.
dia_object_lc = lsdb.read_hats(UPath("s3://rubin-lincc-hats/hats/w_2025_19/dia_object_collection"), 
                               columns = ["diaObjectId", "ra", "dec", 'tract', "diaObjectForcedSource"],
                                search_filter = ConeSearch(ra=182, dec=7.0, radius_arcsec=3 * 3600)
)
dia_object_lc

In [41]:
# Plot the sky coverage (MOC) of the collection's main catalog opened with `hats`.
dia_object_lc_hats.plot_moc()

In [42]:
# Take the first 100 rows for a quick look at the nested light-curve column.
test = dia_object_lc.head(100)
# NOTE(review): only the last expression of a cell is rendered, so the value of this
# line (the nested light curve of the second row) is evaluated but not displayed.
test.iloc[1].diaObjectForcedSource
# Names of the sub-columns stored inside the nested `diaObjectForcedSource` column.
test.diaObjectForcedSource.nest.fields

In [43]:
# Materialise the cone-restricted catalog in memory; every later step works on this computed frame.
dia_object_lc_computed = dia_object_lc.compute()

In [44]:
# Number of rows (objects) in the computed frame.
len(dia_object_lc_computed)

In [45]:
# Identify the flag sub-columns of the nested light-curve column
flag_cols = [col for col in dia_object_lc_computed.diaObjectForcedSource.nest.fields if 'flag' in col.lower()]

# Collect every condition in one list and join once, e.g.
# "diaObjectForcedSource.flag1 == False & ... & diaObjectForcedSource.band == "r"".
# Joining the full list keeps the expression valid even when no flag column is found
# (appending '& band == "r"' to an empty string would produce an invalid query).
conditions = [f"diaObjectForcedSource.{col} == False" for col in flag_cols]
conditions.append('diaObjectForcedSource.band == "r"')
query_str = " & ".join(conditions)

# Derive the filtered frame in one chain from `dia_object_lc_computed`:
#   1. keep only unflagged r-band sources (query on the nested column),
#   2. drop sources without a psfMag,
#   3. drop objects whose nested light curve is missing,
#   4. add the per-object source count (read back below as 'n_diaObjectForcedSource').
with_counts = (
    dia_object_lc_computed
    .query(query_str)
    .dropna(subset='diaObjectForcedSource.psfMag')
    .dropna(subset='diaObjectForcedSource')
    .pipe(count_nested, nested='diaObjectForcedSource')
)

# Keep only objects with more than 5 remaining sources.
dia_object_lc_computed_filtered = with_counts[with_counts['n_diaObjectForcedSource'] > 5]

In [46]:
# Time-lag bin edges -0.5, 0.5, ..., 9.5: ten bins of width 1 centred on the integers 0..9.
bin_edges= np.arange(-0.5, 10, 1.0)
bin_edges

In [47]:
# Run calcSF_fixed_bins once per object on its (midpointMjdTai, psfMag) light curve.
# The 'lc.'-prefixed keys it returns are read back later as the nested column 'lc'.
result = dia_object_lc_computed_filtered.reduce(calcSF_fixed_bins, "diaObjectForcedSource.midpointMjdTai", "diaObjectForcedSource.psfMag", bin_edges=bin_edges)
# Attach the per-object SF results to the filtered frame (index-aligned left join).
# NOTE(review): this rebinds the frame it reads from, so re-running the cell presumably
# fails or duplicates columns because 'lc' is already present - confirm before re-executing.
dia_object_lc_computed_filtered = dia_object_lc_computed_filtered.join(result, how='left')

In [None]:
# Median psfMag per object; the reduce output column is labelled 0 and renamed to 'median_psfMag'.
result = dia_object_lc_computed_filtered.reduce(np.median, 'diaObjectForcedSource.psfMag').rename(columns={0:'median_psfMag'})
# Attach it to the filtered frame; 'median_psfMag' drives the magnitude binning further down.
# NOTE(review): same re-run caveat as the previous join - the frame is rebound in place.
dia_object_lc_computed_filtered = dia_object_lc_computed_filtered.join(result, how='left')

In [56]:
# Spot-check the first six objects, now carrying the joined SF and median-magnitude columns.
dia_object_lc_computed_filtered.head(6)

In [58]:
# Persist the filtered frame (including the joined columns) to a parquet file in the working directory.
dia_object_lc_computed_filtered.to_parquet('dia_object_lc_computed_filtered.parquet')

In [50]:
# Preview the flattened per-lag SF table for a single magnitude bin.
# `single_mag` is otherwise only assigned further down, inside the magnitude-bin loop,
# so it is built here explicitly to keep the notebook runnable from a fresh kernel.
preview_mag_bin = 20  # arbitrary bin for inspection; the loop below covers bins 15 to 24
single_mag = dia_object_lc_computed_filtered[
    (dia_object_lc_computed_filtered['median_psfMag'] >= preview_mag_bin) &
    (dia_object_lc_computed_filtered['median_psfMag'] < preview_mag_bin + 1)
]
single_mag_flat = single_mag['lc'].nest.to_flat()

In [68]:
# Display the flattened SF table (one row per object and lag bin).
single_mag_flat

In [69]:
# Summarise the per-object structure functions: for each 1-mag-wide bin of
# median_psfMag and each time-lag bin, record the median lag, the median SF
# and the 16th/84th percentiles of SF across objects.
results = []

# Loop over magnitude bins
for mag_bin in tqdm(range(15, 25)):  # 15 to 24 inclusive
    # Objects whose median magnitude falls in [mag_bin, mag_bin + 1)
    mag_mask = (
        (dia_object_lc_computed_filtered['median_psfMag'] >= mag_bin) &
        (dia_object_lc_computed_filtered['median_psfMag'] < mag_bin + 1)
    )
    single_mag = dia_object_lc_computed_filtered[mag_mask]

    if single_mag.empty:
        continue

    # Flatten nested structure: one row per (object, lag bin)
    single_mag_flat = single_mag['lc'].nest.to_flat()

    # Only lag bins built from more than 10 pairs are trusted.
    well_sampled = single_mag_flat[single_mag_flat['N'] > 10]

    # Group by the lag-bin indices actually present instead of assuming there
    # are exactly 10; groupby yields the groups in ascending time_step order.
    for time_step, subset in well_sampled.groupby('time_step'):
        tau_vals = subset['tau_median'].dropna()
        sf_vals = subset['SF'].dropna()

        if tau_vals.empty or sf_vals.empty:
            continue

        results.append((
            mag_bin,  # add mag_bin info
            int(time_step),
            np.median(tau_vals),
            np.median(sf_vals),
            np.quantile(sf_vals, 0.16),
            np.quantile(sf_vals, 0.84)
        ))

# Create results DataFrame
result_df = pd.DataFrame(results, columns=[
    'mag_bin', 'time_step', 'tau_median', 'SF_median', 'SF_16', 'SF_84'
])

# Drop rows with any remaining NaNs (just in case)
result_df = result_df.dropna()

result_df

In [60]:
# Column names of the summary table built in the aggregation cell.
result_df.columns

In [70]:
# Plot the median structure function against time lag, one series per magnitude
# bin, with error bars spanning the 16th to 84th percentile of SF across objects.
fig, ax = plt.subplots(figsize=(10, 6))

mag_bins = sorted(result_df['mag_bin'].unique())

# One viridis colour per magnitude bin present in the results
colors = plt.cm.viridis(np.linspace(0, 1, len(mag_bins)))

# Small horizontal offsets keep points that share a lag from overlapping.
# They are centred on the number of bins actually present, so the cloud stays
# symmetric about the true lag however many magnitude bins survived.
offset_step = 0.06
center_idx = (len(mag_bins) - 1) / 2

for idx, mag_bin in enumerate(mag_bins):
    single_bin = result_df[result_df['mag_bin'] == mag_bin]
    tau_offset = single_bin['tau_median'] + (idx - center_idx) * offset_step

    label = f"{mag_bin}–{mag_bin+1} mag"

    ax.errorbar(
        tau_offset,
        single_bin['SF_median'],
        # Asymmetric errors: distance from the median down to q16 and up to q84
        yerr=[
            single_bin['SF_median'] - single_bin['SF_16'],
            single_bin['SF_84'] - single_bin['SF_median']
        ],
        fmt='o',
        capsize=2,
        label=label,
        alpha=0.8,
        color=colors[idx]
    )

ax.set_xticks(np.arange(0, 11, 1))
ax.set_xlim(-0.5, 10)
ax.set_xlabel("Time lag τ (days)", fontsize=12)
ax.set_ylabel("Structure Function SF (mag)", fontsize=12)
ax.set_title("Structure Function by Magnitude Bin", fontsize=14)
ax.grid(True, which="both", ls="--", lw=0.5)
# Legend sits outside the axes on the right so it never covers data points
ax.legend(title="Mag Bin", fontsize=10, loc='upper left', bbox_to_anchor=(1, 1))
fig.tight_layout()
plt.show()

In [52]:
# from dask.distributed import Client

# Start with a small client
# client = Client(n_workers=24, memory_limit="3GB", threads_per_worker=1)
# client.close()

In [65]:
# Display the cone-restricted lsdb catalog object again.
dia_object_lc

In [53]:
# Compute only the partition at position 3 of the catalog's underlying Dask frame.
# NOTE(review): `_ddf` is a private attribute; lsdb appears to offer a public
# `catalog.partitions[i]` indexer for this - confirm and prefer it if available.
single_df = dia_object_lc._ddf.get_partition(3).compute()