# Post-processing

We will modify each parquet file in place. This seems like a good idea today, but it may prove to be a poor choice later, because the original files are overwritten.

If we use LSDB instead, we will need additional disk storage to hold both the fresh and the post-processed data.

Elements of post-processing to be accomplished in this notebook:

* brightness in magnitudes (i.e. convert ALL flux columns to magnitudes)
* join to visit table, where necessary
* de-duplicate object and source tables (report shrinkage)
* reduce number of columns of object tables?
* order source tables by object id, then timestamp

In [24]:
from pathlib import Path
import astropy.units as u
from tqdm import tqdm
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# HATS/LSDB
import lsdb
import hats
from hats.io import file_io, paths

In [3]:
# Root output directory for this processing run; the HATS catalogs live in
# its "hats" subdirectory.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/dm_48556")
hats_dir = base_output_dir.joinpath("hats")

In [13]:
def append_mag_and_magerr(table, flux_col_prefixes):
    """Append AB magnitudes (and magnitude errors) derived from flux columns.

    For every prefix ``p`` in ``flux_col_prefixes`` the column ``{p}Flux`` is
    converted from nJy to an AB magnitude and stored as ``{p}Mag``. When a
    ``{p}FluxErr`` column is also present, ``{p}MagErr`` is computed as half
    the magnitude difference between ``flux - fluxErr`` and ``flux + fluxErr``.

    Rows whose flux (or flux minus its error) is not positive have no defined
    magnitude and end up as NaN/inf. The numpy floating-point warnings raised
    by those rows are suppressed; the resulting values are unchanged.

    The function is safe to call on a table that already contains the
    magnitude columns (for example a parquet file that was post-processed in
    place and then read back): the stale columns are replaced rather than
    duplicated.

    Parameters
    ----------
    table : pandas.DataFrame
        Input table holding the ``{prefix}Flux`` columns. It is not modified.
    flux_col_prefixes : iterable of str
        Column-name prefixes, e.g. ``["psf", "science"]``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``table`` followed by the float64
        ``{prefix}Mag`` / ``{prefix}MagErr`` columns, sharing ``table``'s index.

    Raises
    ------
    KeyError
        If ``{prefix}Flux`` is missing from ``table`` for a requested prefix.
    """
    mag_cols = {}

    # log10 of zero or negative flux emits "divide by zero" / "invalid value"
    # RuntimeWarnings; the NaN/inf results are the intended outcome here.
    with np.errstate(divide="ignore", invalid="ignore"):
        for prefix in flux_col_prefixes:
            # Magnitude
            flux = table[f"{prefix}Flux"]
            mag_cols[f"{prefix}Mag"] = u.nJy.to(u.ABmag, flux)

            # Magnitude error, if flux error exists
            flux_err_col = f"{prefix}FluxErr"
            if flux_err_col in table.columns:
                flux_err = table[flux_err_col]
                # A larger flux gives a smaller magnitude, so upper_mag is below
                # lower_mag and the sign flip makes the error positive.
                upper_mag = u.nJy.to(u.ABmag, flux + flux_err)
                lower_mag = u.nJy.to(u.ABmag, flux - flux_err)
                mag_cols[f"{prefix}MagErr"] = -(upper_mag - lower_mag) / 2

    mag_table = pd.DataFrame(mag_cols, dtype=np.float64, index=table.index)

    # Drop magnitude columns left over from a previous run so the concat below
    # never yields duplicate column labels.
    stale_cols = [col for col in mag_table.columns if col in table.columns]
    return pd.concat([table.drop(columns=stale_cols), mag_table], axis=1)

## diaObject

This one is the easiest because it doesn't require ANY post-processing!!!

## diaSource

In [34]:
# Open the diaSource HATS catalog stored under hats_dir.
cat = hats.read_hats(hats_dir.joinpath("diaSource"))
# Inspect the available columns with: cat.schema

In [21]:
# Resolve the parquet file of the first HEALPix pixel listed by the catalog and
# load it into a pandas DataFrame.
# NOTE(review): only this single pixel file is handled here; the other pixels
# of the catalog are not touched by this cell.
file_path = paths.pixel_catalog_file(cat.catalog_path, cat.get_healpix_pixels()[0])
table = pd.read_parquet(file_path)

In [33]:
# Number of entries in the diaSourceId column (duplicates included).
table["diaSourceId"].size

304641

In [36]:
# Sorted unique diaSourceId values, the position of each value's first
# occurrence, and how many times each value occurs. Any count above 1 marks a
# duplicated ID.
values, index, counts = np.unique(
    table["diaSourceId"],
    return_counts=True,
    return_index=True,
)


In [39]:
# diaSourceId with the highest occurrence count. The position is computed from
# `counts` rather than hard-coded (it was 193 for the data inspected earlier),
# so the cell keeps working when the input table changes.
values[np.argmax(counts)]


191403211800707126

In [42]:
# Write every row carrying the duplicated diaSourceId to a CSV file for
# inspection outside the notebook.
(
    table
    .query("diaSourceId == 191403211800707126")
    .to_csv("duplicate_diaSource.csv")
)

In [22]:
# Keep only the flux families whose "<name>Flux" column exists in this table.
flux_col_prefixes = [
    flux_name
    for flux_name in ("psf", "science")
    if f"{flux_name}Flux" in table.columns
]
print(flux_col_prefixes)

# Skip the conversion entirely when none of the flux columns are present.
if flux_col_prefixes:
    table = append_mag_and_magerr(table, flux_col_prefixes)


  return dex.to(self._function_unit, np.log10(x))


In [25]:
# Write the post-processed table back to file_path.path. The pandas index is
# not stored, and replace_schema_metadata() is called without arguments before
# writing.
pq.write_table(
    pa.Table.from_pandas(table, preserve_index=False).replace_schema_metadata(),
    file_path.path,
)