# Create Index Files

Author: Melissa

This notebook does a few useful things:

- Create batches of files to import
   - These will ideally be close to each other (e.g. by tract)
   - These will also contain the identifying fields from the reference files (e.g. tract, patch), which are not included in the parquet files
- Get the original file sizes and many other fun data points for further debugging
- Estimate the pixel thresholds for the various table types.
   - NB: this analysis was run at the beginning of ComCam testing, and we've used those values in the subsequent notebook.

In [1]:
# Generic python packages
import os

# Third-party and path/progress helper packages
import pandas as pd
import pyarrow.parquet as pq

from pathlib import Path
from tqdm import tqdm

In [2]:
# Version label of the release to process. It must be set in the environment;
# a missing variable raises KeyError here rather than failing later.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")
# Root directory for this version. The functions below read the "paths" and
# "refs" sub-directories from it and read/write the "sizes" sub-directory.
raw_dir = Path(f"/sdf/data/rubin/shared/lsdb_commissioning/raw/{DRP_VERSION}")
# Destination for the per-dataset batch index CSVs; created now if absent.
index_dir = raw_dir / "index"
index_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def get_all_sizes(dataset_type):
    """Record parquet metadata and on-disk size for every file of a dataset type.

    Reads the list of file paths from ``raw_dir/paths/{dataset_type}.txt`` (one
    path per line) and the reference table from
    ``raw_dir/refs/{dataset_type}.csv``. Each reference row is paired, in order,
    with one path, then annotated with that file's column count, row count,
    row-group count and size. The result is written to
    ``raw_dir/sizes/{dataset_type}.csv``.

    Parameters
    ----------
    dataset_type : str
        Name of the dataset type; used as the stem of the input and output files.

    Raises
    ------
    ValueError
        If the number of paths differs from the number of reference rows.
    """
    file_pointer = raw_dir / "paths" / f"{dataset_type}.txt"

    with file_pointer.open("r", encoding="utf8") as _text_file:
        paths = [line.strip() for line in _text_file]
    # Drop blank lines (e.g. trailing newlines) so they are not treated as
    # files to open.
    paths = [path for path in paths if path]

    print(f"Found {len(paths)} files for {dataset_type}")

    ref_frame = pd.read_csv(raw_dir / "refs" / f"{dataset_type}.csv")
    # Rows and paths are matched purely by position, so a count mismatch means
    # the two inputs are out of sync; fail with an explicit message.
    if len(ref_frame) != len(paths):
        raise ValueError(
            f"{dataset_type}: {len(paths)} paths but {len(ref_frame)} reference rows"
        )
    ref_frame["path"] = paths

    num_columns = []
    num_rows = []
    num_row_groups = []
    file_size = []

    for path in tqdm(paths):
        # Only the parquet footer metadata is needed, not the table contents.
        parquet_md = pq.ParquetFile(path).metadata
        num_columns.append(parquet_md.num_columns)
        num_rows.append(parquet_md.num_rows)
        num_row_groups.append(parquet_md.num_row_groups)
        file_size.append(os.path.getsize(path))

    ref_frame["num_columns"] = num_columns
    ref_frame["num_rows"] = num_rows
    ref_frame["num_row_groups"] = num_row_groups
    ref_frame["file_size"] = file_size
    # Size in GiB, for easier reading.
    ref_frame["gbs"] = ref_frame["file_size"] / (1024 * 1024 * 1024)

    # to_csv does not create missing parent directories, so make sure the
    # output directory exists first.
    sizes_dir = raw_dir / "sizes"
    sizes_dir.mkdir(parents=True, exist_ok=True)
    ref_frame.to_csv(sizes_dir / f"{dataset_type}.csv", index=False)

In [4]:
# Dataset types handled by every step of this notebook. Each name is also used
# as the file stem for that type's paths/refs/sizes files and as the name of
# its sub-directory under index_dir.
dataset_types = [
    "dia_object",
    "dia_source",
    "dia_object_forced_source",
    "object",
    "source",
    "object_forced_source",
]

In [5]:
# Collect per-file parquet metadata and sizes for each dataset type; this
# writes one sizes CSV per type.
for set_type in dataset_types:
    get_all_sizes(set_type)

## Create batch files

In [6]:
# For each dataset type, the columns of the sizes CSV that are copied into the
# index files (write_index_files adds "path" to this list).
dataset_dims = {
    "dia_object": ["tract"],
    "dia_source": ["tract"],
    "dia_object_forced_source": ["patch", "tract"],
    "object": ["tract"],
    "source": ["band", "day_obs", "physical_filter", "visit"],
    "object_forced_source": ["patch", "tract"],
}

# For each dataset type, the columns used to group files into batches: every
# distinct combination of these values becomes one index file.
dataset_groupby = {
    "dia_object": ["tract"],
    "dia_source": ["tract"],
    "dia_object_forced_source": ["tract"],
    "object": ["tract"],
    "source": ["band", "day_obs"],
    "object_forced_source": ["tract"],
}

In [7]:
def write_index_files(dataset_type):
    """Split a dataset type's file list into batch index CSVs.

    Loads ``raw_dir/sizes/{dataset_type}.csv``, keeps the columns named in
    ``dataset_dims[dataset_type]`` plus ``path``, and groups the rows by
    ``dataset_groupby[dataset_type]``. Each group is written to
    ``index_dir/{dataset_type}/NNN.csv``, where ``NNN`` is a zero-padded
    counter starting at 000 in group iteration order.

    Parameters
    ----------
    dataset_type : str
        Name of the dataset type to batch.
    """
    out_dir = index_dir / dataset_type

    sizes = pd.read_csv(raw_dir / "sizes" / f"{dataset_type}.csv")
    keep_columns = dataset_dims[dataset_type] + ["path"]
    batches = sizes[keep_columns].groupby(dataset_groupby[dataset_type])

    out_dir.mkdir(parents=True, exist_ok=True)

    n_written = 0
    for batch_number, (_key, batch) in enumerate(batches):
        batch.to_csv(out_dir / f"{batch_number:03d}.csv", index=False)
        n_written = batch_number + 1
    print("Wrote", n_written, "index files for", dataset_type)

In [8]:
# Write the batch index CSVs for each dataset type, using the sizes CSVs
# produced earlier.
for set_type in dataset_types:
    write_index_files(set_type)

## Estimate the pixel thresholds.

Using something similar to [this old notebook](https://hats-import.readthedocs.io/en/latest/notebooks/estimate_pixel_threshold.html), but using the full dataset size and row count, we can get a pretty good idea of what good pixel thresholds are for each dataset.

In [9]:
def print_import_stats(dataset_type):
    """Print an estimated pixel-threshold range and total size for a dataset type.

    Loads ``raw_dir/sizes/{dataset_type}.csv`` and uses the overall
    rows-per-byte ratio to estimate how many rows correspond to a 300 MiB file
    (lower bound) and a 1 GiB file (upper bound). Also prints the summed
    ``gbs`` column.

    If the total file size is zero (for example, an empty sizes table), no
    threshold can be estimated; a note is printed in place of the range.

    Parameters
    ----------
    dataset_type : str
        Name of the dataset type to report on.
    """
    all_sizes = pd.read_csv(raw_dir / "sizes" / f"{dataset_type}.csv")
    sample_file_size = all_sizes["file_size"].sum()
    num_rows = all_sizes["num_rows"].sum()

    # Target sizes for a single output file, in bytes.
    ## 300MB
    ideal_file_small = 300 * 1024 * 1024
    ## 1G
    ideal_file_large = 1024 * 1024 * 1024

    print(dataset_type)
    if sample_file_size > 0:
        # rows-per-file = target bytes / total bytes * total rows
        threshold_small = ideal_file_small / sample_file_size * num_rows
        threshold_large = ideal_file_large / sample_file_size * num_rows
        print(f"  threshold between {int(threshold_small):_} and {int(threshold_large):_}")
    else:
        # Dividing by a zero total would yield NaN, which int() rejects.
        print("  threshold unavailable: total file size is zero")
    print(f'  total size_on_disk: {all_sizes["gbs"].sum():.2f} G')

In [10]:
# Report the estimated threshold range and total size for each dataset type.
for set_type in dataset_types:
    print_import_stats(set_type)