# Raw file sizes

Author: Melissa

Explore the raw file sizes of the parquet files that back butler queries.

This isn't super-useful, but I'm waiting for some slow import steps and I'm having fun.



In [1]:
# Generic python packages
import os
import pylab as plt

# LSST Science Pipelines (Stack) packages
import lsst.afw.display as afwDisplay
import pandas as pd
import pyarrow.parquet as pq

from pathlib import Path
from tqdm import tqdm

# Set a standard (width, height) figure size to use as the matplotlib default
plt.rcParams['figure.figsize'] = (8.0, 8.0)
# Make matplotlib the default backend for lsst.afw.display.
# NOTE(review): none of the cells visible here plot or use afwDisplay -- confirm this setup is still needed.
afwDisplay.setDefaultBackend('matplotlib')

In [2]:
# Root directory for this analysis. The cells below read "<name>.txt" files
# from its "paths" subdirectory and "<name>.csv" files from its "refs"
# subdirectory, and write/read "<name>.csv" files in its "sizes" subdirectory.
raw_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_04/raw")

In [3]:
def get_all_sizes(dataset_type):
    """Record parquet metadata and on-disk size for every file of a dataset type.

    Reads one file path per line from ``raw_dir/paths/<dataset_type>.txt`` and
    the table in ``raw_dir/refs/<dataset_type>.csv``, attaches the paths and
    the per-file measurements to that table as new columns, and writes the
    result to ``raw_dir/sizes/<dataset_type>.csv`` (creating the ``sizes``
    directory if it does not exist yet).

    Parameters
    ----------
    dataset_type : str
        Name used to locate the ``paths``, ``refs`` and ``sizes`` files.

    Returns
    -------
    pandas.DataFrame
        The refs table with the added columns ``paths``, ``num_columns``,
        ``num_rows``, ``num_row_groups``, ``file_size`` (bytes, from
        ``os.path.getsize``) and ``gbs`` (``file_size`` divided by 1024**3).

    Raises
    ------
    ValueError
        If the number of listed paths differs from the number of refs rows.
    """
    bytes_per_gib = 1024 * 1024 * 1024

    file_pointer = raw_dir / "paths" / f"{dataset_type}.txt"
    with file_pointer.open("r", encoding="utf8") as _text_file:
        # Strip the newline once here and drop blank lines (e.g. a trailing
        # empty line), which would otherwise be treated as a file path.
        paths = [line.strip() for line in _text_file if line.strip()]

    print(f"Found {len(paths)} files for {dataset_type}")

    ref_frame = pd.read_csv(raw_dir / "refs" / f"{dataset_type}.csv")
    # Paths are attached positionally, so the two inputs must line up.
    # NOTE(review): assumes refs rows are in the same order as the path list -- confirm.
    if len(paths) != len(ref_frame):
        raise ValueError(
            f"{dataset_type}: found {len(paths)} paths but {len(ref_frame)} refs rows"
        )
    ref_frame["paths"] = paths

    # Create the output directory before the slow scan, so a missing directory
    # cannot throw away the results at the very end.
    sizes_dir = raw_dir / "sizes"
    sizes_dir.mkdir(parents=True, exist_ok=True)

    num_columns = []
    num_rows = []
    num_row_groups = []
    file_size = []

    for path in tqdm(paths):
        # Only the footer metadata is needed, so read just that instead of
        # holding on to a ParquetFile object per file.
        parquet_md = pq.read_metadata(path)
        num_columns.append(parquet_md.num_columns)
        num_rows.append(parquet_md.num_rows)
        num_row_groups.append(parquet_md.num_row_groups)
        file_size.append(os.path.getsize(path))

    ref_frame["num_columns"] = num_columns
    ref_frame["num_rows"] = num_rows
    ref_frame["num_row_groups"] = num_row_groups
    ref_frame["file_size"] = file_size
    ref_frame["gbs"] = ref_frame["file_size"] / bytes_per_gib

    ref_frame.to_csv(sizes_dir / f"{dataset_type}.csv", index=False)
    return ref_frame

In [4]:
# Dataset types to process, in the order they are scanned and reported.
# Each name is used as the file stem when building per-type file names.
dataset_types = [
    "diaObjectTable_tract",
    "diaSourceTable_tract",
    "forcedSourceOnDiaObjectTable",
    "objectTable",
    "sourceTable",
    "forcedSourceTable",
]

In [5]:
# Build the per-file size table for every dataset type. Each call opens every
# parquet file listed for that type, so this cell can take a while.
for set_type in dataset_types:
    get_all_sizes(set_type)

## Estimate the pixel thresholds.

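Following an approach similar to [this old notebook](https://hats-import.readthedocs.io/en/latest/notebooks/estimate_pixel_threshold.html), but using the full dataset size and row count, we can get a pretty good idea of what good pixel thresholds are for each dataset. For each dataset we scale the total row count by the ratio of a target file size (300 MiB for the low end, 1 GiB for the high end) to the total size on disk, which gives the number of rows expected to fill a file of that size.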

In [6]:
def print_import_stats(
    dataset_type,
    ideal_file_small=300 * 1024 * 1024,
    ideal_file_large=1024 * 1024 * 1024,
):
    """Print an estimated pixel-threshold range and the total size for a dataset.

    Reads ``raw_dir/sizes/<dataset_type>.csv`` and sums its ``file_size`` and
    ``num_rows`` columns. Each threshold is the total row count scaled by the
    ratio of a target file size to the total size on disk, i.e. the number of
    rows expected to fill a file of the target size at the dataset's average
    bytes per row.

    Parameters
    ----------
    dataset_type : str
        Name of the sizes table to read.
    ideal_file_small : int, optional
        Target file size in bytes for the lower threshold (default 300 MiB).
    ideal_file_large : int, optional
        Target file size in bytes for the upper threshold (default 1 GiB).

    Returns
    -------
    None
        Results are printed: the dataset name, the threshold range (or a note
        when it cannot be estimated), and the summed ``gbs`` column.
    """
    all_sizes = pd.read_csv(raw_dir / "sizes" / f"{dataset_type}.csv")
    sample_file_size = all_sizes["file_size"].sum()
    num_rows = all_sizes["num_rows"].sum()

    print(dataset_type)
    if sample_file_size > 0 and num_rows > 0:
        threshold_small = ideal_file_small / sample_file_size * num_rows
        threshold_large = ideal_file_large / sample_file_size * num_rows
        print(f"  threshold between {int(threshold_small):_} and {int(threshold_large):_}")
    else:
        # With zero bytes the ratio is inf/NaN and int() would raise; with zero
        # rows the threshold would be a meaningless 0. Report it and carry on
        # so one empty dataset does not abort a loop over several.
        print("  threshold: cannot be estimated (no rows or zero total size)")
    print(f'  total size_on_disk: {all_sizes["gbs"].sum():.2f} G')

In [7]:
# Print the estimated threshold range and total size for each dataset type,
# using the size tables written to the "sizes" directory.
for set_type in dataset_types:
    print_import_stats(set_type)