In [1]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import numpy as np
from tqdm import tqdm
import os
%matplotlib inline

In [2]:
n_mesh = 20000
PATH1 = f'scr_mk27/eclipse_grid_{n_mesh}'
PATH2 = f'scr_mk27/eclipse_grid_{n_mesh}_vbig'
PATH3 = f'scr_mk27/eclipse_grid_{n_mesh}_big'

In [3]:
import os
files_in_path = [os.path.join(PATH1, fname) for fname in os.listdir(PATH1) if fname.endswith('.pkl') or fname.endswith('.pickle')] \
    + \
    [os.path.join(PATH2, fname) for fname in os.listdir(PATH2) if fname.endswith('.pkl') or fname.endswith('.pickle')] + \
    [os.path.join(PATH3, fname) for fname in os.listdir(PATH3) if fname.endswith('.pkl') or fname.endswith('.pickle')]

print(len(files_in_path))


3292


In [8]:
import pickle
import numpy as np
import mmap

ln10 = np.log(10)
_SCALAR_TYPES = (int, float, str, np.number)

def fast_load(path):
    with open(path, "rb") as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        return pickle.loads(mm)

def process_pickle_file(pkfile):
    try:
        data = fast_load(pkfile)

        flux = data.get("fluxes_phoebe")
        bol  = data.get("bol_lum")

        # Fast in-place-ish transform
        logs = np.log(flux)
        logs /= ln10
        fluxes = -2.5 * logs

        f0 = fluxes[0]
        b0 = bol[0]
        metric = np.abs((fluxes - f0) - (bol - b0))

        scalars = {
            k: v for k, v in data.items()
            if isinstance(v, _SCALAR_TYPES)
        }

        return {
            "fluxes": fluxes,
            "metric": metric,
            **scalars,
        }, None

    except Exception:
        return None, None

results = [process_pickle_file(f) for f in tqdm(files_in_path)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3292/3292 [31:29<00:00,  1.74it/s]


In [10]:
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Filter out None results
flux_metric_records = [r for r, err in results if r is not None]
if flux_metric_records:
    # Find largest mesh size (if applicable, else arbitrary value)
    n_mesh = max((r.get('n_mesh', 0) for r in flux_metric_records), default=0)
    # Convert to pyarrow Table
    def convert_to_arrow_table(recs):
        # Find the longest arrays for fluxes/metric for padding
        max_flux_len = max((len(r['fluxes']) for r in recs), default=0)
        max_metric_len = max((len(r['metric']) for r in recs), default=0)
        fluxarr = []
        metricarr = []
        # Collect all parameter keys except 'fluxes', 'metric'
        param_keys = set()
        for r in recs:
            param_keys.update(
                k for k in r.keys() if k not in ('fluxes', 'metric') and isinstance(r[k], (int, float, str, np.integer, np.floating))
            )

        param_dict = {k: [] for k in param_keys}
        for r in recs:
            # Pad arrays to common length
            fluxvec = np.asarray(r['fluxes'])
            metvec = np.asarray(r['metric'])
            if fluxvec.shape[0] < max_flux_len:
                fluxvec = np.pad(fluxvec, (0, max_flux_len - fluxvec.shape[0]), constant_values=np.nan)
            if metvec.shape[0] < max_metric_len:
                metvec = np.pad(metvec, (0, max_metric_len - metvec.shape[0]), constant_values=np.nan)
            fluxarr.append(fluxvec)
            metricarr.append(metvec)
            # For each parameter, pull value if present, else np.nan or ''
            for k in param_keys:
                v = r.get(k, np.nan if k not in ('file',) else "")
                param_dict[k].append(v)

        arrays = {
            'fluxes': fluxarr,
            'metric': metricarr
        }
        # Add all parameters except filename (do not include basename or file path)
        arrays.update(param_dict)
        tbl = pa.table(arrays)
        return tbl

    tbl = convert_to_arrow_table(flux_metric_records)
    parquet_path = os.path.join('scr_mk27', f"fluxes_and_metric_20000.parquet")
    pq.write_table(tbl, parquet_path)
    print("Fluxes and metric saved to:", parquet_path)
else:
    print("No valid records to save to parquet.")


Fluxes and metric saved to: scr_mk27/fluxes_and_metric_20000.parquet


In [None]:
# Prepare and save as parquet: fluxes and metric may be arrays, so use pyarrow Table
# Convert lists of dicts to pyarrow Table
def convert_to_arrow_table(recs):
    # Find the longest arrays for fluxes/metric for padding
    max_flux_len = max((len(r['fluxes']) for r in recs), default=0)
    max_metric_len = max((len(r['metric']) for r in recs), default=0)
    fluxarr = []
    metricarr = []
    # Collect all parameter keys except 'fluxes', 'metric'
    param_keys = set()
    for r in recs:
        param_keys.update(
            k for k in r.keys() if k not in ('fluxes', 'metric') and isinstance(r[k], (int, float, str, np.integer, np.floating))
        )

    param_dict = {k: [] for k in param_keys}
    for r in recs:
        # Pad arrays to common length
        fluxvec = np.asarray(r['fluxes'])
        metvec = np.asarray(r['metric'])
        if fluxvec.shape[0] < max_flux_len:
            fluxvec = np.pad(fluxvec, (0, max_flux_len - fluxvec.shape[0]), constant_values=np.nan)
        if metvec.shape[0] < max_metric_len:
            metvec = np.pad(metvec, (0, max_metric_len - metvec.shape[0]), constant_values=np.nan)
        fluxarr.append(fluxvec)
        metricarr.append(metvec)
        # For each parameter, pull value if present, else np.nan or ''
        for k in param_keys:
            v = r.get(k, np.nan if k not in ('file',) else "")
            param_dict[k].append(v)

    arrays = {
        'fluxes': fluxarr,
        'metric': metricarr
    }
    # Add all parameters except filename (do not include basename or file path)
    arrays.update(param_dict)
    tbl = pa.table(arrays)
    return tbl

tbl = convert_to_arrow_table(flux_metric_records)
parquet_path = os.path.join('scr_mk27', f"fluxes_and_metric_{n_mesh}.parquet")
pq.write_table(tbl, parquet_path)


print("Fluxes and metric saved to:", parquet_path)


Fluxes and metric saved to: scr_mk27/fluxes_and_metric_5000.parquet
