### Ingests a single set of ADC, HDR and Class files 

Takes the required info from each file and creates single merged file 
Logic
1) Parse ADCFileFormat from .hdr to get column names
2) Load .adc with those headers
3) Add RoiNumber to ADC as 1 - N
4) Remove rows with RoiX=RoiY=RoiHeight=RoiWidth=0 -- this removes zero roi triggers so we can merge with class file on roi number
5) Add InhibitTimeDiff = diff(InhibitTime).fillna(0) -- useful for understanding sample density
6) Add VolumeAnalyzed = (RunTime - InhibitTime) / 240  -- needed for concentration estimates and also useful for understanding sample density
7) Load class CSV and extract RoiNumber from pid ('..._00023' -> 23) -- used to merge with adc file
8) Merge class_df with ADC-derived columns on RoiNumber
9) Return merged_df (and adc_df, class_df)

This gives you dataset of all rois with class scores and associated ADC metadata

In [7]:
import pandas as pd
from pathlib import Path

def ingest_ifcb(adc_path: str,
                hdr_path: str,
                class_csv_path: str,
                drop_zero_roi: bool = True):
    """
    Steps:
      1) Parse ADCFileFormat from .hdr to get column names
      2) Load .adc with those headers
      3) Add RoiNumber to ADC as 1..N
      4) Remove rows with RoiX=RoiY=RoiHeight=RoiWidth=0 (optionally)
      5) Add InhibitTimeDiff = diff(InhibitTime).fillna(0)
      6) Add VolumeAnalyzed = (RunTime - InhibitTime) / 240
      7) Load class CSV and extract RoiNumber from pid ('..._00023' -> 23)
      8) Merge class_df with ADC-derived columns on RoiNumber
      9) Return merged_df (and adc_df, class_df)
    """
    adc_path = Path(adc_path)
    hdr_path = Path(hdr_path)
    class_path = Path(class_csv_path)

    # 1) Parse ADCFileFormat from .hdr
    headers = None
    with open(hdr_path, 'r') as f:
        for line in f:
            if line.startswith("ADCFileFormat:"):
                headers = [h.strip() for h in line.split(":", 1)[1].split(",")]
                break
    if not headers:
        raise ValueError("ADCFileFormat not found in header file.")

    # 2) Load .adc with headers
    adc_df = pd.read_csv(adc_path, header=None)
    adc_df.columns = headers[:adc_df.shape[1]]

    # 3) Add RoiNumber to ADC as 1..N (do NOT change after this)
    adc_df["RoiNumber"] = range(1, len(adc_df) + 1)

    # 4) Remove zero-ROI rows (preserving original RoiNumber values)
    if drop_zero_roi:
        roi_cols = ['RoiX', 'RoiY', 'RoiHeight', 'RoiWidth']
        if all(col in adc_df.columns for col in roi_cols):
            keep_mask = ~((adc_df['RoiX'] == 0) &
                          (adc_df['RoiY'] == 0) &
                          (adc_df['RoiHeight'] == 0) &
                          (adc_df['RoiWidth'] == 0))
            adc_df = adc_df.loc[keep_mask]  # keep original RoiNumber; don't reset index

    # 5) InhibitTimeDiff
    if 'InhibitTime' in adc_df.columns:
        # ensure numeric in case strings slipped in
        adc_df['InhibitTime'] = pd.to_numeric(adc_df['InhibitTime'], errors='coerce')
        adc_df['InhibitTimeDiff'] = adc_df['InhibitTime'].diff().fillna(0)
    else:
        adc_df['InhibitTimeDiff'] = pd.NA

    # 6) VolumeAnalyzed
    if {'RunTime', 'InhibitTime'}.issubset(adc_df.columns):
        adc_df['RunTime'] = pd.to_numeric(adc_df['RunTime'], errors='coerce')
        adc_df['VolumeAnalyzed'] = (adc_df['RunTime'] - adc_df['InhibitTime']) / 240
    else:
        adc_df['VolumeAnalyzed'] = pd.NA

    # 7) Load class CSV + extract RoiNumber from pid
    class_df = pd.read_csv(class_path)
    if 'pid' not in class_df.columns:
        raise ValueError("Expected 'pid' column in class CSV to extract RoiNumber.")
    class_df['RoiNumber'] = class_df['pid'].str.split('_').str[-1].astype(int)

    # 8) Merge on RoiNumber
    cols_to_keep = ['RoiNumber', 'RunTime', 'InhibitTime', 'InhibitTimeDiff', 'VolumeAnalyzed']
    for extra in ['RoiHeight', 'RoiWidth', 'RoiX', 'RoiY']:
        if extra in adc_df.columns:
            cols_to_keep.append(extra)

    merged_df = class_df.merge(adc_df[cols_to_keep], on='RoiNumber', how='left')

    # 9) Return
    return merged_df, adc_df, class_df

## Loops over a directory and outputs a merged directory

wrapper for the ingest_ifcb function that merges based on the initial string of the file names and writes out merged datasets to a new directory

In [13]:
import pandas as pd
from pathlib import Path
from typing import Dict

def ingest_ifcb_directory(directory: str,
                          drop_zero_roi: bool = True,
                          save_path: str | None = None) -> Dict[str, pd.DataFrame]:
    """
    Loop over a directory, find sets of .adc, .hdr, and class CSV files
    that share the same IFCB run prefix, and return merged dataframes.

    Args:
        directory: Folder containing IFCB .adc, .hdr, and class CSV files.
        drop_zero_roi: Passed to ingest_ifcb().
        save_path: Optional folder to save merged CSVs.
                   If None, files are saved in the source directory.

    Returns:
        dict: { prefix : merged_dataframe }
    """

    directory = Path(directory)
    save_dir = Path(save_path) if save_path else directory
    save_dir.mkdir(parents=True, exist_ok=True)

    adc_files = list(directory.glob("*.adc"))
    hdr_files = list(directory.glob("*.hdr"))
    class_files = list(directory.glob("*class*.csv"))

    # Build file lookup maps
    adc_map   = {f.stem: f for f in adc_files}
    hdr_map   = {f.stem: f for f in hdr_files}
    class_map = {f.stem.replace("_class_vNone", "").replace("_class", ""): f
                 for f in class_files}

    prefixes = set(adc_map.keys()) | set(hdr_map.keys()) | set(class_map.keys())

    merged_results = {}

    for prefix in sorted(prefixes):
        adc_path   = adc_map.get(prefix)
        hdr_path   = hdr_map.get(prefix)
        class_path = class_map.get(prefix)

        # Require complete sets
        if not (adc_path and hdr_path and class_path):
            print(f"Skipping {prefix}: incomplete set of files.")
            continue

        print(f"Processing {prefix}...")

        merged_df, adc_df, class_df = ingest_ifcb(
            adc_path=adc_path,
            hdr_path=hdr_path,
            class_csv_path=class_path,
            drop_zero_roi=drop_zero_roi
        )

        merged_results[prefix] = merged_df

        # Save output to designated location
        outfile = save_dir / f"{prefix}_merged.csv"
        merged_df.to_csv(outfile, index=False)
        print(f"Saved merged file → {outfile}")

    return merged_results


## Testing that it works 
Build a directory of files you want to merge
Create a directory to save the new merged files into 
run 
test that the files look the way they should

In [14]:
test_dir = "../../IFCBData/CosmicData/"
merged_dir = "../../IFCBData/CosmicData/mergedData/"

In [17]:
merged_dict = ingest_ifcb_directory(directory= test_dir,
                                   save_path= merged_dir)

Processing D20250827T182211_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T182211_IFCB144_merged.csv
Processing D20250827T184541_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T184541_IFCB144_merged.csv
Processing D20250827T190910_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T190910_IFCB144_merged.csv
Processing D20250827T193241_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T193241_IFCB144_merged.csv
Processing D20250827T195610_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T195610_IFCB144_merged.csv
Processing D20250827T201941_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T201941_IFCB144_merged.csv
Processing D20250827T204310_IFCB144...
Saved merged file → ../../IFCBData/CosmicData/mergedData/D20250827T204310_IFCB144_merged.csv
Processing D20250827T210640_IFCB144...
Saved merged file → ../../IFCBData/Co

In [22]:
test_df = pd.read_csv( "../../IFCBData/CosmicData/mergedData/D20250827T182211_IFCB144_merged.csv")

In [23]:
print(test_df.head)

<bound method NDFrame.head of                                  pid    Acantharia  Acanthoica_quattrospina  \
0     D20250827T182211_IFCB144_00002  1.000000e-07             1.675000e-04   
1     D20250827T182211_IFCB144_00003  6.000000e-08             0.000000e+00   
2     D20250827T182211_IFCB144_00004  0.000000e+00             0.000000e+00   
3     D20250827T182211_IFCB144_00005  0.000000e+00             0.000000e+00   
4     D20250827T182211_IFCB144_00006  0.000000e+00             0.000000e+00   
...                              ...           ...                      ...   
5118  D20250827T182211_IFCB144_05208  0.000000e+00             5.400000e-07   
5119  D20250827T182211_IFCB144_05209  0.000000e+00             9.660000e-06   
5120  D20250827T182211_IFCB144_05210  0.000000e+00             2.754000e-03   
5121  D20250827T182211_IFCB144_05211  0.000000e+00             0.000000e+00   
5122  D20250827T182211_IFCB144_05212  1.000000e-07             2.736000e-05   

          Akashiwo  A