### Ingests a single set of ADC, HDR and Class files 

Takes the required info from each file and creates single merged file 
Logic
1) Parse ADCFileFormat from .hdr to get column names
2) Load .adc with those headers
3) Add RoiNumber to ADC as 1 - N
4) Remove rows with RoiX=RoiY=RoiHeight=RoiWidth=0 -- this removes zero roi triggers so we can merge with class file on roi number
5) Add InhibitTimeDiff = diff(InhibitTime).fillna(0) -- useful for understanding sample density
6) Add VolumeAnalyzed = (RunTime - InhibitTime) / 240  -- needed for concentration estimates and also useful for understanding sample density
7) Load class CSV and extract RoiNumber from pid ('..._00023' -> 23) -- used to merge with adc file
8) Merge class_df with ADC-derived columns on RoiNumber
9) Return merged_df (and adc_df, class_df)

This gives you dataset of all rois with class scores and associated ADC metadata

In [28]:
import pandas as pd
from pathlib import Path

def ingest_ifcb(adc_path: str,
                hdr_path: str,
                class_csv_path: str,
                drop_zero_roi: bool = True):
    """
    Steps:
      1) Parse ADCFileFormat from .hdr to get column names
      2) Load .adc with those headers
      3) Add RoiNumber to ADC as 1..N
      4) Remove rows with RoiX=RoiY=RoiHeight=RoiWidth=0 (optionally)
      5) Add InhibitTimeDiff = diff(InhibitTime).fillna(0)
      6) Add VolumeAnalyzed = (RunTime - InhibitTime) / 240
      7) Load class CSV and extract RoiNumber from pid ('..._00023' -> 23)
      8) Merge class_df with ADC-derived columns on RoiNumber
      9) Return merged_df (and adc_df, class_df)
    """
    adc_path = Path(adc_path)
    hdr_path = Path(hdr_path)
    class_path = Path(class_csv_path)

    # 1) Parse ADCFileFormat from .hdr
    headers = None
    with open(hdr_path, 'r') as f:
        for line in f:
            if line.startswith("ADCFileFormat:"):
                headers = [h.strip() for h in line.split(":", 1)[1].split(",")]
                break
    if not headers:
        raise ValueError("ADCFileFormat not found in header file.")

    # 2) Load .adc with headers
    adc_df = pd.read_csv(adc_path, header=None)
    adc_df.columns = headers[:adc_df.shape[1]]

    # 3) Add RoiNumber to ADC as 1..N (do NOT change after this)
    adc_df["RoiNumber"] = range(1, len(adc_df) + 1)

    # 4) Remove zero-ROI rows (preserving original RoiNumber values)
    if drop_zero_roi:
        roi_cols = ['RoiX', 'RoiY', 'RoiHeight', 'RoiWidth']
        if all(col in adc_df.columns for col in roi_cols):
            keep_mask = ~((adc_df['RoiX'] == 0) &
                          (adc_df['RoiY'] == 0) &
                          (adc_df['RoiHeight'] == 0) &
                          (adc_df['RoiWidth'] == 0))
            adc_df = adc_df.loc[keep_mask]  # keep original RoiNumber; don't reset index

    # 5) InhibitTimeDiff
    if 'InhibitTime' in adc_df.columns:
        # ensure numeric in case strings slipped in
        adc_df['InhibitTime'] = pd.to_numeric(adc_df['InhibitTime'], errors='coerce')
        adc_df['InhibitTimeDiff'] = adc_df['InhibitTime'].diff().fillna(0)
    else:
        adc_df['InhibitTimeDiff'] = pd.NA

    # 6) VolumeAnalyzed
    if {'RunTime', 'InhibitTime'}.issubset(adc_df.columns):
        adc_df['RunTime'] = pd.to_numeric(adc_df['RunTime'], errors='coerce')
        adc_df['VolumeAnalyzed'] = (adc_df['RunTime'] - adc_df['InhibitTime']) / 240
    else:
        adc_df['VolumeAnalyzed'] = pd.NA

    # 7) Load class CSV + extract RoiNumber from pid
    class_df = pd.read_csv(class_path)
    if 'pid' not in class_df.columns:
        raise ValueError("Expected 'pid' column in class CSV to extract RoiNumber.")
    class_df['RoiNumber'] = class_df['pid'].str.split('_').str[-1].astype(int)

    # 8) Merge on RoiNumber
    cols_to_keep = ['RoiNumber', 'RunTime', 'InhibitTime', 'InhibitTimeDiff', 'VolumeAnalyzed'] ## can add or remove col here depending on what you want
    for extra in ['RoiHeight', 'RoiWidth', 'RoiX', 'RoiY']:
        if extra in adc_df.columns:
            cols_to_keep.append(extra)

    merged_df = class_df.merge(adc_df[cols_to_keep], on='RoiNumber', how='left')

    # 9) Return
    return merged_df, adc_df, class_df

## Loops over a directory and outputs a merged directory

wrapper for the ingest_ifcb function that merges based on the initial string of the file names and writes out merged datasets to a new directory

In [25]:
import pandas as pd
from pathlib import Path
from typing import Dict

def ingest_ifcb_directory(directory: str,
                          drop_zero_roi: bool = True,
                          save_path: str | None = None) -> Dict[str, pd.DataFrame]:
    """
    Loop over a directory, find sets of .adc, .hdr, and class CSV files
    that share the same IFCB run prefix, and return merged dataframes.

    Args:
        directory: Folder containing IFCB .adc, .hdr, and class CSV files.
        drop_zero_roi: Passed to ingest_ifcb().
        save_path: Optional folder to save merged CSVs.
                   If None, files are saved in the source directory.

    Returns:
        dict: { prefix : merged_dataframe }
    """

    directory = Path(directory)
    save_dir = Path(save_path) if save_path else directory
    save_dir.mkdir(parents=True, exist_ok=True)

    adc_files = list(directory.glob("*.adc"))
    hdr_files = list(directory.glob("*.hdr"))
    class_files = list(directory.glob("*class*.csv"))

    # Build file lookup maps
    adc_map   = {f.stem: f for f in adc_files}
    hdr_map   = {f.stem: f for f in hdr_files}
    class_map = {f.stem.replace("_class_vNone", "").replace("_class", ""): f
                 for f in class_files}

    prefixes = set(adc_map.keys()) | set(hdr_map.keys()) | set(class_map.keys())

    merged_results = {}

    for prefix in sorted(prefixes):
        adc_path   = adc_map.get(prefix)
        hdr_path   = hdr_map.get(prefix)
        class_path = class_map.get(prefix)

        # Require complete sets
        if not (adc_path and hdr_path and class_path):
            print(f"Skipping {prefix}: incomplete set of files.")
            continue

        print(f"Processing {prefix}...")

        merged_df, adc_df, class_df = ingest_ifcb(
            adc_path=adc_path,
            hdr_path=hdr_path,
            class_csv_path=class_path,
            drop_zero_roi=drop_zero_roi
        )

        merged_results[prefix] = merged_df

        # Save output to designated location
        outfile = save_dir / f"{prefix}_merged.csv"
        merged_df.to_csv(outfile, index=False)
        print(f"Saved merged file â†’ {outfile}")

    return merged_results


## Testing that it works 
Build a directory of files you want to merge
Create a directory to save the new merged files into 
run 
test that the files look the way they should

In [29]:
test_dir = "../../IFCBData/spawn/zygotes/"
merged_dir = "../../IFCBData/spawn/zygotes/merged"

In [38]:
merged_dict = ingest_ifcb_directory(directory= test_dir,
                                   save_path= merged_dir)

Processing D20240501T200201_IFCB145 (class=no)...
Saved: ../../IFCBData/spawn/zygotes/merged/D20240501T200201_IFCB145_adc_only.csv


In [39]:
test_df = pd.read_csv("../../IFCBData/spawn/zygotes/merged/D20240501T200201_IFCB145_adc_only.csv")

In [40]:
print(test_df.head)

<bound method NDFrame.head of       RoiNumber      ADCtime      RunTime  InhibitTime  InhibitTimeDiff  \
0             2     4.931790     4.953893     0.083047         0.000000   
1             3     5.104920     5.127059     0.166803         0.083757   
2             4     5.774158     5.796293     0.249392         0.082589   
3             5     5.865553     5.888199     0.333301         0.083908   
4             6     5.952413     5.972958     0.414779         0.081478   
...         ...          ...          ...          ...              ...   
6378       6786  1198.809496  1198.831528   517.552604         0.083160   
6379       6787  1199.135671  1199.157500   517.634792         0.082187   
6380       6789  1199.493438  1199.514583   517.801215         0.166424   
6381       6790  1200.150306  1200.174167   517.885764         0.084549   
6382       6791  1200.525941  1200.548333   517.968507         0.082743   

      VolumeAnalyzed  RoiHeight  RoiWidth  RoiX  RoiY  
0           0

## USE THIS VERSION: Class file optional Versions -- same logic just now can handle missing class files 

In [41]:
import pandas as pd
from pathlib import Path
from typing import Optional, Tuple

def ingest_ifcb(adc_path: str,
                hdr_path: str,
                class_csv_path: Optional[str] = None,
                drop_zero_roi: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Ingest IFCB ADC+HDR, optionally merge with a class CSV (if provided and exists).

    Returns:
        merged_df: 
            - if class provided & exists: class_df merged with adc-derived columns on RoiNumber
            - else: adc-derived dataframe containing RoiNumber + derived columns (and ROI geometry if present)
        adc_df: ADC dataframe with parsed headers + derived columns
        class_df: class dataframe if loaded, else None
    """
    adc_path = Path(adc_path)
    hdr_path = Path(hdr_path)
    class_path = Path(class_csv_path) if class_csv_path else None

    # 1) Parse ADCFileFormat from .hdr
    headers = None
    with open(hdr_path, 'r') as f:
        for line in f:
            if line.startswith("ADCFileFormat:"):
                headers = [h.strip() for h in line.split(":", 1)[1].split(",")]
                break
    if not headers:
        raise ValueError(f"ADCFileFormat not found in header file: {hdr_path}")

    # 2) Load .adc with headers
    adc_df = pd.read_csv(adc_path, header=None)
    adc_df.columns = headers[:adc_df.shape[1]]

    # 3) Add RoiNumber to ADC as 1..N (do NOT change after this)
    adc_df["RoiNumber"] = range(1, len(adc_df) + 1)

    # 4) Remove zero-ROI rows (preserving original RoiNumber values)
    if drop_zero_roi:
        roi_cols = ['RoiX', 'RoiY', 'RoiHeight', 'RoiWidth']
        if all(col in adc_df.columns for col in roi_cols):
            keep_mask = ~((adc_df['RoiX'] == 0) &
                          (adc_df['RoiY'] == 0) &
                          (adc_df['RoiHeight'] == 0) &
                          (adc_df['RoiWidth'] == 0))
            adc_df = adc_df.loc[keep_mask]  # keep original RoiNumber; don't reset index

    # 5) InhibitTimeDiff
    if 'InhibitTime' in adc_df.columns:
        adc_df['InhibitTime'] = pd.to_numeric(adc_df['InhibitTime'], errors='coerce')
        adc_df['InhibitTimeDiff'] = adc_df['InhibitTime'].diff().fillna(0)
    else:
        adc_df['InhibitTimeDiff'] = pd.NA

    # 6) VolumeAnalyzed
    if {'RunTime', 'InhibitTime'}.issubset(adc_df.columns):
        adc_df['RunTime'] = pd.to_numeric(adc_df['RunTime'], errors='coerce')
        adc_df['VolumeAnalyzed'] = (adc_df['RunTime'] - adc_df['InhibitTime']) / 240
    else:
        adc_df['VolumeAnalyzed'] = pd.NA

    # Columns to expose from ADC side
    cols_to_keep = ['RoiNumber','ADCtime', 'RunTime', 'InhibitTime', 'InhibitTimeDiff', 'VolumeAnalyzed'] ## can add more here if you want other things not sure why chat started cutting stuff out
    for extra in ['RoiHeight', 'RoiWidth', 'RoiX', 'RoiY']:
        if extra in adc_df.columns:
            cols_to_keep.append(extra)

    adc_out = adc_df[cols_to_keep].copy()

    # 7-8) Optional: Load class CSV + merge
    class_df = None
    if class_path and class_path.exists():
        class_df = pd.read_csv(class_path)

        if 'pid' not in class_df.columns:
            raise ValueError(f"Expected 'pid' column in class CSV to extract RoiNumber: {class_path}")

        class_df['RoiNumber'] = class_df['pid'].str.split('_').str[-1].astype(int)

        merged_df = class_df.merge(adc_out, on='RoiNumber', how='left')
        return merged_df, adc_df, class_df

    # No class file: return ADC-derived table as the "merged" output
    return adc_out, adc_df, None


In [42]:
from pathlib import Path
from typing import Dict, Optional

def ingest_ifcb_directory(directory: str,
                          drop_zero_roi: bool = True,
                          save_path: Optional[str] = None,
                          class_suffixes: tuple = ("_class_vNone.csv", "_class.csv"),
                          adc_only_suffix: str = "_adc_only.csv") -> Dict[str, pd.DataFrame]:
    """
    Process a folder of IFCB files. For each prefix:
      - requires .adc and .hdr
      - class CSV is optional
    Writes outputs to save_path (or the input directory if None).

    Returns:
      dict {prefix: output_df}
    """
    directory = Path(directory)
    save_dir = Path(save_path) if save_path else directory
    save_dir.mkdir(parents=True, exist_ok=True)

    adc_map = {p.stem: p for p in directory.glob("*.adc")}
    hdr_map = {p.stem: p for p in directory.glob("*.hdr")}

    # Build a class map keyed by prefix, supporting multiple suffix patterns
    class_map: Dict[str, Path] = {}
    for p in directory.glob("*class*.csv"):
        name = p.name
        prefix = None
        for suf in class_suffixes:
            if name.endswith(suf):
                prefix = name[:-len(suf)]
                break
        if prefix:
            class_map[prefix] = p

    prefixes = sorted(set(adc_map) | set(hdr_map) | set(class_map))

    results: Dict[str, pd.DataFrame] = {}

    for prefix in prefixes:
        adc_path = adc_map.get(prefix)
        hdr_path = hdr_map.get(prefix)

        if not (adc_path and hdr_path):
            print(f"Skipping {prefix}: missing .adc or .hdr")
            continue

        class_path = class_map.get(prefix)  # may be None

        print(f"Processing {prefix} (class={'yes' if class_path else 'no'})...")

        out_df, adc_df, class_df = ingest_ifcb(
            adc_path=str(adc_path),
            hdr_path=str(hdr_path),
            class_csv_path=str(class_path) if class_path else None,
            drop_zero_roi=drop_zero_roi
        )

        results[prefix] = out_df

        # Save with different suffix depending on whether class exists
        if class_df is not None:
            outfile = save_dir / f"{prefix}_merged.csv"
        else:
            outfile = save_dir / f"{prefix}{adc_only_suffix}"

        out_df.to_csv(outfile, index=False)
        print(f"Saved: {outfile}")

    return results


## Testing that it works 
Build a directory of files you want to merge
Create a directory to save the new merged files into 
run 
test that the files look the way they should

In [43]:
test_dir = "../../IFCBData/spawn/zygotes/"
merged_dir = "../../IFCBData/spawn/zygotes/merged"

In [44]:
merged_dict = ingest_ifcb_directory(directory= test_dir,
                                   save_path= merged_dir)

Processing D20240501T182329_IFCB145 (class=no)...
Saved: ../../IFCBData/spawn/zygotes/merged/D20240501T182329_IFCB145_adc_only.csv
Processing D20240501T185545_IFCB145 (class=no)...
Saved: ../../IFCBData/spawn/zygotes/merged/D20240501T185545_IFCB145_adc_only.csv
Processing D20240501T200201_IFCB145 (class=no)...
Saved: ../../IFCBData/spawn/zygotes/merged/D20240501T200201_IFCB145_adc_only.csv


In [39]:
test_df = pd.read_csv("../../IFCBData/spawn/zygotes/merged/D20240501T200201_IFCB145_adc_only.csv")

In [40]:
print(test_df.head)

<bound method NDFrame.head of       RoiNumber      ADCtime      RunTime  InhibitTime  InhibitTimeDiff  \
0             2     4.931790     4.953893     0.083047         0.000000   
1             3     5.104920     5.127059     0.166803         0.083757   
2             4     5.774158     5.796293     0.249392         0.082589   
3             5     5.865553     5.888199     0.333301         0.083908   
4             6     5.952413     5.972958     0.414779         0.081478   
...         ...          ...          ...          ...              ...   
6378       6786  1198.809496  1198.831528   517.552604         0.083160   
6379       6787  1199.135671  1199.157500   517.634792         0.082187   
6380       6789  1199.493438  1199.514583   517.801215         0.166424   
6381       6790  1200.150306  1200.174167   517.885764         0.084549   
6382       6791  1200.525941  1200.548333   517.968507         0.082743   

      VolumeAnalyzed  RoiHeight  RoiWidth  RoiX  RoiY  
0           0