# Creating ways to generate and filter PID lists from the dashboard API

## Generate a list of all PIDs between two times

In [23]:
import requests
import pandas as pd
def get_ifcb_bins_datetime_filtered(base_url: str,
                                    dataset: str,
                                    start: str,
                                    end: str,
                                    *,
                                    timeout: float = 300.0) -> pd.DataFrame:
    """
    Get all bins for a dataset from /api/list_bins, then filter by sample_time
    between 'start' and 'end' (inclusive).

    The window bounds and the 'sample_time' column are all converted to
    timezone-aware UTC before comparing, so the window may be given with or
    without a trailing "Z"/offset.

    Parameters
    ----------
    base_url : e.g. "https://habon-ifcb.whoi.edu" (a trailing "/" is ignored)
    dataset  : e.g. "nauset"
    start, end : ISO-like datetime strings, e.g. "2025-06-25T09:00:00Z".
        Strings without an offset are interpreted as UTC.
    timeout : seconds to wait on the server before giving up (keyword-only).

    Returns
    -------
    DataFrame with only bins in the requested time window; 'sample_time' is
    a timezone-aware (UTC) datetime column. If the API returns no bins at
    all, an empty DataFrame with 'pid' and 'sample_time' columns is returned.

    Raises
    ------
    ValueError : 'start' or 'end' cannot be parsed as a datetime.
    requests.HTTPError : the API answered with an error status.
    KeyError : the response has no "data" entry, or the bins carry no
        'sample_time' field.
    """
    # Parse the window first so a malformed date fails before any network I/O.
    # utc=True keeps both bounds tz-aware; mixing tz-naive and tz-aware values
    # in the comparison below would raise a TypeError.
    start_dt = pd.to_datetime(start, utc=True)
    end_dt = pd.to_datetime(end, utc=True)

    # Let requests build the query string so the dataset name is URL-encoded.
    url = f"{base_url.rstrip('/')}/api/list_bins"
    r = requests.get(url, params={"dataset": dataset}, timeout=timeout)
    r.raise_for_status()
    data = r.json()["data"]

    df = pd.DataFrame(data)
    if df.empty:
        # No bins at all: hand back an empty frame that still has the columns
        # callers index into, instead of failing on a missing 'sample_time'.
        return pd.DataFrame(columns=["pid", "sample_time"])

    # Adjust this column name if needed, but on HABON it should be 'sample_time'
    df["sample_time"] = pd.to_datetime(df["sample_time"], utc=True)

    mask = (df["sample_time"] >= start_dt) & (df["sample_time"] <= end_dt)
    return df.loc[mask].copy()


In [None]:
## Example usage: list every bin PID in a dataset between two timestamps ##

### URL options for base_url
## PERCY: http://percy.whoi.edu:8000
## HABON: https://habon-ifcb.whoi.edu

base_url = "https://habon-ifcb.whoi.edu"
dataset = "hablab_beadsExp"

# Time window (both ends inclusive), given as ISO-like datetime strings.
start_dt = "2026-02-03T00:00:00Z"
end_dt   = "2026-02-26T12:00:00Z"


# Fetch the bins inside the window and pull the "pid" column out as a plain list.
bins_df = get_ifcb_bins_datetime_filtered(base_url, dataset, start_dt, end_dt)
bins = bins_df["pid"].tolist()

# Report how many bins matched and preview the first five PIDs.
print(f"Found {len(bins)} bins between {start_dt} and {end_dt}")
print(bins[:5])


# Pick PIDs near specific times across a date range. Let's pick just a few per day over longer time ranges.

In [18]:
import requests
import pandas as pd
from typing import List, Tuple, Optional

def _parse_time_of_day(t: str) -> pd.Timedelta:
    """Convert an 'HH:MM' or 'HH:MM:SS' string into an offset from midnight."""
    message = f"Time '{t}' must be 'HH:MM' or 'HH:MM:SS'"
    parts = t.split(":")
    if len(parts) == 2:
        parts.append("0")  # seconds are optional
    if len(parts) != 3:
        raise ValueError(message)
    try:
        hh, mm, ss = (int(p) for p in parts)
    except ValueError:
        raise ValueError(message) from None
    # Out-of-range fields (e.g. "25:00") would silently move the target onto
    # a different day, so reject them instead.
    if not (0 <= hh <= 23 and 0 <= mm <= 59 and 0 <= ss <= 59):
        raise ValueError(message)
    return pd.Timedelta(hours=hh, minutes=mm, seconds=ss)


def get_ifcb_pids_by_daily_times(
    base_url: str,
    dataset: str,
    start: str,
    end: str,
    times: List[str],
    *,
    same_day_only: bool = True,
    timeout: float = 300.0,
) -> Tuple[List[str], pd.DataFrame]:
    """
    For each day in [start, end], and for each time in `times` (HH:MM or HH:MM:SS),
    find the first bin with sample_time >= that target time.

    All timestamps are handled as timezone-aware UTC. Target times that fall
    outside [start, end] are ignored, and only bins inside [start, end] are
    candidates for a match.

    If same_day_only=True, matches that roll into the next day are dropped.

    Parameters
    ----------
    base_url : e.g. "https://habon-ifcb.whoi.edu" (a trailing "/" is ignored)
    dataset  : dataset name passed to /api/list_bins
    start, end : ISO-like datetime strings bounding the window (inclusive);
        strings without an offset are interpreted as UTC
    times : times of day to sample, each "HH:MM" or "HH:MM:SS"
    same_day_only : keep only matches on the same UTC date as their target
    timeout : seconds to wait on the server before giving up

    Returns
    -------
    pids: list of selected pid strings (unique, in chronological order)
    picked_df: dataframe with target_time, sample_time, pid, and delta
        (delta = sample_time - target_time)

    Both are empty when nothing can be selected (no target times inside the
    window, no bins from the API, or no bins inside the window). When the
    target schedule itself is empty, the API is not contacted at all.

    Raises
    ------
    ValueError : a time string is malformed or out of range, 'start'/'end'
        cannot be parsed, or the API bins lack 'sample_time'/'pid' fields.
    requests.HTTPError : the API answered with an error status.
    """
    columns = ["target_time", "sample_time", "pid", "delta"]

    # Validate every argument before touching the network so bad input fails fast.
    start_dt = pd.to_datetime(start, utc=True)
    end_dt = pd.to_datetime(end, utc=True)
    offsets = [_parse_time_of_day(t) for t in times]

    # Build target schedule: every day x each desired time
    days = pd.date_range(start=start_dt.normalize(), end=end_dt.normalize(), freq="D", tz="UTC")
    targets = pd.DataFrame({"target_time": [d + td for d in days for td in offsets]})
    if targets.empty:
        # No times requested, or an inverted window: nothing to look up.
        return [], pd.DataFrame(columns=columns)

    # Keep only targets that fall within the overall start/end window
    targets = targets.sort_values("target_time")
    targets = targets[(targets["target_time"] >= start_dt) & (targets["target_time"] <= end_dt)].copy()
    if targets.empty:
        return [], pd.DataFrame(columns=columns)

    # Let requests build the query string so the dataset name is URL-encoded.
    url = f"{base_url.rstrip('/')}/api/list_bins"
    r = requests.get(url, params={"dataset": dataset}, timeout=timeout)
    r.raise_for_status()
    data = r.json()["data"]

    df = pd.DataFrame(data)
    if df.empty:
        # A dataset with no bins is a valid "nothing found", not a schema error.
        return [], pd.DataFrame(columns=columns)

    if "sample_time" not in df.columns or "pid" not in df.columns:
        raise ValueError("Expected 'sample_time' and 'pid' columns from /api/list_bins response.")

    # Parse times; rows whose timestamp cannot be parsed are discarded.
    df["sample_time"] = pd.to_datetime(df["sample_time"], utc=True, errors="coerce")
    df = df.dropna(subset=["sample_time"])

    # Filter bins to time window (inclusive)
    df = df[(df["sample_time"] >= start_dt) & (df["sample_time"] <= end_dt)].copy()
    if df.empty:
        return [], pd.DataFrame(columns=columns)

    # Forward "asof" join: for each target_time, find first sample_time >= target_time.
    # merge_asof needs both sides sorted on their join key.
    bins_for_asof = (
        df[["sample_time", "pid"]]
        .rename(columns={"sample_time": "sample_time_match"})
        .sort_values("sample_time_match")
    )

    picked = pd.merge_asof(
        targets,
        bins_for_asof,
        left_on="target_time",
        right_on="sample_time_match",
        direction="forward",
        allow_exact_matches=True,
    )

    picked = picked.rename(columns={"sample_time_match": "sample_time"})
    # Targets later than the last bin have no forward match; drop them.
    picked = picked.dropna(subset=["sample_time", "pid"]).copy()

    # Optionally enforce same-day matches
    if same_day_only:
        picked = picked[picked["sample_time"].dt.normalize() == picked["target_time"].dt.normalize()].copy()

    picked["delta"] = picked["sample_time"] - picked["target_time"]

    # De-duplicate if two targets land on the same PID (happens if bins are sparse);
    # the earliest target keeps the bin.
    picked = picked.sort_values("target_time")
    picked_unique = picked.drop_duplicates(subset=["pid"], keep="first").copy()

    pids = picked_unique["pid"].tolist()
    return pids, picked_unique[columns]

In [21]:
### Example usage: pick bins near four fixed times of day from the "mvco" dataset ###
pids, picked_df = get_ifcb_pids_by_daily_times(
    base_url="https://habon-ifcb.whoi.edu",
    dataset="mvco",
    start="2025-09-07T00:00:00Z",
    end="2025-10-26T23:59:59Z",
    times=["02:00", "08:00", "14:00", "20:00"],   # pick ~4 per day
    same_day_only=True
)

# Preview the first ten selected PIDs and the first ten rows of the match table.
print(pids[:10])
print(picked_df.head(10))

['D20250907T021646_IFCB127', 'D20250907T081448_IFCB127', 'D20250907T141247_IFCB127', 'D20250907T201554_IFCB127', 'D20250908T021352_IFCB127', 'D20250908T081151_IFCB127', 'D20250908T140951_IFCB127', 'D20250908T201258_IFCB127', 'D20250909T021058_IFCB127', 'D20250909T080857_IFCB127']
                target_time               sample_time  \
0 2025-09-07 02:00:00+00:00 2025-09-07 02:16:46+00:00   
1 2025-09-07 08:00:00+00:00 2025-09-07 08:14:48+00:00   
2 2025-09-07 14:00:00+00:00 2025-09-07 14:12:47+00:00   
3 2025-09-07 20:00:00+00:00 2025-09-07 20:15:54+00:00   
4 2025-09-08 02:00:00+00:00 2025-09-08 02:13:52+00:00   
5 2025-09-08 08:00:00+00:00 2025-09-08 08:11:51+00:00   
6 2025-09-08 14:00:00+00:00 2025-09-08 14:09:51+00:00   
7 2025-09-08 20:00:00+00:00 2025-09-08 20:12:58+00:00   
8 2025-09-09 02:00:00+00:00 2025-09-09 02:10:58+00:00   
9 2025-09-09 08:00:00+00:00 2025-09-09 08:08:57+00:00   

                        pid           delta  
0  D20250907T021646_IFCB127 0 days 00:16:46  

In [24]:
## Save the PID list as a JSON file
import json
# Writes `pids` (the list produced by the daily-times example) as a JSON array.
# The path is relative, so it resolves against the current working directory.
with open("../../IFCBData/PIDLists/mvcoTest.json", "w") as f:
    json.dump(pids, f)