In [1]:
from pathlib import Path
import pandas as pd

#----- Start the data transformation by adding "holding_status" -----#

def find_upwards(rel_path: "str | Path", max_up: int = 8) -> Path:
    """
    Locate `rel_path` by probing the current working directory and then its
    ancestors, nearest first.

    Args:
        rel_path: Path to look for, joined onto each directory probed. A plain
            string is accepted as well as a `Path`.
        max_up: Maximum number of parent levels above cwd to probe (cwd itself
            is always the first directory checked).

    Returns:
        The resolved path of the first candidate that exists.

    Raises:
        FileNotFoundError: If no candidate exists. The message lists the
            working directory and every path that was checked.
    """
    # Coerce once so string input behaves the same on the success path and in
    # the diagnostics below (which need `Path.as_posix()`).
    rel_path = Path(rel_path)
    here = Path.cwd()
    # Build the probe list a single time; it is reused for the error message
    # so the diagnostics always match what was actually checked.
    candidates = [parent / rel_path for parent in [here, *here.parents][: max_up + 1]]
    for candidate in candidates:
        if candidate.exists():
            return candidate.resolve()
    # Helpful diagnostics
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str(c) for c in candidates]}"
    )

# Working file lives at <ValueCreation>/Data/working.csv; the ValueCreation
# directory is located by searching upwards from the current working directory.
TARGET_CSV = find_upwards(Path("ValueCreation")) / "Data" / "working.csv"

# Load. The identifier columns are forced to text so pandas does not
# reinterpret them as numbers.
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

before_rows = len(df)

# Fail early with guidance if the prerequisite column is missing.
if "exit_date" not in df.columns:
    raise KeyError("Column 'exit_date' not found in working.csv. Run the earlier ADD_COLUMNS step from 'deal' first.")

# Normalize exit_date placeholders to NA.
# - The text check covers both the classic object dtype and pandas' dedicated
#   string dtypes, so the normalization is not skipped when the column is
#   inferred as a string dtype.
# - Values are compared after trimming whitespace and case-folding, so
#   "", "   ", "NaT", "nat", "NAT", "None", "none" are all treated as missing.
norm = df["exit_date"].copy()
if pd.api.types.is_object_dtype(norm) or pd.api.types.is_string_dtype(norm):
    exit_text = norm.astype("string").str.strip().str.casefold()
    is_placeholder = exit_text.isin(["", "nat", "none"])
    norm = norm.mask(is_placeholder, pd.NA)

# Determine holding_status: any non-null exit_date => exited; else unexited
is_exited = norm.notna()
df["holding_status"] = is_exited.map({True: "exited", False: "unexited"})

# Persist: the working file is overwritten in place with the new column added.
df.to_csv(TARGET_CSV, index=False)
print("Added column: holding_status")

import pandas as pd

# Re-read the file that was just written so the checks run against what is
# actually on disk, not the in-memory frame.
check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})

# Row-level coverage: every row must carry exactly one of the two labels.
exited_count = (check["holding_status"] == "exited").sum()
unexited_count = (check["holding_status"] == "unexited").sum()
total_after = len(check)
# Raised explicitly instead of via `assert` so the validation still runs when
# Python is started with -O (which strips assert statements). The exception
# type and message are the same as before.
if exited_count + unexited_count != total_after:
    raise AssertionError("Status coverage failed: counts don't add up to total rows.")

# Deal-level consistency: each deal_id should have a single status
status_per_deal = check.groupby("deal_id")["holding_status"].nunique(dropna=False)
mixed = status_per_deal[status_per_deal > 1]
if not mixed.empty:
    raise AssertionError(f"{len(mixed)} deal_id(s) have mixed exited/unexited rows.")

print(f"Check passed. exited={exited_count}, unexited={unexited_count}, total_rows={total_after}")


Added column: holding_status
Check passed. exited=2511, unexited=2445, total_rows=4956
