In [128]:
from pathlib import Path
import pandas as pd

#----- Start the data transformation by adding "holding_status" -----#

def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    here = Path.cwd()
    for parent in [here, *here.parents][: max_up + 1]:
        candidate = (parent / rel_path)
        if candidate.exists():
            return candidate.resolve()
    # Helpful diagnostics
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str((p / rel_path)) for p in [here, *here.parents][: max_up + 1]]}"
    )

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

before_rows = len(df)

# Normalize exit_date blanks to NA (handles "", "NaT" strings)
if "exit_date" not in df.columns:
    raise KeyError("Column 'exit_date' not found in working.csv. Run the earlier ADD_COLUMNS step from 'deal' first.")
norm = df["exit_date"].copy()
if norm.dtype == object:
    norm = norm.replace({"": pd.NA, "NaT": pd.NA, "nat": pd.NA, "None": pd.NA})

# Determine holding_status: any non-null exit_date => exited; else unexited
is_exited = norm.notna()
df["holding_status"] = is_exited.map({True: "exited", False: "unexited"})

# Persist
df.to_csv(TARGET_CSV, index=False)
print("Added column: holding_status")

import pandas as pd

check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})

# Row-level coverage
exited_count = (check["holding_status"] == "exited").sum()
unexited_count = (check["holding_status"] == "unexited").sum()
total_after = len(check)
assert exited_count + unexited_count == total_after, "Status coverage failed: counts don't add up to total rows."

# Deal-level consistency: each deal_id should have a single status
status_per_deal = check.groupby("deal_id")["holding_status"].nunique(dropna=False)
mixed = status_per_deal[status_per_deal > 1]
assert mixed.empty, f"{len(mixed)} deal_id(s) have mixed exited/unexited rows."

print(f"Check passed. exited={exited_count}, unexited={unexited_count}, total_rows={total_after}")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Added column: holding_status
Check passed. exited=2511, unexited=2445, total_rows=4956
unique_deals: 1164


In [129]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import date

#----- Filter for unreasonable dates -----#
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

# Parse dates (tolerant). Expect ISO strings like "YYYY-MM-DD" from earlier steps.
ref = pd.to_datetime(df["reference_date"], errors="coerce")
lower = pd.Timestamp(1980, 1, 1).normalize()
q_end = pd.Timestamp.today().to_period("Q-DEC").end_time.normalize()
date_ok = ref.isna() | ((ref >= lower) & (ref <= q_end))

# --- Revenue non-zero (treat non-numeric/NA as "keep")
rev_num = pd.to_numeric(df["revenue"], errors="coerce")
rev_ok = rev_num.fillna(np.inf) != 0

# Keep if reference_date is NA OR within [lower, q_end]; drop otherwise.
keep_mask = date_ok & rev_ok

# Preserve original order
df["_ord"] = np.arange(len(df))
after = df.loc[keep_mask].sort_values("_ord").drop(columns="_ord").reset_index(drop=True)

dropped = before_rows - len(after)
print(f"Dropped {dropped} rows outside [{lower.date()} .. {q_end.date()}] or with revenue == 0.")

# Save
after.to_csv(TARGET_CSV, index=False)

import pandas as pd

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
ref2 = pd.to_datetime(check["reference_date"], errors="coerce")

# 1) Dates within bounds (for non-null)
assert ((ref2.dropna() >= lower) & (ref2.dropna() <= q_end)).all(), "Found dates outside bounds."

# 2) No revenue == 0
rev2 = pd.to_numeric(check["revenue"], errors="coerce")
assert not (rev2 == 0).any(), "Found rows with revenue == 0."

print(f"FILTER check passed. Remaining rows: {len(check)}")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Dropped 31 rows outside [1980-01-01 .. 2025-12-31] or with revenue == 0.
FILTER check passed. Remaining rows: 4925
unique_deals: 1163


In [130]:
from pathlib import Path
import pandas as pd
import numpy as np

#----- Filter out rows that have no revenue or no EBITDA -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

# Temporary numeric views for presence checks (does not change df)
def num(s):
    return pd.to_numeric(s, errors="coerce")

rev_num   = num(df["revenue"])
ebitda_num= num(df["ebitda"])
ev_num    = num(df["enterprise_value"])
nd_num    = num(df["net_debt"])
eq_num    = num(df["equity"])

# Presence logic
rev_ok    = rev_num.notna()
ebitda_ok = ebitda_num.notna()
trio_non_null = ev_num.notna().astype(int) + nd_num.notna().astype(int) + eq_num.notna().astype(int)
trio_ok   = trio_non_null >= 2

keep_mask = rev_ok & ebitda_ok & trio_ok

# Preserve original order exactly
df["_ord"] = np.arange(len(df))
after = df.loc[keep_mask].sort_values("_ord").drop(columns="_ord").reset_index(drop=True)

dropped = before_rows - len(after)
print(f"Filtering out {dropped} rows (kept {len(after)} of {before_rows}).")

# Save
after.to_csv(TARGET_CSV, index=False)

import pandas as pd
import numpy as np

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
# Recompute numeric presence on the saved data for verification
rev_num   = pd.to_numeric(check["revenue"], errors="coerce")
ebitda_num= pd.to_numeric(check["ebitda"], errors="coerce")
ev_num    = pd.to_numeric(check["enterprise_value"], errors="coerce")
nd_num    = pd.to_numeric(check["net_debt"], errors="coerce")
eq_num    = pd.to_numeric(check["equity"], errors="coerce")

assert rev_num.notna().all(), "Found rows with empty revenue after filtering."
assert ebitda_num.notna().all(), "Found rows with empty ebitda after filtering."
assert ((ev_num.notna().astype(int) + nd_num.notna().astype(int) + eq_num.notna().astype(int)) >= 2).all(), \
       "Found rows with fewer than two of [enterprise_value, net_debt, equity] present."

print("FILTER check passed. Shape:", check.shape)

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Filtering out 2760 rows (kept 2165 of 4925).
FILTER check passed. Shape: (2165, 28)
unique_deals: 831


In [131]:
from pathlib import Path
import pandas as pd
import numpy as np

#----- Calculate missing EV-EqV bridge items -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

# Numeric views for calculations (do not mutate these)
ev0 = pd.to_numeric(df["enterprise_value"], errors="coerce")
nd0 = pd.to_numeric(df["net_debt"], errors="coerce")
eq0 = pd.to_numeric(df["equity"], errors="coerce")

# Count how many of the trio are present (non-null)
present_cnt = ev0.notna().astype(int) + nd0.notna().astype(int) + eq0.notna().astype(int)
calc_mask = (present_cnt == 2)

# Initialize flag: Yes if we will compute a missing third, else No
df["EV_bridge_calc"] = np.where(calc_mask, "Yes", "No")

# Compute missing field using the other two (using original, unmodified numeric series)
need_ev = calc_mask & ev0.isna()
need_nd = calc_mask & nd0.isna()
need_eq = calc_mask & eq0.isna()

df.loc[need_ev, "enterprise_value"] = (eq0 + nd0)[need_ev]
df.loc[need_nd, "net_debt"]         = (ev0 - eq0)[need_nd]
df.loc[need_eq, "equity"]           = (ev0 - nd0)[need_eq]

# Persist
df.to_csv(TARGET_CSV, index=False)
print(f"Computed missing EV/NetDebt/Equity for {calc_mask.sum()} rows. Added 'EV_bridge_calc'.")

import pandas as pd
import numpy as np

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Re-evaluate presence after computation
ev = pd.to_numeric(check["enterprise_value"], errors="coerce")
nd = pd.to_numeric(check["net_debt"], errors="coerce")
eq = pd.to_numeric(check["equity"], errors="coerce")
present_cnt_after = ev.notna().astype(int) + nd.notna().astype(int) + eq.notna().astype(int)

# 1) Rows that were marked "Yes" now have all three present
yes_mask = (check["EV_bridge_calc"] == "Yes")
assert (present_cnt_after[yes_mask] == 3).all(), "Some 'Yes' rows still missing a value."

# 2) All rows still meet the earlier rule (>= 2 present)
assert (present_cnt_after >= 2).all(), "Found rows with fewer than two of the trio present."

# 3) Row count preserved
assert len(check) == before_rows, "Row count changed."

print(f"COMPUTE check passed. Bridges computed: {yes_mask.sum()}, total rows: {len(check)}.")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Computed missing EV/NetDebt/Equity for 1019 rows. Added 'EV_bridge_calc'.
COMPUTE check passed. Bridges computed: 1019, total rows: 2165.
unique_deals: 831


In [132]:
from pathlib import Path
import pandas as pd
import numpy as np

#----- Flag faulty EV-EqV bridges -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Numeric views
ev = pd.to_numeric(df["enterprise_value"], errors="coerce")
nd = pd.to_numeric(df["net_debt"], errors="coerce")
eq = pd.to_numeric(df["equity"], errors="coerce")

# Require all three present
all3 = ev.notna() & nd.notna() & eq.notna()
# Differences for the three identities
d_ev = ev - (eq + nd)
d_nd = nd - (ev - eq)
d_eq = eq - (ev - nd)

# Tolerance: 3% of |EV| with a small absolute floor
abs_floor = 1e-6
tol = np.maximum(0.03 * ev.abs(), abs_floor)
ok_ev = all3 & (d_ev.abs() <= tol)
ok_nd = all3 & (d_nd.abs() <= tol)
ok_eq = all3 & (d_eq.abs() <= tol)

ok_bridge = ok_ev & ok_nd & ok_eq

# Anything not all3 or exceeding tolerance -> Error
df["EV_bridge_error"] = np.where(ok_bridge, "Ok", "Error")

# Persist
df.to_csv(TARGET_CSV, index=False)
ok_count = (df["EV_bridge_error"] == "Ok").sum()
err_count = (df["EV_bridge_error"] == "Error").sum()
print(f"Bridge check (3% of |EV| tolerance): Ok={ok_count}, Error={err_count}, Total={len(df)}")

import pandas as pd

check = pd.read_csv(TARGET_CSV, dtype={"id": str})
vc = check["EV_bridge_error"].value_counts()
ok = int(vc.get("Ok", 0))
err = int(vc.get("Error", 0))
assert ok + err == len(check), "Flags do not cover all rows."
print(f"Check passed. Ok={ok}, Error={err}, Total={len(check)}")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Bridge check (3% of |EV| tolerance): Ok=1856, Error=309, Total=2165
Check passed. Ok=1856, Error=309, Total=2165
unique_deals: 831


In [133]:
from pathlib import Path
import pandas as pd

#----- Remove faulty EV-EqV bridges -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

df = pd.read_csv(TARGET_CSV, dtype={"id": str})
before = len(df)

keep_mask = df["EV_bridge_error"] != "Error"
after = df.loc[keep_mask].reset_index(drop=True)

dropped = before - len(after)
print(f"Dropped {dropped} rows with EV_bridge_error == 'Error'. Kept {len(after)} of {before}.")

after.to_csv(TARGET_CSV, index=False)

check = pd.read_csv(TARGET_CSV, dtype={"id": str})
assert (check["EV_bridge_error"] != "Error").all(), "Found remaining rows with EV_bridge_error == 'Error'."
print("FILTER check passed. Rows:", len(check))

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Dropped 309 rows with EV_bridge_error == 'Error'. Kept 1856 of 2165.
FILTER check passed. Rows: 1856
unique_deals: 751


In [134]:
from pathlib import Path
import pandas as pd
import numpy as np

#----- Entry/ exit date <---> financial data matching -----#

#-----
"""Every unique deal_id with holding_status == "exited" should have only two rows (i.e. id). And these rows should be the ones with the reference date matched as closely as possible to the entry and exit date, if a reference_date exists within a +-3 month window. All deal_id with holding_status == "unexited" should not be touched by the exit operation."""
#-----
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Parse dates
df["_ref_dt"]   = pd.to_datetime(df["reference_date"], errors="coerce")
df["_entry_dt"] = pd.to_datetime(df["entry_date"],    errors="coerce")
df["_exit_dt"]  = pd.to_datetime(df["exit_date"],     errors="coerce")

# Masks
exited    = df["holding_status"] == "exited"
unexited  = df["holding_status"] == "unexited"

# --- Helper: select closest row to a target date within ±3 calendar months (inclusive)
def select_closest_within_window(frame: pd.DataFrame, target_col: str, ref_col: str = "_ref_dt"):
    """Return a Series indexed by deal_id with the winning 'id' (closest ref within ±3 months)."""
    # Target per deal
    tgt = frame.groupby("deal_id")[target_col].transform("first")
    start = tgt - pd.DateOffset(months=3)
    end   = tgt + pd.DateOffset(months=3)

    in_window = frame[ref_col].between(start, end, inclusive="both")

    # Tie-breakers: 1) min abs diff  2) prefer on/after target  3) earliest ref
    tmp = frame.loc[in_window, ["deal_id", "id", ref_col, target_col]].copy()
    tmp["_abs_diff_days"]  = (tmp[ref_col] - tmp[target_col]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp[ref_col] >= tmp[target_col]).astype(int)

    tmp_sorted = tmp.sort_values(
        ["deal_id", "_abs_diff_days", "_is_after_or_eq", ref_col],
        ascending=[True, True, False, True],
    )
    # Winner per deal_id
    winners = tmp_sorted.groupby("deal_id", sort=False)["id"].first()
    return winners  # index: deal_id, values: id

# Only operate on exited deals
df_ex = df.loc[exited].copy()

# Entry winner (requires entry_date)
entry_winners = select_closest_within_window(df_ex, target_col="_entry_dt")

# Exit winner (requires exit_date)
exit_winners = select_closest_within_window(df_ex, target_col="_exit_dt")

# Deals must have both winners to survive
entry_ok_deals = set(entry_winners.index)
exit_ok_deals  = set(exit_winners.index)
survivor_deals = entry_ok_deals & exit_ok_deals

# --- NEW: drop deals where entry winner == exit winner (entry=exit)
coincident_deals = {d for d in survivor_deals if entry_winners[d] == exit_winners[d]}
if coincident_deals:
    print(f"Removing {len(coincident_deals)} exited deal(s) where entry and exit map to the same id.")
survivor_deals = survivor_deals - coincident_deals
# --- END NEW

# Build set of ids to keep for exited deals: union of entry+exit winners per surviving deal
keep_ids_exited = set(entry_winners.loc[list(survivor_deals)].tolist()) | set(
    exit_winners.loc[list(survivor_deals)].tolist()
)


# Final keep mask:
# - keep all rows for unexited deals (untouched)
# - for exited deals: keep only winner ids; drop entire deal if not in survivor_deals
keep_mask = unexited | (exited & df["deal_id"].isin(survivor_deals) & df["id"].isin(keep_ids_exited))

before_rows = len(df)
before_deals_ex = df.loc[exited, "deal_id"].nunique()

out = df.loc[keep_mask].copy()

# Drop helpers
out = out.drop(columns=[c for c in ["_ref_dt","_entry_dt","_exit_dt"] if c in out.columns])

# Save
out.to_csv(TARGET_CSV, index=False)

# Reporting
after_rows = len(out)
after_deals_ex = out.loc[out["holding_status"]=="exited", "deal_id"].nunique()
dropped_exited_deals = before_deals_ex - after_deals_ex
print(
    f"Exited deals kept: {after_deals_ex} (dropped {dropped_exited_deals} with no entry/exit match in ±3 months). "
    f"Rows now: {after_rows} (from {before_rows})."
)


import pandas as pd

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

ex_mask = check["holding_status"] == "exited"
un_mask = check["holding_status"] == "unexited"

# 1) Unexited deals untouched in cardinality of rows per deal (relative order not asserted here)

rows_per_deal = check.loc[ex_mask].groupby("deal_id")["id"].nunique()
assert (rows_per_deal == 2).all(), "Found exited deals with != 2 kept rows."


# 3) Verify kept rows are within ±3 months of the respective target dates
ck = check.loc[ex_mask].copy()
ref = pd.to_datetime(ck["reference_date"], errors="coerce")
ent = pd.to_datetime(ck["entry_date"], errors="coerce")
exi = pd.to_datetime(ck["exit_date"],  errors="coerce")

# Tag each row as 'entry_candidate' or 'exit_candidate' by closeness
abs_diff_entry = (ref - ent).abs()
abs_diff_exit  = (ref - exi).abs()
is_entry_like = abs_diff_entry <= abs_diff_exit

# Both must be within ±3 months relative to whichever they represent.
from pandas import DateOffset
ok_window = (
    (is_entry_like & ref.between(ent - DateOffset(months=3), ent + DateOffset(months=3), inclusive="both")) |
    (~is_entry_like & ref.between(exi - DateOffset(months=3), exi + DateOffset(months=3), inclusive="both"))
)
assert ok_window.all(), "Kept exited rows outside ±3 months window."

# 4) Summarize counts
two_rows = int((rows_per_deal == 2).sum())
one_row  = int((rows_per_deal == 1).sum())
print(f"Check passed. Exited deals with 2 rows: {two_rows}; with 1 row (entry=exit candidate): {one_row}.")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Removing 2 exited deal(s) where entry and exit map to the same id.
Exited deals kept: 276 (dropped 135 with no entry/exit match in ±3 months). Rows now: 1471 (from 1856).
Check passed. Exited deals with 2 rows: 276; with 1 row (entry=exit candidate): 0.
unique_deals: 616


In [135]:
from pathlib import Path
import pandas as pd
import numpy as np

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Parse dates
df["_ref_dt"]   = pd.to_datetime(df["reference_date"], errors="coerce")
df["_entry_dt"] = pd.to_datetime(df["entry_date"],    errors="coerce")

today = pd.Timestamp.today().normalize()

is_unexited = df["holding_status"] == "unexited"
is_exited   = df["holding_status"] == "exited"

# Helper: closest row to target within ±3 calendar months (inclusive)
def select_closest_within_window(frame: pd.DataFrame, target_col: str, ref_col: str = "_ref_dt"):
    """Return Series indexed by deal_id with the winning 'id'."""
    tgt = frame.groupby("deal_id")[target_col].transform("first")
    start = tgt - pd.DateOffset(months=3)
    end   = tgt + pd.DateOffset(months=3)
    in_window = frame[ref_col].between(start, end, inclusive="both")

    tmp = frame.loc[in_window, ["deal_id", "id", ref_col, target_col]].copy()
    if tmp.empty:
        return pd.Series(dtype=object)

    tmp["_abs_diff_days"]  = (tmp[ref_col] - tmp[target_col]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp[ref_col] >= tmp[target_col]).astype(int)

    tmp_sorted = tmp.sort_values(
        ["deal_id", "_abs_diff_days", "_is_after_or_eq", ref_col],
        ascending=[True, True, False, True],
    )
    return tmp_sorted.groupby("deal_id", sort=False)["id"].first()

# Work only on unexited deals
df_un = df.loc[is_unexited].copy()

# Entry winner within ±3 months (required)
entry_winners_un = select_closest_within_window(df_un, target_col="_entry_dt")
entry_ok_deals = set(entry_winners_un.index)

# Latest ref_date ≤ today (required)
ref_le_today = df_un[df_un["_ref_dt"] <= today].copy()
latest_ids = (
    ref_le_today.sort_values(["deal_id", "_ref_dt"], ascending=[True, False])
                .groupby("deal_id", sort=False)["id"].first()
)
latest_ok_deals = set(latest_ids.index)

# Survivors must have both entry match and a latest≤today
survivor_deals = entry_ok_deals & latest_ok_deals

# Drop deals where entry winner id == latest id
coincident = {d for d in survivor_deals if entry_winners_un[d] == latest_ids[d]}
survivor_deals -= coincident

# Keep exactly the two ids (entry+latest) for survivors; leave exited deals untouched
keep_ids_un = set(entry_winners_un.loc[list(survivor_deals)].tolist()) | set(latest_ids.loc[list(survivor_deals)].tolist())

keep_mask = is_exited | (is_unexited & df["deal_id"].isin(survivor_deals) & df["id"].isin(keep_ids_un))

before_rows = len(df)
out = df.loc[keep_mask].copy()

# Drop helpers and save
out = out.drop(columns=[c for c in ["_ref_dt","_entry_dt"] if c in out.columns])
out.to_csv(TARGET_CSV, index=False)

print(f"Unexited: survivors={len(survivor_deals)}, dropped_coincident={len(coincident)}, rows_now={len(out)} (from {before_rows}).")

import pandas as pd

check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
un_mask = check["holding_status"] == "unexited"
ex_mask = check["holding_status"] == "exited"

# Unexited: exactly 2 rows per deal_id
rows_per_un = check.loc[un_mask].groupby("deal_id")["id"].nunique()
assert (rows_per_un == 2).all(), "Unexited deals must have exactly 2 rows."

# Validate the two rows are entry-match and latest≤today
ck_un = check.loc[un_mask].copy()
ck_un["_ref_dt"] = pd.to_datetime(ck_un["reference_date"], errors="coerce")
ck_un["_entry_dt"] = pd.to_datetime(ck_un["entry_date"], errors="coerce")

# Recompute entry winners for verification
def entry_winner_verify(frame):
    from pandas import DateOffset
    tgt = frame.groupby("deal_id")["_entry_dt"].transform("first")
    start = tgt - DateOffset(months=3)
    end   = tgt + DateOffset(months=3)
    in_window = frame["_ref_dt"].between(start, end, inclusive="both")
    tmp = frame.loc[in_window, ["deal_id","id","_ref_dt","_entry_dt"]].copy()
    tmp["_abs_diff_days"]  = (tmp["_ref_dt"] - tmp["_entry_dt"]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp["_ref_dt"] >= tmp["_entry_dt"]).astype(int)
    tmp = tmp.sort_values(["deal_id","_abs_diff_days","_is_after_or_eq","_ref_dt"],
                          ascending=[True, True, False, True])
    return tmp.groupby("deal_id")["id"].first()

entry_verify = entry_winner_verify(ck_un)
latest_verify = (ck_un[ck_un["_ref_dt"] <= pd.Timestamp.today().normalize()]
                 .sort_values(["deal_id","_ref_dt"], ascending=[True, False])
                 .groupby("deal_id")["id"].first())

for d, grp in ck_un.groupby("deal_id"):
    ids = set(grp["id"])
    assert d in entry_verify.index and d in latest_verify.index, f"Deal {d}: missing entry or latest id."
    assert entry_verify[d] in ids and latest_verify[d] in ids, f"Deal {d}: kept rows are not entry+latest."

# Exited: unchanged cardinality constraint (still ≤ 2)
rows_per_ex = check.loc[ex_mask].groupby("deal_id")["id"].nunique()
assert (rows_per_ex <= 2).all(), "Exited deals show >2 rows after unexited processing."

print("Unexited selection check passed.")

print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())


Unexited: survivors=202, dropped_coincident=43, rows_now=956 (from 1471).
Unexited selection check passed.
unique_deals: 478


In [136]:
from pathlib import Path
import pandas as pd
import numpy as np

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# --- Parse dates
df["_ref_dt"]   = pd.to_datetime(df["reference_date"], errors="coerce")
df["_entry_dt"] = pd.to_datetime(df["entry_date"],    errors="coerce")
df["_exit_dt"]  = pd.to_datetime(df["exit_date"],     errors="coerce")
today = pd.Timestamp.today().normalize()

# --- Numeric views
ev = pd.to_numeric(df["enterprise_value"], errors="coerce")
nd = pd.to_numeric(df["net_debt"],         errors="coerce")
eq = pd.to_numeric(df["equity"],           errors="coerce")
rev= pd.to_numeric(df["revenue"],          errors="coerce")
eb = pd.to_numeric(df["ebitda"],           errors="coerce")

# --- Bookkeeping
issues = {}

def flag(name, bad_index):
    bad_ids = df.loc[bad_index, "deal_id"].unique().tolist()
    issues[name] = bad_ids

# 1) Exactly two rows per deal_id
rows_per_deal = df.groupby("deal_id")["id"].nunique()
bad = rows_per_deal.index[rows_per_deal != 2]
flag("not_exactly_two_rows", df["deal_id"].isin(bad))

# 2) Required fields present and finite; revenue != 0
req_na = ev.isna() | nd.isna() | eq.isna() | rev.isna() | eb.isna()
flag("missing_required_fields", req_na)
flag("revenue_zero", (rev == 0))

# 3) Date-window checks
is_exited   = df["holding_status"] == "exited"
is_unexited = df["holding_status"] == "unexited"

# Windows (inclusive ±3 calendar months)
from pandas import DateOffset
entry_start = df["_entry_dt"] - DateOffset(months=3)
entry_end   = df["_entry_dt"] + DateOffset(months=3)
exit_start  = df["_exit_dt"]  - DateOffset(months=3)
exit_end    = df["_exit_dt"]  + DateOffset(months=3)

in_entry_win = df["_ref_dt"].between(entry_start, entry_end, inclusive="both")
in_exit_win  = df["_ref_dt"].between(exit_start,  exit_end,  inclusive="both")
le_today     = df["_ref_dt"] <= today

# 3a) Exited: exactly one entry-window row AND exactly one exit-window row per deal
ex = df[is_exited].copy()
ex_grp = ex.groupby("deal_id", as_index=False)
ex_count_entry = ex_grp["id"].apply(lambda s: in_entry_win.loc[s.index].sum()).set_index("deal_id")["id"]
ex_count_exit  = ex_grp["id"].apply(lambda s: in_exit_win.loc[s.index].sum()).set_index("deal_id")["id"]
bad_ex_entry = ex_count_entry.index[ex_count_entry != 1]
bad_ex_exit  = ex_count_exit.index[ex_count_exit != 1]
flag("exited_entry_window_count_!=1", df["deal_id"].isin(bad_ex_entry) & is_exited)
flag("exited_exit_window_count_!=1",  df["deal_id"].isin(bad_ex_exit)  & is_exited)

# 3b) Unexited: exactly one entry-window row; exactly one latest<=today row; they must be different
un = df[is_unexited].copy()
un_grp = un.groupby("deal_id", as_index=False)

un_count_entry = un_grp["id"].apply(lambda s: in_entry_win.loc[s.index].sum()).set_index("deal_id")["id"]
bad_un_entry_count = un_count_entry.index[un_count_entry != 1]
flag("unexited_entry_window_count_!=1", df["deal_id"].isin(bad_un_entry_count) & is_unexited)

# latest<=today id per unexited deal
un_le_today = un[le_today.loc[un.index]].copy()
latest_id = (un_le_today.sort_values(["deal_id","_ref_dt"], ascending=[True, False])
                       .groupby("deal_id")["id"].first())

# deals lacking any ref_date <= today
un_deals = un["deal_id"].unique()
missing_latest = [d for d in un_deals if d not in latest_id.index]
flag("unexited_missing_latest_le_today", df["deal_id"].isin(missing_latest) & is_unexited)

# entry-winner id per unexited deal (pick closest within window as earlier)
def entry_winner_ids(frame):
    tmp = frame.copy()
    tmp = tmp[in_entry_win.loc[tmp.index]]
    if tmp.empty:
        return pd.Series(dtype=object)
    tmp["_abs_diff_days"]  = (tmp["_ref_dt"] - tmp["_entry_dt"]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp["_ref_dt"] >= tmp["_entry_dt"]).astype(int)
    tmp = tmp.sort_values(["deal_id","_abs_diff_days","_is_after_or_eq","_ref_dt"],
                          ascending=[True, True, False, True])
    return tmp.groupby("deal_id")["id"].first()

entry_id_un = entry_winner_ids(un)

# Check distinctness and membership
bad_un_distinct = []
bad_un_membership = []
for d, sub in un.groupby("deal_id"):
    if d in bad_un_entry_count or d in missing_latest:
        continue  # already flagged above
    ids = set(sub["id"])
    e_id = entry_id_un.get(d, None)
    l_id = latest_id.get(d, None)
    if e_id is None or l_id is None:
        continue
    if e_id == l_id:
        bad_un_distinct.append(d)
    if e_id not in ids or l_id not in ids:
        bad_un_membership.append(d)

flag("unexited_entry_equals_latest", df["deal_id"].isin(bad_un_distinct) & is_unexited)
flag("unexited_missing_entry_or_latest_row", df["deal_id"].isin(bad_un_membership) & is_unexited)

# 4) EV–EqV–ND bridge with 3%*|EV| tolerance (independent identities)
abs_floor = 1e-9
tol = np.maximum(0.03 * ev.abs(), abs_floor)
bad_ev = (ev - (eq + nd)).abs() > tol
bad_nd = (nd - (ev - eq)).abs() > tol
bad_eq = (eq - (ev - nd)).abs() > tol
flag("ev_bridge_ev_vs_eq_plus_nd", bad_ev)
flag("ev_bridge_nd_vs_ev_minus_eq", bad_nd)
flag("ev_bridge_eq_vs_ev_minus_nd", bad_eq)

# 5) Basic key integrity
flag("id_not_unique", df["id"].duplicated(keep=False))
flag("deal_id_null", df["deal_id"].isna())

# --- Summary
total_deals = df["deal_id"].nunique()
failed = {k: len(v) for k, v in issues.items() if v}
print(f"Deals in sample: {total_deals}")
if failed:
    for k, n in failed.items():
        print(f"- {k}: {n} deal(s)")
    # Optional: assert no failures
    # assert False, "QA failed; see counts above."
else:
    print("QA passed: all checks satisfied.")


Deals in sample: 478
- unexited_entry_window_count_!=1: 7 deal(s)
