In [1]:
# === Holding_status, date/revenue filters, EV bridge compute/flag/filter ===

from pathlib import Path
import pandas as pd
import numpy as np

# ---- helper ----
def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    here = Path.cwd()
    for parent in [here, *here.parents][: max_up + 1]:
        candidate = (parent / rel_path)
        if candidate.exists():
            return candidate.resolve()
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str((p / rel_path)) for p in [here, *here.parents][: max_up + 1]]}"
    )

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# ===================== 0) Optional: per-deal minimum row guard (default OFF) =====================
ENFORCE_MIN_ROWS_PER_DEAL = False   # optional safeguard; default OFF → no effect on output
MIN_ROWS_PER_DEAL = 3              # “>2 would be correct” ⇒ at least 3 rows per deal

def maybe_enforce_min_rows(frame: pd.DataFrame, label: str) -> pd.DataFrame:
    if not ENFORCE_MIN_ROWS_PER_DEAL:
        return frame
    counts = frame.groupby("deal_id")["id"].count()
    keep_deals = counts[counts >= MIN_ROWS_PER_DEAL].index
    before = frame["deal_id"].nunique()
    out = frame[frame["deal_id"].isin(keep_deals)].copy()
    print(f"{label}: Min-row guard kept {len(keep_deals)} deals (≥{MIN_ROWS_PER_DEAL} rows); "
          f"dropped {before - len(keep_deals)}.")
    return out

# ===================== 1) Add holding_status (+ optional exited-only filter) =====================
HOLDING_FILTER_MODE = "both"   # options: "both" (default) | "exited_only"

df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
df["_ord"] = np.arange(len(df))  # preserve original row order

if "exit_date" not in df.columns:
    raise KeyError("Column 'exit_date' not found in working.csv. Run the earlier ADD_COLUMNS step from 'deal' first.")

# holding status (SAFER: parse to datetime and test .notna())
norm_exit = pd.to_datetime(df["exit_date"], errors="coerce")
is_exited = norm_exit.notna()
df["holding_status"] = is_exited.map({True: "exited", False: "unexited"})

# enforce deal-level consistency before any filtering
status_per_deal = df.groupby("deal_id")["holding_status"].nunique(dropna=False)
mixed = status_per_deal[status_per_deal > 1]
assert mixed.empty, f"{len(mixed)} deal_id(s) have mixed exited/unexited rows."

# optional: exited-only filter (deal-wide)
if HOLDING_FILTER_MODE == "exited_only":
    deal_first_status = df.groupby("deal_id")["holding_status"].first()
    keep_deals = set(deal_first_status[deal_first_status == "exited"].index)
    before_rows, before_deals = len(df), df["deal_id"].nunique()
    df = df[df["deal_id"].isin(keep_deals)].copy()
    df = df.sort_values("_ord").drop(columns="_ord").reset_index(drop=True)
    print(f"Mode=exited_only: kept {len(keep_deals)} exited deal(s); "
          f"dropped {before_deals - len(keep_deals)} unexited. Rows now {len(df)} (from {before_rows}).")
else:
    df = df.sort_values("_ord").drop(columns="_ord").reset_index(drop=True)
    print("Mode=both: no deal-level filtering applied.")

# persist
df.to_csv(TARGET_CSV, index=False)

# post-write checks
check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
exited_count = (check["holding_status"] == "exited").sum()
unexited_count = (check["holding_status"] == "unexited").sum()
total_after = len(check)
assert exited_count + unexited_count == total_after, "Status coverage failed."

status_per_deal = check.groupby("deal_id")["holding_status"].nunique(dropna=False)
assert (status_per_deal <= 1).all(), "Found deal(s) with mixed statuses after filtering."

if HOLDING_FILTER_MODE == "exited_only":
    deal_status = check.groupby("deal_id")["holding_status"].first()
    assert (deal_status == "exited").all(), "Non-exited deal(s) remain in exited_only mode."

# report unique deals by status
by_status = (check.drop_duplicates(["deal_id", "holding_status"])
                   .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    check["deal_id"].nunique()
)

# ===================== 2) Filter unreasonable dates =====================
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

ref = pd.to_datetime(df["reference_date"], errors="coerce")
lower = pd.Timestamp(1976, 1, 1).normalize()
q_end = pd.Timestamp.today().to_period("Q-DEC").end_time.normalize()
date_ok = ref.notna() & (ref >= lower) & (ref <= q_end)

df["_ord"] = np.arange(len(df))
after = df.loc[date_ok].sort_values("_ord").drop(columns="_ord").reset_index(drop=True)
after = maybe_enforce_min_rows(after, label="FILTER(dates)")  # optional; no-op by default
dropped = before_rows - len(after)
print(
    f"Dropped {dropped} rows due to missing/out-of-range reference_date. "
    f"Kept range [{lower.date()} .. {q_end.date()}]."
)
after.to_csv(TARGET_CSV, index=False)

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
ref2 = pd.to_datetime(check["reference_date"], errors="coerce")
assert ref2.notna().all(), "Found rows with null reference_date."
assert ((ref2 >= lower) & (ref2 <= q_end)).all(), "Found dates outside bounds."
print(f"FILTER (dates) check passed. Remaining rows: {len(check)}")

p = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
g = pd.read_csv(p, dtype={"deal_id": str})
by_status = (g.drop_duplicates(["deal_id", "holding_status"])
               .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)

# ===================== 3) Filter for revenue>0, EBITDA (toggle), and <2 of [EV/ND/Eq] =============
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

# ---- Toggle: Set to True to require strictly positive EBITDA; False to only require non-missing EBITDA
REQUIRE_POSITIVE_EBITDA = True

def num(s): return pd.to_numeric(s, errors="coerce")
rev_num    = num(df["revenue"])
ebitda_num = num(df["ebitda"])
ev_num     = num(df["enterprise_value"])
nd_num     = num(df["net_debt"])
eq_num     = num(df["equity"])

rev_ok    = rev_num > 0
if REQUIRE_POSITIVE_EBITDA:
    ebitda_ok = ebitda_num > 0
else:
    ebitda_ok = ebitda_num.notna()

trio_non_null = ev_num.notna().astype(int) + nd_num.notna().astype(int) + eq_num.notna().astype(int)
trio_ok       = trio_non_null >= 2

keep_mask = rev_ok & ebitda_ok & trio_ok

df["_ord"] = np.arange(len(df))
after = df.loc[keep_mask].sort_values("_ord").drop(columns="_ord").reset_index(drop=True)
after = maybe_enforce_min_rows(after, label="FILTER(revenue/EBITDA/trio)")  # optional; no-op by default
dropped = before_rows - len(after)
mode_str = "EBITDA>0" if REQUIRE_POSITIVE_EBITDA else "non-missing EBITDA"
print(f"Filtering out {dropped} rows (kept {len(after)} of {before_rows}) using revenue>0, {mode_str}, and ≥2 of [EV, ND, Eq].")
after.to_csv(TARGET_CSV, index=False)

check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
rev_num    = pd.to_numeric(check["revenue"], errors="coerce")
ebitda_num = pd.to_numeric(check["ebitda"], errors="coerce")
ev_num     = pd.to_numeric(check["enterprise_value"], errors="coerce")
nd_num     = pd.to_numeric(check["net_debt"], errors="coerce")
eq_num     = pd.to_numeric(check["equity"], errors="coerce")

# Assertions aligned with the new rules
assert (rev_num > 0).all(), "Found rows with revenue <= 0 or non-numeric."
if REQUIRE_POSITIVE_EBITDA:
    assert (ebitda_num > 0).all(), "Found rows with EBITDA <= 0 or non-numeric."
else:
    assert ebitda_num.notna().all(), "Found rows with empty EBITDA after filtering."
assert ((ev_num.notna().astype(int) + nd_num.notna().astype(int) + eq_num.notna().astype(int)) >= 2).all(), \
       "Found rows with fewer than two of [enterprise_value, net_debt, equity] present."
print("FILTER (revenue/EBITDA/trio) check passed. Shape:", check.shape)

p = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
g = pd.read_csv(p, dtype={"deal_id": str})
by_status = (g.drop_duplicates(["deal_id", "holding_status"])
               .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)

# ==== 4) EV / ND / Equity bridge: compute missing, flag, and filter (consolidated) ====

ENFORCE_POSITIVE_EV_AND_EQ = True  # enforces EV>0, Eq>0, ND>0 when True

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# -- load & preserve order
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
df["_ord"] = np.arange(len(df))
before_rows = len(df); before_deals = df["deal_id"].nunique()

# -- numeric views
ev0 = pd.to_numeric(df["enterprise_value"], errors="coerce")
nd0 = pd.to_numeric(df["net_debt"],         errors="coerce")
eq0 = pd.to_numeric(df["equity"],           errors="coerce")

# -- (1) compute missing member when exactly two present
present_cnt = ev0.notna().astype(int) + nd0.notna().astype(int) + eq0.notna().astype(int)
calc_mask = (present_cnt == 2)
df["EV_bridge_calc"] = np.where(calc_mask, "Yes", "No")

need_ev = calc_mask & ev0.isna()
need_nd = calc_mask & nd0.isna()
need_eq = calc_mask & eq0.isna()
if need_ev.any(): df.loc[need_ev, "enterprise_value"] = (eq0 + nd0)[need_ev]
if need_nd.any(): df.loc[need_nd, "net_debt"]         = (ev0 - eq0)[need_nd]
if need_eq.any(): df.loc[need_eq, "equity"]           = (ev0 - nd0)[need_eq]

# -- (2) flag residual vs. EV with tolerance (hardcoded, inclusive)
ev = pd.to_numeric(df["enterprise_value"], errors="coerce")
nd = pd.to_numeric(df["net_debt"],         errors="coerce")
eq = pd.to_numeric(df["equity"],           errors="coerce")

all3 = ev.notna() & nd.notna() & eq.notna()
residual = ev - (eq + nd)
tol = 1001.0  # units; inclusive keeps residual==1000
ok = all3 & (residual.abs() <= tol)

df["EV_bridge_error"] = np.where(ok, "Ok", "Error")
df["EV_bridge_residual"] = residual

# -- (3) mandatory filter: remove faulty bridges (row-level)
before_faulty_rows = len(df)
df_ok = df.loc[df["EV_bridge_error"] != "Error"].copy()
dropped_faulty = before_faulty_rows - len(df_ok)

# -- (4) optional deal-wide positivity filter on EV, Equity, and Net Debt (AFTER fill & error drop)
if ENFORCE_POSITIVE_EV_AND_EQ:
    ev_pos = pd.to_numeric(df_ok["enterprise_value"], errors="coerce") > 0
    eq_pos = pd.to_numeric(df_ok["equity"],           errors="coerce") > 0
    nd_pos = pd.to_numeric(df_ok["net_debt"],         errors="coerce") > 0
    ok_row = ev_pos & eq_pos & nd_pos

    ok_deal = ok_row.groupby(df_ok["deal_id"]).all()
    keep_deals = set(ok_deal[ok_deal].index)

    before_pos_deals = df_ok["deal_id"].nunique()
    df_ok = df_ok[df_ok["deal_id"].isin(keep_deals)].copy()
    dropped_pos_deals = before_pos_deals - len(keep_deals)
else:
    dropped_pos_deals = 0

# -- optional: enforce minimum rows per deal (default OFF)
df_ok = maybe_enforce_min_rows(df_ok, label="EV-bridge/positivity")

# -- persist in original order
df_ok = df_ok.sort_values("_ord").drop(columns="_ord").reset_index(drop=True)
df_ok.to_csv(TARGET_CSV, index=False)

# -- checks
check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
ev_c = pd.to_numeric(check["enterprise_value"], errors="coerce")
nd_c = pd.to_numeric(check["net_debt"],         errors="coerce")
eq_c = pd.to_numeric(check["equity"],           errors="coerce")

present_cnt_after = ev_c.notna().astype(int) + nd_c.notna().astype(int) + eq_c.notna().astype(int)

yes_mask = (check.get("EV_bridge_calc", pd.Series(index=check.index, data="No")) == "Yes")
assert (present_cnt_after[yes_mask] == 3).all(), "Some 'Yes' rows still missing EV/ND/Eq."
assert (present_cnt_after >= 2).all(), "Found rows with fewer than two of [EV, ND, Eq]."
assert (check["EV_bridge_error"] != "Error").all(), "Faulty bridges remain after filter."

if ENFORCE_POSITIVE_EV_AND_EQ:
    assert (ev_c > 0).all() and (eq_c > 0).all() and (nd_c > 0).all(), "Non-positive EV/Eq/ND survived positivity filter."

# -- reporting
print(f"Computed missing EV/ND/Eq where exactly two present. calc_flag rows: {int(calc_mask.sum())}")
print(f"Dropped {dropped_faulty} rows with EV_bridge_error == 'Error'.")
if ENFORCE_POSITIVE_EV_AND_EQ:
    print(f"Dropped {dropped_pos_deals} deal(s) due to EV<=0 or Equity<=0 or NetDebt<=0 in any row.")
print(f"Rows now: {len(check)} (from {before_rows}); deals now: {check['deal_id'].nunique()} (from {before_deals}).")

by_status = (check.drop_duplicates(["deal_id","holding_status"])
                   .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    check["deal_id"].nunique()
)


Mode=both: no deal-level filtering applied.
unique_deals_exited: 560 unique_deals_unexited: 604 unique_deals_total: 1164
Dropped 13 rows due to missing/out-of-range reference_date. Kept range [1976-01-01 .. 2025-12-31].
FILTER (dates) check passed. Remaining rows: 4943
unique_deals_exited: 560 unique_deals_unexited: 604 unique_deals_total: 1164
Filtering out 3031 rows (kept 1912 of 4943) using revenue>0, EBITDA>0, and ≥2 of [EV, ND, Eq].
FILTER (revenue/EBITDA/trio) check passed. Shape: (1912, 28)
unique_deals_exited: 438 unique_deals_unexited: 357 unique_deals_total: 795
Computed missing EV/ND/Eq where exactly two present. calc_flag rows: 814
Dropped 471 rows with EV_bridge_error == 'Error'.
Dropped 113 deal(s) due to EV<=0 or Equity<=0 or NetDebt<=0 in any row.
Rows now: 1144 (from 1912); deals now: 505 (from 795).
unique_deals_exited: 266 unique_deals_unexited: 239 unique_deals_total: 505


In [2]:
# === Date-to-financial matching for exited and unexited deals and currency integrity ===

from pathlib import Path
import pandas as pd
import numpy as np

# ---- helper(s) ----
def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    here = Path.cwd()
    for parent in [here, *here.parents][: max_up + 1]:
        candidate = (parent / rel_path)
        if candidate.exists():
            return candidate.resolve()
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str((p / rel_path)) for p in [here, *here.parents][: max_up + 1]]}"
    )

def select_closest_within_window(frame: pd.DataFrame, target_col: str, ref_col: str = "_ref_dt"):
    tgt = frame.groupby("deal_id")[target_col].transform("first")
    start = tgt - pd.DateOffset(months=3)
    end   = tgt + pd.DateOffset(months=3)

    in_window = frame[ref_col].ge(start) & frame[ref_col].le(end)
    tmp = frame.loc[in_window, ["deal_id", "id", ref_col, target_col]].copy()
    if tmp.empty:
        return pd.Series(dtype=object)

    tmp["_abs_diff_days"]  = (tmp[ref_col] - tmp[target_col]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp[ref_col] >= tmp[target_col]).astype(int)

    tmp_sorted = tmp.sort_values(
        ["deal_id", "_abs_diff_days", "_is_after_or_eq", ref_col],
        ascending=[True, True, False, True],
    )
    winners = tmp_sorted.groupby("deal_id", sort=False)["id"].first()
    return winners  # index: deal_id, values: id

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# ===================== 1) Exited deals: keep entry+exit matched rows within ±3 months =====================
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Parse dates
df["_ref_dt"]   = pd.to_datetime(df["reference_date"], errors="coerce")
df["_entry_dt"] = pd.to_datetime(df["entry_date"],    errors="coerce")
df["_exit_dt"]  = pd.to_datetime(df["exit_date"],     errors="coerce")

# Masks
exited    = df["holding_status"] == "exited"
unexited  = df["holding_status"] == "unexited"

# Only operate on exited deals
df_ex = df.loc[exited].copy()

# Entry winner (requires entry_date)
entry_winners = select_closest_within_window(df_ex, target_col="_entry_dt")

# Exit winner (requires exit_date)
exit_winners = select_closest_within_window(df_ex, target_col="_exit_dt")

# Deals must have both winners to survive
entry_ok_deals = set(entry_winners.index)
exit_ok_deals  = set(exit_winners.index)
survivor_deals = entry_ok_deals & exit_ok_deals

# Drop deals where entry winner == exit winner (entry=exit)
coincident_deals = {d for d in survivor_deals if entry_winners[d] == exit_winners[d]}
if coincident_deals:
    print(f"Removing {len(coincident_deals)} exited deal(s) where entry and exit map to the same id.")
survivor_deals = survivor_deals - coincident_deals

# Keep set for exited deals: union of entry+exit winners
keep_ids_exited = set(entry_winners.loc[list(survivor_deals)].tolist()) | set(
    exit_winners.loc[list(survivor_deals)].tolist()
)

# Final keep mask:
# - keep all rows for unexited deals (untouched)
# - for exited deals: keep only winner ids; drop entire deal if not in survivor_deals
keep_mask = unexited | (exited & df["deal_id"].isin(survivor_deals) & df["id"].isin(keep_ids_exited))

before_rows = len(df)
before_deals_ex = df.loc[exited, "deal_id"].nunique()

out = df.loc[keep_mask].copy()

# Drop helpers
out = out.drop(columns=[c for c in ["_ref_dt","_entry_dt","_exit_dt"] if c in out.columns])

# Save
out.to_csv(TARGET_CSV, index=False)

# Reporting
after_rows = len(out)
after_deals_ex = out.loc[out["holding_status"]=="exited", "deal_id"].nunique()
dropped_exited_deals = before_deals_ex - after_deals_ex
print(
    f"Exited deals kept: {after_deals_ex} (dropped {dropped_exited_deals} with no entry/exit match in ±3 months). "
    f"Rows now: {after_rows} (from {before_rows})."
)

# --- Validation for exited ---
check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

ex_mask = check["holding_status"] == "exited"
un_mask = check["holding_status"] == "unexited"

# 1) Unexited deals untouched in cardinality of rows per deal (relative order not asserted here)

rows_per_deal = check.loc[ex_mask].groupby("deal_id")["id"].nunique()
assert (rows_per_deal == 2).all(), "Found exited deals with != 2 kept rows."

# 3) Verify kept rows are within ±3 months of the respective target dates
ck = check.loc[ex_mask].copy()
ref = pd.to_datetime(ck["reference_date"], errors="coerce")
ent = pd.to_datetime(ck["entry_date"], errors="coerce")
exi = pd.to_datetime(ck["exit_date"],  errors="coerce")

# Tag each row as 'entry_candidate' or 'exit_candidate' by closeness
abs_diff_entry = (ref - ent).abs()
abs_diff_exit  = (ref - exi).abs()
is_entry_like = abs_diff_entry <= abs_diff_exit

from pandas import DateOffset
ok_window = (
    (is_entry_like &
     ck["reference_date"].pipe(pd.to_datetime, errors="coerce").ge(ent - DateOffset(months=3)) &
     ck["reference_date"].pipe(pd.to_datetime, errors="coerce").le(ent + DateOffset(months=3)))
    |
    (~is_entry_like &
     ck["reference_date"].pipe(pd.to_datetime, errors="coerce").ge(exi - DateOffset(months=3)) &
     ck["reference_date"].pipe(pd.to_datetime, errors="coerce").le(exi + DateOffset(months=3)))
)

assert ok_window.all(), "Kept exited rows outside ±3 months window."

# 4) Summarize counts
two_rows = int((rows_per_deal == 2).sum())
one_row  = int((rows_per_deal == 1).sum())
print(f"Check passed. Exited deals with 2 rows: {two_rows}; with 1 row (entry=exit candidate): {one_row}.")

p = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
g = pd.read_csv(p, dtype={"deal_id": str})
by_status = (g.drop_duplicates(["deal_id", "holding_status"])
               .groupby("holding_status")["deal_id"].nunique())

print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)
print("unique_deals:", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"), dtype={"deal_id": str})["deal_id"].nunique())

# ===================== 2) Unexited deals: keep entry match (±3 months) + latest ref_date ≤ today =====================
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Parse dates
df["_ref_dt"]   = pd.to_datetime(df["reference_date"], errors="coerce")
df["_entry_dt"] = pd.to_datetime(df["entry_date"],    errors="coerce")

today = pd.Timestamp.today().normalize()

is_unexited = df["holding_status"] == "unexited"
is_exited   = df["holding_status"] == "exited"

# Work only on unexited deals
df_un = df.loc[is_unexited].copy()

# Entry winner within ±3 months (required)
entry_winners_un = select_closest_within_window(df_un, target_col="_entry_dt")
entry_ok_deals = set(entry_winners_un.index)

# Latest ref_date ≤ today (required)
ref_le_today = df_un[df_un["_ref_dt"] <= today].copy()
latest_ids = (
    ref_le_today.sort_values(["deal_id", "_ref_dt"], ascending=[True, False])
                .groupby("deal_id", sort=False)["id"].first()
)
latest_ok_deals = set(latest_ids.index)

# Survivors must have both entry match and a latest≤today
survivor_deals = entry_ok_deals & latest_ok_deals

# Drop deals where entry winner id == latest id
coincident = {d for d in survivor_deals if entry_winners_un[d] == latest_ids[d]}
survivor_deals -= coincident

# Keep exactly the two ids (entry+latest) for survivors; leave exited deals untouched
keep_ids_un = set(entry_winners_un.loc[list(survivor_deals)].tolist()) | set(latest_ids.loc[list(survivor_deals)].tolist())

keep_mask = is_exited | (is_unexited & df["deal_id"].isin(survivor_deals) & df["id"].isin(keep_ids_un))

before_rows = len(df)
out = df.loc[keep_mask].copy()

# Drop helpers and save
out = out.drop(columns=[c for c in ["_ref_dt","_entry_dt"] if c in out.columns])
out.to_csv(TARGET_CSV, index=False)

print(f"Unexited: survivors={len(survivor_deals)}, dropped_coincident={len(coincident)}, rows_now={len(out)} (from {before_rows}).")

# --- Validation for unexited & global constraints ---
check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
un_mask = check["holding_status"] == "unexited"
ex_mask = check["holding_status"] == "exited"

# Unexited: exactly 2 rows per deal_id
rows_per_un = check.loc[un_mask].groupby("deal_id")["id"].nunique()
assert (rows_per_un == 2).all(), "Unexited deals must have exactly 2 rows."

# Validate the two rows are entry-match and latest≤today
ck_un = check.loc[un_mask].copy()
ck_un["_ref_dt"] = pd.to_datetime(ck_un["reference_date"], errors="coerce")
ck_un["_entry_dt"] = pd.to_datetime(ck_un["entry_date"], errors="coerce")

def entry_winner_verify(frame):
    from pandas import DateOffset
    tgt = frame.groupby("deal_id")["_entry_dt"].transform("first")
    start = tgt - DateOffset(months=3)
    end   = tgt + DateOffset(months=3)
    in_window = frame["_ref_dt"].ge(start) & frame["_ref_dt"].le(end)
    tmp = frame.loc[in_window, ["deal_id","id","_ref_dt","_entry_dt"]].copy()
    tmp["_abs_diff_days"]  = (tmp["_ref_dt"] - tmp["_entry_dt"]).abs().dt.days
    tmp["_is_after_or_eq"] = (tmp["_ref_dt"] >= tmp["_entry_dt"]).astype(int)
    tmp = tmp.sort_values(["deal_id","_abs_diff_days","_is_after_or_eq","_ref_dt"],
                          ascending=[True, True, False, True])
    return tmp.groupby("deal_id")["id"].first()

entry_verify = entry_winner_verify(ck_un)
latest_verify = (ck_un[ck_un["_ref_dt"] <= pd.Timestamp.today().normalize()]
                 .sort_values(["deal_id","_ref_dt"], ascending=[True, False])
                 .groupby("deal_id")["id"].first())

for d, grp in ck_un.groupby("deal_id"):
    ids = set(grp["id"])
    assert d in entry_verify.index and d in latest_verify.index, f"Deal {d}: missing entry or latest id."
    assert entry_verify[d] in ids and latest_verify[d] in ids, f"Deal {d}: kept rows are not entry+latest."

# Exited: unchanged cardinality constraint (still ≤ 2)
rows_per_ex = check.loc[ex_mask].groupby("deal_id")["id"].nunique()

# Deal-level row-count report for unexited
two_rows_un  = int((rows_per_un == 2).sum())
one_row_un   = int((rows_per_un == 1).sum())
gt2_rows_un  = int((rows_per_un > 2).sum())
total_un     = int(rows_per_un.size)

print(f"Unexited row-counts per deal_id — 2 rows: {two_rows_un}, 1 row: {one_row_un}, >2 rows: {gt2_rows_un}, total: {total_un}")

# Keep your hard guarantees
assert one_row_un == 0, "Unexited deals with exactly 1 row found."
assert gt2_rows_un == 0, "Unexited deals with >2 rows found."
assert (rows_per_un == 2).all(), "Unexited deals must have exactly 2 rows."
assert (rows_per_ex <= 2).all(), "Exited deals show >2 rows after unexited processing."

# Deal-level row-count report for ALL deals (exited + unexited)
rows_per_all = check.groupby("deal_id")["id"].nunique()
two_rows_all = int((rows_per_all == 2).sum())
one_row_all  = int((rows_per_all == 1).sum())
gt2_rows_all = int((rows_per_all > 2).sum())
total_all    = int(rows_per_all.size)

# ---------- Step 3b: Enforce one currency per deal_id (drop mixed/unknown) ---------- #
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

if "reporting_currency_financials" not in df.columns:
    raise KeyError("Missing 'reporting_currency_financials'—ensure Step 3 added it.")

cur = df["reporting_currency_financials"].astype(str).str.strip()
null_tokens = {"", "nan", "na", "n/a", "none", "null"}
cur = cur.mask(cur.str.lower().isin(null_tokens))

df["_currency"] = cur

# Rule 1: require a defined currency for every row
rows_with_null_cur = df["_currency"].isna().sum()

# Rule 2: require exactly one distinct currency per deal_id
per_deal_nuniq = df.groupby("deal_id")["_currency"].nunique(dropna=False)
bad_deals = per_deal_nuniq[(per_deal_nuniq != 1)].index

keep_mask = df["deal_id"].isin(bad_deals) == False
keep_mask &= df["_currency"].notna()

before_deals = df["deal_id"].nunique()
before_rows  = len(df)

out = df.loc[keep_mask].drop(columns=["_currency"]).reset_index(drop=True)
out.to_csv(TARGET_CSV, index=False)

after_deals = out["deal_id"].nunique()
after_rows  = len(out)

print(f"Currency consistency: dropped {before_deals - after_deals} deal_id(s) with mixed/unknown currencies; "
      f"rows: {after_rows} (from {before_rows}).")

# Hard assertions
check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
assert check["reporting_currency_financials"].notna().all(), "Null currency remains."
nu = check.groupby("deal_id")["reporting_currency_financials"].nunique()
assert (nu == 1).all(), "Found deal(s) with multiple currencies."
print("CURRENCY check passed.")


print(f"All deals — 2 rows: {two_rows_all}, 1 row: {one_row_all}, >2 rows: {gt2_rows_all}, total: {total_all}")

# Hard guarantees across the whole dataset
assert one_row_all == 0, "Found deals with exactly 1 row."
assert gt2_rows_all == 0, "Found deals with >2 rows."
assert (rows_per_all == 2).all(), "All deals must have exactly 2 rows."

print("Unexited selection check passed.")

p = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
g = pd.read_csv(p, dtype={"deal_id": str})
by_status = (g.drop_duplicates(["deal_id", "holding_status"])
               .groupby("holding_status")["deal_id"].nunique())

print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)


Removing 1 exited deal(s) where entry and exit map to the same id.
Exited deals kept: 167 (dropped 99 with no entry/exit match in ±3 months). Rows now: 952 (from 1144).
Check passed. Exited deals with 2 rows: 167; with 1 row (entry=exit candidate): 0.
unique_deals_exited: 167 unique_deals_unexited: 239 unique_deals_total: 406
unique_deals: 406
Unexited: survivors=130, dropped_coincident=27, rows_now=594 (from 952).
Unexited row-counts per deal_id — 2 rows: 130, 1 row: 0, >2 rows: 0, total: 130
Currency consistency: dropped 1 deal_id(s) with mixed/unknown currencies; rows: 592 (from 594).
CURRENCY check passed.
All deals — 2 rows: 297, 1 row: 0, >2 rows: 0, total: 297
Unexited selection check passed.
unique_deals_exited: 167 unique_deals_unexited: 129 unique_deals_total: 296


In [3]:
# === Calculate metrics for analysis + then drop unreasonable metrics + interest rate selector ===

from pathlib import Path
import pandas as pd
import numpy as np

# ---- helper ----
def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    here = Path.cwd()
    for parent in [here, *here.parents][: max_up + 1]:
        candidate = (parent / rel_path)
        if candidate.exists():
            return candidate.resolve()
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str((p / rel_path)) for p in [here, *here.parents][: max_up + 1]]}"
    )

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# ============== 1) Add calculated value bridge fundamental columns (row-wise) ============
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
# Exclude faulty deals (as in your current logic)
df = df.loc[df["deal_id"] != "83283299-100b-44c6-a997-ec634d90768d"].copy()  # unrealistic revenue level
df = df.loc[df["deal_id"] != "2376a9b2-d983-48fc-b62a-52fb78bf1409"].copy()  # distorts TM scaling / broken deal
df = df.loc[df["deal_id"] != "98cb6742-e749-4344-85ab-5d5713d30e59"].copy()
df = df.loc[df["deal_id"] != "0906412f-6c13-4ae5-a973-2a860ca19a30"].copy() # 71x Times Money Carve Out, toggle on and off to see effect in "ValueCreationBySourcingType", probably broken deal


def num(s): return pd.to_numeric(s, errors="coerce")
ev = num(df["enterprise_value"])
eb = num(df["ebitda"])
nd = num(df["net_debt"])
eq = num(df["equity"])
rv = num(df["revenue"])

with np.errstate(divide="ignore", invalid="ignore"):
    ebitda_margin = (eb / rv).where(rv != 0)
with np.errstate(divide="ignore", invalid="ignore"):
    xebitda = (ev / eb).where(eb != 0)
with np.errstate(divide="ignore", invalid="ignore"):
    de_ratio = (nd / eq).where(eq != 0)

# --- dividends & capital_injections at DEAL level from EXIT row (latest ref_date) ---
df["_ref_dt"] = pd.to_datetime(df["reference_date"], errors="coerce")
# rank within deal: 1 = entry (earliest), 2 = exit (latest)
df["_rank"] = df.groupby("deal_id")["_ref_dt"].rank(method="first", ascending=True)

# equity at exit row (rank 2), per deal
exit_eq = (df.loc[df["_rank"] == 2, ["deal_id"]]
             .assign(exit_eq=eq[df["_rank"] == 2].values)
             .groupby("deal_id")["exit_eq"].first())

# broadcast to both rows of each deal
dividends = df["deal_id"].map((0.0000001 * exit_eq).to_dict()).fillna(0.0000001)
capital_injections = df["deal_id"].map((-0.0000001 * exit_eq).to_dict()).fillna(-0.0000001)
# skip dividends and capital injections
# --- END dividends/cap injections ---

# ============== 2) Interest rate selection  ============
rates_path = find_upwards(Path("InputData/monthly_interest_rates_curr.csv"))
rates = pd.read_csv(rates_path)

# Clean headers and parse dates to month-start
rates.columns = [c.strip().upper() for c in rates.columns]
assert "DATE" in rates.columns, f"'DATE' column not found in {rates_path.name}"
rates["DATE"] = pd.to_datetime(rates["DATE"], errors="coerce").dt.to_period("M").dt.to_timestamp()

# available currency columns in the curve file
curve_ccy_cols = [c for c in rates.columns if c != "DATE"]
curve_ccy_set = set(curve_ccy_cols)

# per-deal entry month & currency (take first non-null per deal)
entry_dt = pd.to_datetime(df["entry_date"], errors="coerce")
deal_meta = (
    df.assign(_entry_dt=entry_dt)
      .sort_values(["deal_id", "_entry_dt"])  # stable "first"
      .groupby("deal_id", as_index=False)
      .agg({
          "_entry_dt": "first",
          "reporting_currency_financials": "first",
      })
)
deal_meta["entry_month"] = deal_meta["_entry_dt"].dt.to_period("M").dt.to_timestamp()
deal_meta["ccy_raw"] = deal_meta["reporting_currency_financials"].astype(str).str.strip()
deal_meta["ccy_use"] = deal_meta["ccy_raw"].str.upper().where(
    lambda s: s.str.upper().isin(curve_ccy_set),
    other="USD"  # default if currency column not in curve
)

# lookup base rate per deal: closest row within ±1 month; if none, mark missing-date
spread = 0.03
miss_ccy = (deal_meta["ccy_use"] != deal_meta["ccy_raw"].str.upper()).sum()
miss_date = 0

rates_sorted = rates.sort_values("DATE").reset_index(drop=True)

base_rates = []
for _, r in deal_meta.iterrows():
    emon = r["entry_month"]
    ccy  = r["ccy_use"]

    if pd.isna(emon):
        # no entry date -> cannot match a month
        base_rates.append(np.nan)
        miss_date += 1
        continue

    start = emon - pd.DateOffset(months=1)
    end   = emon + pd.DateOffset(months=1)
    window = rates_sorted[(rates_sorted["DATE"] >= start) & (rates_sorted["DATE"] <= end)]

    if window.empty or (ccy not in window.columns):
        base_rates.append(np.nan)
        miss_date += 1
        continue

    # pick the row with the smallest |date - entry_month|
    diffs = (window["DATE"] - emon).abs().dt.days
    j = diffs.idxmin()
    base = pd.to_numeric(window.loc[j, ccy], errors="coerce")
    base_rates.append(base)

deal_meta["base_rate"] = pd.to_numeric(pd.Series(base_rates, index=deal_meta.index), errors="coerce")
deal_meta["interest_rate_deal"] = deal_meta["base_rate"] + spread  # annual coupon level

# broadcast the per-deal rate to both rows
interest_rate = df["deal_id"].map(deal_meta.set_index("deal_id")["interest_rate_deal"].to_dict())

print(f"Interest rate assignment: {len(deal_meta) - int(miss_ccy)} deal(s) matched currency column; "
      f"defaulted to USD for {int(miss_ccy)} deal(s).")
print(f"Interest rate assignment: {len(deal_meta) - int(miss_date)} deal(s) matched entry month within ±1; "
      f"{int(miss_date)} deal(s) missing date match.")

# ============== 3) Holding period and compound total cost of debt ============
today = pd.Timestamp.today().normalize()
ent = pd.to_datetime(df.get("entry_date"), errors="coerce")
exi = pd.to_datetime(df.get("exit_date"),  errors="coerce")
is_exited = (df["holding_status"] == "exited")

days = pd.Series(np.nan, index=df.index, dtype=float)
days.loc[is_exited] = (exi - ent).dt.days.loc[is_exited]
days.loc[~is_exited] = (today - ent).dt.days.loc[~is_exited]
days = days.where(days >= 0)
holding_period = days / 365.25

# total holding-period cost: (1+r)^HP - 1
cost_of_debt = (1 + interest_rate) ** holding_period - 1

# write columns
df["ebitda_margin"]      = ebitda_margin
df["xebitda"]            = xebitda
df["de_ratio"]           = de_ratio
df["dividends"]          = dividends
df["capital_injections"] = capital_injections
df["interest_rate"]      = interest_rate
df["holding_period"]     = holding_period
df["cost_of_debt"]       = cost_of_debt

# clean helper cols
df = df.drop(columns=["_ref_dt", "_rank"])

df.to_csv(TARGET_CSV, index=False)

g = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
new_cols = ["ebitda_margin","xebitda","de_ratio","dividends","capital_injections","interest_rate","holding_period","cost_of_debt"]
nn = {c: int(g[c].notna().sum()) for c in new_cols}
print("Added columns:", ", ".join(new_cols))
print("Non-null counts:", nn)

# ensure availability of by_status for the print (matches prior cells’ semantics)
by_status = (g.drop_duplicates(["deal_id","holding_status"])
               .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)

print("Interest Rate NAs: ", pd.read_csv((find_upwards(Path("ValueCreation")) / "Data" / "working.csv"))["interest_rate"].isna().sum())

# ===================== 4) Drop deals with unreasonable financial metrics =====================
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)
before_deals = df["deal_id"].nunique()

em  = pd.to_numeric(df["ebitda_margin"], errors="coerce")
xe  = pd.to_numeric(df["xebitda"],        errors="coerce")
der = pd.to_numeric(df["de_ratio"],       errors="coerce")

# =================  Specify what deals will get dropped. ==================
# Example: xe < 0.1 | xe > 250 means that deals with xEBITDA <0.1 or >250 will be dropped.

v_em  = (em > 1)
abs_xe = xe.abs()
v_xe  = xe.isna() | (abs_xe < 0.1) | (abs_xe > 250)
v_der = der.isna() | (der < -1) | (der > 20)

bad_em_deals  = set(df.loc[v_em,  "deal_id"].dropna().unique())
bad_xe_deals  = set(df.loc[v_xe,  "deal_id"].dropna().unique())
bad_der_deals = set(df.loc[v_der, "deal_id"].dropna().unique())
bad_deals = bad_em_deals | bad_xe_deals | bad_der_deals

keep_mask = ~df["deal_id"].isin(bad_deals)
out = df.loc[keep_mask].reset_index(drop=True)
out.to_csv(TARGET_CSV, index=False)

after_rows  = len(out)
after_deals = out["deal_id"].nunique()
print(
    f"Dropped {before_deals - after_deals} deal_id(s). "
    f"[ebitda_margin>1: {len(bad_em_deals)}, "
    f"xebitda NaN/|xebitda|<0.1/|xebitda|>250: {len(bad_xe_deals)}, "
    f"de_ratio NaN/<-1/>20: {len(bad_der_deals)}]  Rows: {after_rows} (from {before_rows})."
)

# ---- checks ----
check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
em2  = pd.to_numeric(check["ebitda_margin"], errors="coerce")
xe2  = pd.to_numeric(check["xebitda"],        errors="coerce")
der2 = pd.to_numeric(check["de_ratio"],       errors="coerce")

assert not (em2 > 1).any(), "Remaining rows with ebitda_margin > 1."

abs_xe2 = xe2.abs()
assert not (xe2.isna() | (abs_xe2 < 0.1) | (abs_xe2 > 250)).any(), "Remaining rows with invalid xebitda."

assert not (der2.isna() | (der2 < -1) | (der2 > 20)).any(), "Remaining rows with invalid de_ratio."
rows_per_deal = check.groupby("deal_id")["id"].nunique()
assert (rows_per_deal == 2).all(), "Each remaining deal must have exactly 2 rows."

g = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
by_status = (g.drop_duplicates(["deal_id","holding_status"])
               .groupby("holding_status")["deal_id"].nunique())
print(
    "unique_deals_exited:",   int(by_status.get("exited", 0)),
    "unique_deals_unexited:", int(by_status.get("unexited", 0)),
    "unique_deals_total:",    g["deal_id"].nunique()
)

#TODO: Count deals where both entry and exit ownership percentage are available
#TODO: Count deals where only one of both is available


Interest rate assignment: 277 deal(s) matched currency column; defaulted to USD for 15 deal(s).
Interest rate assignment: 292 deal(s) matched entry month within ±1; 0 deal(s) missing date match.
Added columns: ebitda_margin, xebitda, de_ratio, dividends, capital_injections, interest_rate, holding_period, cost_of_debt
Non-null counts: {'ebitda_margin': 584, 'xebitda': 584, 'de_ratio': 584, 'dividends': 584, 'capital_injections': 584, 'interest_rate': 584, 'holding_period': 584, 'cost_of_debt': 584}
unique_deals_exited: 164 unique_deals_unexited: 128 unique_deals_total: 292
Interest Rate NAs:  0
Dropped 5 deal_id(s). [ebitda_margin>1: 1, xebitda NaN/|xebitda|<0.1/|xebitda|>250: 4, de_ratio NaN/<-1/>20: 1]  Rows: 574 (from 584).
unique_deals_exited: 162 unique_deals_unexited: 125 unique_deals_total: 287
