In [7]:
from pathlib import Path
import pandas as pd

#----- Start the data transformation by adding "holding_status" -----#

def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    """Locate ``rel_path`` in the working directory or one of its ancestors.

    The current working directory is probed first, then each parent in turn,
    for at most ``max_up`` levels above it.

    Args:
        rel_path: Path to look for; it is joined onto every probed directory.
        max_up: Number of ancestor levels to try beyond the working directory.

    Returns:
        The resolved path of the first candidate that exists.

    Raises:
        FileNotFoundError: None of the probed directories contains
            ``rel_path``; the message lists every location that was checked.
    """
    start = Path.cwd()
    search_dirs = [start, *start.parents][: max_up + 1]
    attempts = [base / rel_path for base in search_dirs]

    found = next((path for path in attempts if path.exists()), None)
    if found is not None:
        return found.resolve()

    # Nothing matched: report where we looked so a wrong cwd is easy to spot.
    checked = [str(path) for path in attempts]
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {start} by walking up {max_up} levels.\n"
        f"- Current working directory: {start}\n"
        f"- Checked: {checked}"
    )

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load the working file; the id columns are read as text.
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Row count before the transformation (this step only adds a column).
before_rows = len(df)

# holding_status is derived purely from exit_date, so the column must exist.
if "exit_date" not in df.columns:
    raise KeyError("Column 'exit_date' not found in working.csv. Run the earlier ADD_COLUMNS step from 'deal' first.")

# Normalize exit_date blanks to NA.
# Matching is done on a trimmed, lower-cased text view so that "", "NaT",
# "nat", "None" (any casing) and whitespace-only cells all count as blank.
# The conversion is deliberately dtype-independent: a `dtype == object` guard
# is skipped whenever pandas gives the column a dedicated string dtype, which
# would leave placeholder rows such as "NaT" marked as exited.
BLANK_EXIT_TOKENS = ["", "nat", "none"]
exit_text = df["exit_date"].astype("string").str.strip().str.lower()
norm = exit_text.mask(exit_text.isin(BLANK_EXIT_TOKENS))

# Determine holding_status: any non-blank exit_date => exited; else unexited
is_exited = norm.notna()
df["holding_status"] = is_exited.map({True: "exited", False: "unexited"})

# Persist (overwrites the working file in place)
df.to_csv(TARGET_CSV, index=False)
print("Added column: holding_status")

import pandas as pd

# Re-read the persisted file so the checks run against what was written.
check = pd.read_csv(TARGET_CSV, dtype={"deal_id": str})
status_column = check["holding_status"]

# Row-level coverage: every row must carry one of the two labels.
total_after = len(check)
exited_count = (status_column == "exited").sum()
unexited_count = (status_column == "unexited").sum()
labelled_rows = exited_count + unexited_count
assert labelled_rows == total_after, "Status coverage failed: counts don't add up to total rows."

# Deal-level consistency: count distinct statuses per deal_id (NA counted as
# its own value) and require that no deal has more than one.
status_per_deal = check.groupby("deal_id")["holding_status"].nunique(dropna=False)
has_several = status_per_deal > 1
mixed = status_per_deal[has_several]
assert mixed.empty, f"{len(mixed)} deal_id(s) have mixed exited/unexited rows."

print(f"Check passed. exited={exited_count}, unexited={unexited_count}, total_rows={total_after}")


Added column: holding_status
Check passed. exited=2511, unexited=2445, total_rows=4956


In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import date

#----- Filter for unreasonable dates -----#
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load; the id columns are read as text.
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
before_rows = len(df)

# Fail early with a clear message if a column this filter needs is absent.
missing_cols = [col for col in ("reference_date", "revenue") if col not in df.columns]
if missing_cols:
    raise KeyError(f"Column(s) {missing_cols} not found in working.csv.")

# --- Date window: [1980-01-01, last day of the current calendar quarter].
# Parsing is tolerant: values that cannot be parsed become NaT and are then
# treated exactly like missing dates (kept).
ref = pd.to_datetime(df["reference_date"], errors="coerce")
lower = pd.Timestamp(1980, 1, 1).normalize()
q_end = pd.Timestamp.today().to_period("Q-DEC").end_time.normalize()
date_ok = ref.isna() | ((ref >= lower) & (ref <= q_end))

# --- Revenue non-zero (treat non-numeric/NA as "keep")
rev_num = pd.to_numeric(df["revenue"], errors="coerce")
rev_ok = rev_num.fillna(np.inf) != 0

# A row is kept only when BOTH rules hold: reference_date is NA or within
# [lower, q_end], AND revenue is not exactly zero.
keep_mask = date_ok & rev_ok

# Boolean-mask selection returns the surviving rows in their existing order,
# so no helper ordering column or re-sort is needed and df is left unmodified.
after = df.loc[keep_mask].reset_index(drop=True)

dropped = before_rows - len(after)
print(f"Dropped {dropped} rows outside [{lower.date()} .. {q_end.date()}] or with revenue == 0.")

# Save (overwrites the working file in place)
after.to_csv(TARGET_CSV, index=False)

import pandas as pd

# Verify against the file on disk rather than the in-memory frame.
check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
ref2 = pd.to_datetime(check["reference_date"], errors="coerce")

# 1) Every parseable date lies inside [lower, q_end]
#    (both bounds come from the filtering step earlier in this cell).
parsed_dates = ref2.dropna()
inside_window = (parsed_dates >= lower) & (parsed_dates <= q_end)
assert inside_window.all(), "Found dates outside bounds."

# 2) Rows with revenue exactly zero must be gone
rev2 = pd.to_numeric(check["revenue"], errors="coerce")
zero_revenue = rev2 == 0
assert not zero_revenue.any(), "Found rows with revenue == 0."

print(f"FILTER check passed. Remaining rows: {len(check)}")


Dropped 31 rows outside [1980-01-01 .. 2025-12-31] or with revenue == 0.
FILTER check passed. Remaining rows: 4925


In [9]:
from pathlib import Path
import pandas as pd
import numpy as np

# Resolve the working file relative to the located "ValueCreation" directory.
working_dir = find_upwards(Path("ValueCreation")) / "Data"
TARGET_CSV = working_dir / "working.csv"

# Read it with the id columns as text, and remember the starting row count.
id_dtypes = {"id": str, "deal_id": str}
df = pd.read_csv(TARGET_CSV, dtype=id_dtypes)
before_rows = df.shape[0]

# Temporary numeric views for presence checks (does not change df)
def num(s):
    """Return ``s`` converted to numeric, with unparseable entries coerced to NaN."""
    converted = pd.to_numeric(s, errors="coerce")
    return converted

# Numeric views of the financial columns; anything non-numeric becomes NaN.
rev_num   = num(df["revenue"])
ebitda_num= num(df["ebitda"])
ev_num    = num(df["enterprise_value"])
nd_num    = num(df["net_debt"])
eq_num    = num(df["equity"])

# Presence logic: revenue and ebitda are both required, plus at least two of
# [enterprise_value, net_debt, equity].
rev_ok    = rev_num.notna()
ebitda_ok = ebitda_num.notna()
trio_non_null = ev_num.notna().astype(int) + nd_num.notna().astype(int) + eq_num.notna().astype(int)
trio_ok   = trio_non_null >= 2

keep_mask = rev_ok & ebitda_ok & trio_ok

# Boolean-mask selection already returns rows in their existing order, so the
# original order is preserved without a helper column; df itself is left
# untouched (no scratch column is added to it).
after = df.loc[keep_mask].reset_index(drop=True)

dropped = before_rows - len(after)
print(f"Filtering out {dropped} rows (kept {len(after)} of {before_rows}).")

# Save (overwrites the working file in place)
after.to_csv(TARGET_CSV, index=False)

import pandas as pd
import numpy as np

# Verify against the file on disk rather than the in-memory frame.
check = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Coerce each financial column of the saved data to numbers (non-numeric -> NaN)
rev_num, ebitda_num, ev_num, nd_num, eq_num = (
    pd.to_numeric(check[column], errors="coerce")
    for column in ("revenue", "ebitda", "enterprise_value", "net_debt", "equity")
)

# Revenue and ebitda must be present on every remaining row.
assert rev_num.notna().all(), "Found rows with empty revenue after filtering."
assert ebitda_num.notna().all(), "Found rows with empty ebitda after filtering."

# At least two of the three valuation columns must be present per row.
trio_present = sum(series.notna().astype(int) for series in (ev_num, nd_num, eq_num))
assert (trio_present >= 2).all(), \
       "Found rows with fewer than two of [enterprise_value, net_debt, equity] present."

print("FILTER check passed. Shape:", check.shape)


Filtering out 2760 rows (kept 2165 of 4925).
FILTER check passed. Shape: (2165, 28)
