In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display

# ---------- Helpers ---------- #
REL_PATH = Path("InputData/CoreData.xlsx")

def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    """Locate ``rel_path`` under the working directory or one of its ancestors.

    The current working directory is probed first, then each parent directory
    in turn, climbing at most ``max_up`` levels above the starting point.

    Args:
        rel_path: Relative path to look for beneath each probed directory.
        max_up: Maximum number of parent levels to climb.

    Returns:
        The resolved path of the first probe that exists.

    Raises:
        FileNotFoundError: If none of the probed locations exists. The message
            lists every location that was checked.
    """
    start = Path.cwd()
    search_roots = [start, *start.parents][: max_up + 1]
    probes = [root / rel_path for root in search_roots]

    match = next((probe for probe in probes if probe.exists()), None)
    if match is not None:
        return match.resolve()

    tried = [str(probe) for probe in probes]
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {start} by walking up {max_up} levels.\n"
        f"- Current working directory: {start}\n"
        f"- Checked: {tried}"
    )

# ---------- Step 1: Read Excel / basic checks ---------- #
# Locate the workbook by walking up from the current working directory.
INPUT_XLSX = find_upwards(REL_PATH)
SHEET = "deal_time_series"

# Open the workbook once. The context manager releases the file handle when the
# block ends (including when a check fails), and the same handle serves both
# the sheet listing and the sheet parse. `xfile` is closed after this block.
with pd.ExcelFile(INPUT_XLSX) as xfile:
    sheets = xfile.sheet_names
    print("Resolved path:", INPUT_XLSX)
    print("Sheets:", sheets)

    # Check 1: the sheet list is well-formed.
    assert isinstance(sheets, list), "Expected a list"
    assert sheets and all(isinstance(s, str) and s.strip() for s in sheets), "Sheet names must be non-empty strings"
    assert len(sheets) == len(set(sheets)), "Duplicate sheet names detected"
    print("Check 1 passed.")

    assert SHEET in sheets, f"'{SHEET}' not found. Available sheets: {sheets}"
    dts = xfile.parse(SHEET, header=0)

# Check 2: the sheet loaded as a non-empty table with usable column names.
print("Column names:", list(dts.columns))
assert isinstance(dts, pd.DataFrame), "Expected a pandas DataFrame."
assert not dts.empty, "Sheet loaded but contains no data."
assert all(isinstance(c, str) and c.strip() for c in dts.columns), "Invalid/empty column names."
# Render the preview that the message below refers to.
display(dts.head(5))
print(f"Check 2 passed. Shape: {dts.shape}. Showing 5 data rows above.")

# ---------- Step 2: Init working.csv with id ---------- #
# working.csv lives in <ValueCreation>/Data; the directory is created on demand.
TARGET_DIR = (find_upwards(Path("ValueCreation")) / "Data")
TARGET_DIR.mkdir(parents=True, exist_ok=True)
TARGET_CSV = TARGET_DIR / "working.csv"

usecols = ["id"]
# Read ids as text, matching how the later steps read them back. With inferred
# dtypes an id such as "00123" would come back from the CSV as the integer 123
# and the round-trip comparison below would fail spuriously.
raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, usecols=usecols, dtype={"id": str})
df = raw[["id"]]

# Validate the key column before anything is written, so a bad sheet never
# leaves a broken working.csv behind.
assert df["id"].notna().all(), "Null id found."
assert not df["id"].duplicated().any(), "Duplicate id values found."

df.to_csv(TARGET_CSV, index=False)
print(f"Wrote {len(df):,} rows to {TARGET_CSV}")

# Round-trip check: the file on disk must reproduce the ids exactly.
assert TARGET_CSV.exists(), f"Missing output: {TARGET_CSV}"
check_df = pd.read_csv(TARGET_CSV, dtype={"id": str})
assert list(check_df.columns) == ["id"], list(check_df.columns)
assert len(check_df) == len(raw), f"Row count changed: raw={len(raw)} vs written={len(check_df)}"
assert check_df["id"].tolist() == raw["id"].tolist(), "Row order changed."
assert check_df["id"].notna().all(), "Null id found."
assert not check_df["id"].duplicated().any(), "Duplicate id values found."
print("INIT check passed. Shape:", check_df.shape)

# ---------- Step 3: Add columns from deal_time_series ---------- #
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str})

requested = [
    "deal_id", "reference_date", "enterprise_value", "net_debt", "equity",
    "reporting_currency_financials", "reference_period_type_prefix",
    "reference_period_type_suffix", "revenue", "ebitda",
    "ownership_economic_percentage",
]
# "data_room_name",
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name=SHEET,
    usecols=["id", *requested],
    # Both keys are read as text. A numeric deal_id column containing blanks
    # would otherwise be inferred as float and written as e.g. "12.0", which
    # no longer matches the text key "12" used for the merge on deal_id.
    dtype={"id": str, "deal_id": str},
)
assert src["id"].is_unique, "deal_time_series: duplicate id values would explode rows on merge."

# Normalize reference_date robustly
if "reference_date" in src.columns:
    s = src["reference_date"]
    # Genuinely numeric cells must not reach pd.to_datetime without a unit:
    # it would read them as nanoseconds since the epoch and return 1970-01-01
    # instead of NaT, so Excel serial numbers would never be detected.
    is_number = pd.Series(
        [
            isinstance(v, (int, float, np.integer, np.floating)) and not isinstance(v, bool)
            for v in s
        ],
        index=s.index,
        dtype=bool,
    )
    dt = pd.to_datetime(s.where(~is_number), errors="coerce")
    ser = pd.to_numeric(s, errors="coerce")
    # A cell is treated as an Excel serial day count when it did not parse as
    # a date and its numeric value exceeds 20000. The decision is per cell, so
    # dates that parsed correctly are kept even in a mixed column.
    needs_serial = dt.isna() & ser.gt(20000)
    if needs_serial.any():
        from_serial = pd.to_datetime(
            ser.where(needs_serial), unit="D", origin="1899-12-30", errors="coerce"
        )
        dt = dt.fillna(from_serial)
    src["reference_date"] = dt.dt.strftime("%Y-%m-%d")

# Only bring in columns working.csv does not have yet, so a re-run does not
# create suffixed duplicates.
to_add = [c for c in requested if c not in working.columns]
src = src[["id", *to_add]]

# `_ord` pins the original row order across the merge.
working["_ord"] = np.arange(len(working))
out = working.merge(src, on="id", how="left")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns: {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# Verify the file on disk: same rows, same order, all requested columns.
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
missing = [c for c in requested if c not in after.columns]
assert not missing, f"Missing columns: {missing}"
# Every non-null reference_date must be a valid YYYY-MM-DD string.
ref_values = after["reference_date"].dropna().astype(str)
ref_parsed = pd.to_datetime(ref_values, format="%Y-%m-%d", errors="coerce")
assert ref_parsed.notna().all(), "reference_date contains values that are not valid YYYY-MM-DD dates."
print("ADD_COLUMNS (deal_time_series extra fields) check passed. Shape:", after.shape)
# `after` already holds deal_id as text, so the file is not read again.
print("unique_deals:", after["deal_id"].nunique())

# ---------- Step 4: Add columns from deal ---------- #
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

requested = [
    "name", "entry_date", "sourcing_type", "entry_transaction_type",
    "investment_role", "exit_date", "exit_transaction_type", "fund_id",
]
#
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="deal",
    usecols=["id", *requested],
    # fund_id is read as text as well: a numeric fund_id column containing
    # blanks would otherwise be inferred as float and written as e.g. "12.0",
    # which no longer matches the text key "12" when merging on fund_id.
    dtype={"id": str, "fund_id": str},
).rename(columns={"id": "deal_id"})
assert src["deal_id"].is_unique, "deal: duplicate deal_id values would explode rows on merge."

# Only bring in columns working.csv does not have yet.
to_add = [c for c in requested if c not in working.columns]
# IMPORTANT: select with 'deal_id' (already renamed), not 'id'
src = src[["deal_id", *to_add]]

# `_ord` pins the original row order across the merge.
working["_ord"] = np.arange(len(working))
out = working.merge(src, on="deal_id", how="left")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'deal': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# Verify the file on disk: same rows, same order, all added columns present.
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
missing = [c for c in to_add if c not in after.columns]
assert not missing, f"Missing columns after merge: {missing}"
print("ADD_COLUMNS (deal) check passed. Shape:", after.shape)
# `after` already holds deal_id as text, so the file is not read again.
print("unique_deals:", after["deal_id"].nunique())

# ---------- Step 5: Add columns from fund ---------- #
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})

requested = ["name", "investment_theme", "vintage_year", "size", "fund_generation", "fund_family_generation"]
#
# The fund sheet's `name` is renamed to `fund_name` so it does not collide with
# a `name` column already present in working.csv.
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="fund",
    usecols=["id", *requested],
    dtype={"id": str},
).rename(columns={"id": "fund_id", "name": "fund_name"})
assert src["fund_id"].is_unique, "fund: duplicate fund_id values would explode rows on merge."

# Requested columns under their post-rename names; this list is also what the
# verification below expects to find in the output.
expected = [c if c != "name" else "fund_name" for c in requested]
to_add = [c for c in expected if c not in working.columns]
src = src[["fund_id", *to_add]]

# `_ord` pins the original row order across the merge.
working["_ord"] = np.arange(len(working))
out = working.merge(src, on="fund_id", how="left")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'fund': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# Verify the file on disk: same rows, same order, all fund columns present.
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
missing = [c for c in expected if c not in after.columns]
# Without this assertion the "check passed" message would print even when
# columns are absent.
assert not missing, f"Missing columns after merge: {missing}"
print("ADD_COLUMNS (fund) check passed. Shape:", after.shape)
# `after` already holds deal_id as text, so the file is not read again.
print("unique_deals:", after["deal_id"].nunique())


Resolved path: /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/InputData/CoreData.xlsx
Sheets: ['general_partner', 'fund', 'fund_cash_flow', 'capital_account', 'deal', 'deal_time_series', 'deal_cash_flow', 'deal_partner', 'deal_acquirer', 'deal_vendor', 'organization', 'person']
Check 1 passed.
Column names: ['id', 'deal_revision_id', 'enterprise_value', 'equity', 'net_debt', 'revenue', 'ebit', 'ebitda', 'capex', 'ebitda_multiple', 'unrealized_value', 'realized_value', 'total_value', 'total_investment_cost', 'irr_gross', 'bridge_financing', 'ownership_economic_percentage', 'reporting_currency_valuation', 'reporting_currency_financials', 'reported_date', 'reference_period_type_prefix', 'reference_date', 'reference_period_type_suffix', 'quarterly_company_update', '_year', '_quarter', 'predicted_sentiment', 'enterprise_value_valuation_rationale', 'enterprise_value_valuation_multiple', 'enterprise_va