In [16]:
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display

# ---------- Helpers ---------- #
# Workbook location, relative to whichever ancestor directory contains it.
REL_PATH = Path("InputData/CoreData.xlsx")

def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    """
    Locate `rel_path` by probing the current working directory and its ancestors.

    Args:
        rel_path: Path (file or directory) appended to each directory tried.
        max_up: Maximum number of parent levels to climb above the cwd.
            Negative values are treated as 0 (only the cwd is probed); without
            the clamp a negative value would slice the ancestor list from the
            end and silently search almost every ancestor.

    Returns:
        The resolved path of the first existing candidate, nearest directory first.

    Raises:
        FileNotFoundError: If no probed directory contains `rel_path`. The
            message lists every candidate that was checked.
    """
    here = Path.cwd()
    levels = max(max_up, 0)
    # Build the candidate list once so the search and the error message agree.
    candidates = [base / rel_path for base in [here, *here.parents][: levels + 1]]
    for candidate in candidates:
        if candidate.exists():
            return candidate.resolve()
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str(c) for c in candidates]}"
    )

# ---------- Step 1: Read Excel / basic checks ---------- #
# Resolve the workbook, verify its sheet list, and load a preview of the key sheet.
INPUT_XLSX = find_upwards(REL_PATH)
SHEET = "deal_time_series"

# Open the workbook once; the context manager closes the file handle even if a
# check below fails, and the key sheet is parsed from the already-open handle
# instead of re-opening the file.
with pd.ExcelFile(INPUT_XLSX) as xfile:
    sheets = xfile.sheet_names
    print("Resolved path:", INPUT_XLSX)
    print("Sheets:", sheets)

    assert isinstance(sheets, list), "Expected a list"
    assert sheets and all(isinstance(s, str) and s.strip() for s in sheets), "Sheet names must be non-empty strings"
    assert len(sheets) == len(set(sheets)), "Duplicate sheet names detected"
    print("Check 1 passed.")

    # Quick read of key sheet + preview
    assert SHEET in sheets, f"'{SHEET}' not found. Available sheets: {sheets}"
    dts = xfile.parse(SHEET, header=0)

print("Column names:", list(dts.columns))
display(dts.head(5))
assert isinstance(dts, pd.DataFrame), "Expected a pandas DataFrame."
assert not dts.empty, "Sheet loaded but contains no data."
assert all(isinstance(c, str) and c.strip() for c in dts.columns), "Invalid/empty column names."
print(f"Check 2 passed. Shape: {dts.shape}. Showing 5 data rows above.")

# ---------- Step 2: Init working.csv with id ---------- #
# Create ValueCreation/Data/working.csv containing only the `id` column of SHEET.
TARGET_DIR = (find_upwards(Path("ValueCreation")) / "Data")
TARGET_DIR.mkdir(parents=True, exist_ok=True)
TARGET_CSV = TARGET_DIR / "working.csv"

assert SHEET in sheets, f"Sheet '{SHEET}' not found."
usecols = ["id"]
# Read ids as text, matching how the later steps read them, so the round-trip
# comparison below cannot fail merely because of type inference.
raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, usecols=usecols, dtype={"id": str})
df = raw[["id"]]
df.to_csv(TARGET_CSV, index=False)
print(f"Wrote {len(df):,} rows to {TARGET_CSV}")

# Post-write checks
assert TARGET_CSV.exists(), f"Missing output: {TARGET_CSV}"
check_df = pd.read_csv(TARGET_CSV, dtype={"id": str})
assert list(check_df.columns) == ["id"], list(check_df.columns)
assert len(check_df) == len(raw), f"Row count changed: raw={len(raw)} vs written={len(check_df)}"
# Null check comes before the order check: NaN never equals NaN, so a null id
# would otherwise be reported as a (misleading) ordering failure.
assert check_df["id"].notna().all(), "Null id found."
assert not check_df["id"].duplicated().any(), "Duplicate id values found."
assert check_df["id"].tolist() == raw["id"].tolist(), "Row order changed."
print("INIT check passed. Shape:", check_df.shape)

# ---------- Step 3: Add columns from deal_time_series ---------- #
# Left-join the requested deal_time_series fields onto working.csv by `id`.
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str})

requested = [
    "deal_id", "reference_date", "enterprise_value", "net_debt", "equity",
    "reporting_currency_financials", "reference_period_type_prefix",
    "reference_period_type_suffix", "revenue", "ebitda",
    "ownership_economic_percentage", "data_room_name",
]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name=SHEET,
    usecols=["id", *requested],
    # deal_id is the join key of the next step, so keep it as text too.
    dtype={"id": str, "deal_id": str},
)

# Normalize reference_date to yyyy-mm-dd (handles Excel serials and strings)
if "reference_date" in src.columns:
    s = src["reference_date"]
    if pd.api.types.is_numeric_dtype(s):
        # Numeric values are interpreted as day counts from 1899-12-30.
        dt = pd.to_datetime(s, unit="D", origin="1899-12-30", errors="coerce")
    else:
        dt = pd.to_datetime(s, errors="coerce")
    src["reference_date"] = dt.dt.strftime("%Y-%m-%d")

# Only bring in columns that working.csv does not have yet (safe to re-run).
to_add = [c for c in requested if c not in working.columns]
src = src[["id", *to_add]]

working["_ord"] = np.arange(len(working))
# validate= raises pandas.errors.MergeError if `id` is duplicated in src, i.e.
# before a row-multiplying join can overwrite working.csv.
out = working.merge(src, on="id", how="left", validate="many_to_one")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns: {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
missing = [c for c in requested if c not in after.columns]
assert not missing, f"Missing columns: {missing}"
# Every non-null reference_date must be a real yyyy-mm-dd date.
ref_dates = after["reference_date"].dropna().astype(str)
assert pd.to_datetime(ref_dates, format="%Y-%m-%d", errors="coerce").notna().all(), \
    "reference_date contains values that are not yyyy-mm-dd dates."
print("ADD_COLUMNS (deal_time_series extra fields) check passed. Shape:", after.shape)
print("unique_deals:", after["deal_id"].nunique())

# ---------- Step 4: Add columns from deal ---------- #
# Left-join the requested `deal` sheet fields onto working.csv by `deal_id`.
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

requested = [
    "fund_id", "name", "entry_date", "entry_transaction_type",
    "sourcing_type", "investment_role", "exit_date", "exit_transaction_type",
]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="deal",
    usecols=["id", *requested],
    # fund_id is the join key of the next step, so keep it as text too.
    dtype={"id": str, "fund_id": str},
)
# Only bring in columns that working.csv does not have yet (safe to re-run).
to_add = [c for c in requested if c not in working.columns]
# The deal sheet's `id` is the key that working.csv calls `deal_id`.
src = src[["id", *to_add]].rename(columns={"id": "deal_id"})

working["_ord"] = np.arange(len(working))
# validate= raises pandas.errors.MergeError if a deal id is duplicated in src,
# i.e. before a row-multiplying join can overwrite working.csv.
out = working.merge(src, on="deal_id", how="left", validate="many_to_one")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'deal': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
# Check against `requested` (not `to_add`) so the check is still meaningful
# when the step is re-run and nothing new had to be added.
missing = [c for c in requested if c not in after.columns]
assert not missing, f"Missing columns after merge: {missing}"
print("ADD_COLUMNS (deal) check passed. Shape:", after.shape)
print("unique_deals:", after["deal_id"].nunique())

# ---------- Step 5: Add columns from fund ---------- #
# Left-join the requested `fund` sheet fields onto working.csv by `fund_id`.
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})

requested = ["name", "vintage_year", "investment_theme", "size", "fund_generation", "fund_family_generation"]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="fund",
    usecols=["id", *requested],
    dtype={"id": str},
)
# The fund sheet's `id` is the key that working.csv calls `fund_id`; its `name`
# is renamed so it does not collide with the `name` column added from `deal`.
src = src.rename(columns={"id": "fund_id", "name": "fund_name"})
# Column names as they appear in working.csv after the rename above.
expected = ["fund_name" if c == "name" else c for c in requested]
# Only bring in columns that working.csv does not have yet (safe to re-run).
to_add = [c for c in expected if c not in working.columns]
src = src[["fund_id", *to_add]]

working["_ord"] = np.arange(len(working))
# validate= raises pandas.errors.MergeError if a fund id is duplicated in src,
# i.e. before a row-multiplying join can overwrite working.csv.
out = working.merge(src, on="fund_id", how="left", validate="many_to_one")
out = out.sort_values("_ord").drop(columns="_ord")
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'fund': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})
assert len(after) == len(working), "Row count changed."
assert after["id"].tolist() == working.sort_values("_ord")["id"].tolist(), "Order changed."
missing = [c for c in expected if c not in after.columns]
assert not missing, f"Missing columns after merge: {missing}"
print("ADD_COLUMNS (fund) check passed. Shape:", after.shape)
print("unique_deals:", after["deal_id"].nunique())


Resolved path: /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/InputData/CoreData.xlsx
Sheets: ['Metadata', 'dashboard', 'general_partner', 'fund', 'fund_cash_flow', 'capital_account', 'deal', 'deal_time_series', 'deal_cash_flow', 'deal_partner', 'deal_acquirer', 'deal_vendor', 'organization', 'person']
Check 1 passed.
Column names: ['id', 'total_value', 'ebitda', 'reference_period_type_suffix', 'moic_gross', 'data_room_id', 'created_by_user_id', 'recurring_revenue', 'bridge_financing', 'reporting_currency_financials', 'irr_net', 'reference_period_type_prefix', 'moic_net', 'data_room_name', 'realized_value', 'irr_gross', 'ebitda_adjusted', 'net_debt', 'ebitda_multiple', 'enterprise_value_valuation_rationale', 'is_main', 'equity', 'reporting_currency_valuation', 'management_equity_percentage', 'revenue_multiple', 'recurring_revenue_percentage', 'quarterly_company_update', '_created_at_utc', 'enter

Unnamed: 0,id,total_value,ebitda,reference_period_type_suffix,moic_gross,data_room_id,created_by_user_id,recurring_revenue,bridge_financing,reporting_currency_financials,...,ebitda_adjusted_note,ebitda_margin,cumulative_addons,moic_net_unlevered,revenue,fund_equity_invested,_quarter,_revision_id,ownership_economic_percentage,enterprise_value_valuation_amount
0,111dts1,,10.0,,,,,,,USD,...,,,,,100.0,,,,1.0,
1,111dts2,,15.0,,,,,,,USD,...,,,,,120.0,,,,1.0,
2,b0cd7032-72f6-46d0-ae21-7a0ca81297eb,,,Actual,,203ffba5-3ebb-454a-844c-87cee656bd95,25bd1583-7869-465c-9dc4-664685cd3a6c,,,USD,...,,,,,,,1.0,cccb3423-eb71-452a-92f1-3b4a64100646,,
3,997bb98e-9ab1-47be-b04b-767d225f60a9,,7600000.0,Actual,,d40592d4-9127-4e77-a8c7-9da4755a6105,25bd1583-7869-465c-9dc4-664685cd3a6c,,,EUR,...,,0.046,,,164000000.0,,1.0,64ef422a-f3cc-44fe-bf2b-fe5955950008,,
4,1eab0e13-6d1d-4c99-a8a0-6c32b56de012,,34000000.0,Actual,,d40592d4-9127-4e77-a8c7-9da4755a6105,25bd1583-7869-465c-9dc4-664685cd3a6c,,,EUR,...,,0.121,,,280000000.0,,2.0,3e0b9962-f308-420f-9c62-5fa24f5b2e7e,,


Check 2 passed. Shape: (4958, 51). Showing 5 data rows above.
Wrote 4,958 rows to /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/ValueCreation/Data/working.csv
INIT check passed. Shape: (4958, 1)
Added columns: ['deal_id', 'reference_date', 'enterprise_value', 'net_debt', 'equity', 'reporting_currency_financials', 'reference_period_type_prefix', 'reference_period_type_suffix', 'revenue', 'ebitda', 'ownership_economic_percentage', 'data_room_name']. Wrote 4,958 rows to /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/ValueCreation/Data/working.csv.
ADD_COLUMNS (deal_time_series extra fields) check passed. Shape: (4958, 13)
unique_deals: 1165
Added columns from 'deal': ['fund_id', 'name', 'entry_date', 'entry_transaction_type', 'sourcing_type', 'investment_role', 'exit_date', 'exit_transaction_type']. Wrote 4