In [29]:
from pathlib import Path

# ----- Read Excel file ----- #
# Workbook location expressed relative to the directory that contains "InputData";
# find_upwards() (defined below) turns it into a resolved path at run time.
REL_PATH = Path("InputData/CoreData.xlsx")

def find_upwards(rel_path: Path, max_up: int = 8, start: "Path | None" = None) -> Path:
    """
    Locate `rel_path` in a start directory or in one of its ancestors.

    Parameters
    ----------
    rel_path : Path or str
        File or directory to look for, relative to each directory searched.
    max_up : int, default 8
        Maximum number of parent levels to climb above the start directory.
        0 checks the start directory only. Must be non-negative.
    start : Path or str, optional
        Directory where the search begins. Defaults to the current working
        directory.

    Returns
    -------
    Path
        Resolved path of the first match; the nearest directory wins.

    Raises
    ------
    ValueError
        If `max_up` is negative.
    FileNotFoundError
        If none of the searched directories contains `rel_path`. The message
        lists every location that was checked.
    """
    if max_up < 0:
        # A negative value would turn the slice below into "all but the last
        # N ancestors" rather than an empty search, so reject it explicitly.
        raise ValueError(f"max_up must be >= 0, got {max_up}")

    rel_path = Path(rel_path)  # accept plain strings as well
    here = Path.cwd() if start is None else Path(start).resolve()

    # Build the candidate list once; it feeds both the search and the diagnostics.
    candidates = [parent / rel_path for parent in [here, *here.parents][: max_up + 1]]

    for candidate in candidates:
        if candidate.exists():
            return candidate.resolve()

    # Helpful diagnostics
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {Path.cwd()}\n"
        f"- Checked: {[str(c) for c in candidates]}"
    )

import pandas as pd

# Resolve the workbook location, then open it once to discover its sheets.
INPUT_XLSX = find_upwards(REL_PATH)
xfile = pd.ExcelFile(INPUT_XLSX)
sheets = xfile.sheet_names

print("Resolved path:", INPUT_XLSX)
print("Sheets:", sheets)

# --- Check 1: sheet names form a non-empty list of unique, non-blank strings
assert isinstance(sheets, list), "Expected a list"
assert len(sheets) > 0 and not any(
    not isinstance(name, str) or not name.strip() for name in sheets
), "Sheet names must be non-empty strings"
assert len(set(sheets)) == len(sheets), "Duplicate sheet names detected"
print("Check 1 passed.")


Resolved path: /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/InputData/CoreData.xlsx
Sheets: ['Metadata', 'dashboard', 'general_partner', 'fund', 'fund_cash_flow', 'capital_account', 'deal', 'deal_time_series', 'deal_cash_flow', 'deal_partner', 'deal_acquirer', 'deal_vendor', 'organization', 'person']
Check 1 passed.


In [30]:
import pandas as pd

SHEET = "deal_time_series"
PREVIEW_ROWS = 5  # number of data rows shown in the preview

# Guard: make sure the sheet exists (uses `sheets` from Step 1)
assert SHEET in sheets, f"'{SHEET}' not found. Available sheets: {sheets}"

# Read with first row as header
dts = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, header=0)

# Display column names and the first PREVIEW_ROWS data rows
# (Excel rows 2 to PREVIEW_ROWS + 1, because Excel row 1 is the header)
dts_preview = dts.head(PREVIEW_ROWS)
print("Column names:", list(dts.columns))
display(dts_preview)

# --- Check 2
assert isinstance(dts, pd.DataFrame), "Expected a pandas DataFrame."
assert not dts.empty, "Sheet loaded but contains no data."
assert all(isinstance(c, str) and c.strip() for c in dts.columns), "Invalid/empty column names."
# Report the number of rows actually displayed (can be fewer than PREVIEW_ROWS on a short sheet).
print(f"Check 2 passed. Shape: {dts.shape}. Showing {len(dts_preview)} data rows above.")


Column names: ['id', 'total_value', 'ebitda', 'reference_period_type_suffix', 'moic_gross', 'data_room_id', 'created_by_user_id', 'recurring_revenue', 'bridge_financing', 'reporting_currency_financials', 'irr_net', 'reference_period_type_prefix', 'moic_net', 'data_room_name', 'realized_value', 'irr_gross', 'ebitda_adjusted', 'net_debt', 'ebitda_multiple', 'enterprise_value_valuation_rationale', 'is_main', 'equity', 'reporting_currency_valuation', 'management_equity_percentage', 'revenue_multiple', 'recurring_revenue_percentage', 'quarterly_company_update', '_created_at_utc', 'enterprise_value', 'enterprise_value_valuation_multiple', '_year', 'reference_date', 'irr_net_unlevered', 'capex', 'total_investment_cost', 'deal_revision_id', 'reported_date', 'deal_id', 'unrealized_value', 'predicted_sentiment', 'ebit', 'ebitda_adjusted_note', 'ebitda_margin', 'cumulative_addons', 'moic_net_unlevered', 'revenue', 'fund_equity_invested', '_quarter', '_revision_id', 'ownership_economic_percentage'

Unnamed: 0,id,total_value,ebitda,reference_period_type_suffix,moic_gross,data_room_id,created_by_user_id,recurring_revenue,bridge_financing,reporting_currency_financials,...,ebitda_adjusted_note,ebitda_margin,cumulative_addons,moic_net_unlevered,revenue,fund_equity_invested,_quarter,_revision_id,ownership_economic_percentage,enterprise_value_valuation_amount
0,b0cd7032-72f6-46d0-ae21-7a0ca81297eb,,,Actual,,203ffba5-3ebb-454a-844c-87cee656bd95,25bd1583-7869-465c-9dc4-664685cd3a6c,,,USD,...,,,,,,,1,cccb3423-eb71-452a-92f1-3b4a64100646,,
1,997bb98e-9ab1-47be-b04b-767d225f60a9,,7600000.0,Actual,,d40592d4-9127-4e77-a8c7-9da4755a6105,25bd1583-7869-465c-9dc4-664685cd3a6c,,,EUR,...,,0.046,,,164000000.0,,1,64ef422a-f3cc-44fe-bf2b-fe5955950008,,
2,1eab0e13-6d1d-4c99-a8a0-6c32b56de012,,34000000.0,Actual,,d40592d4-9127-4e77-a8c7-9da4755a6105,25bd1583-7869-465c-9dc4-664685cd3a6c,,,EUR,...,,0.121,,,280000000.0,,2,3e0b9962-f308-420f-9c62-5fa24f5b2e7e,,
3,9ae2e65b-ec9d-4215-8982-f39658e9fa1e,19839000.0,13747260.0,,,9d92005d-7097-4949-88ea-8eb6ff688a86,38ed8bb8-d707-4652-94c0-6a094d116b50,,,EUR,...,,,,,48663900.0,19839000.0,3,9ae2e65b-ec9d-4215-8982-f39658e9fa1e,1.0,
4,fa677592-875d-413e-91fe-c8af1dd99f63,40736000.0,18894000.0,,,9d92005d-7097-4949-88ea-8eb6ff688a86,38ed8bb8-d707-4652-94c0-6a094d116b50,,,EUR,...,,,,,393249800.0,40736000.0,2,fa677592-875d-413e-91fe-c8af1dd99f63,0.5,


Check 2 passed. Shape: (4956, 51). Showing 5 data rows above.


In [31]:
from pathlib import Path
import pandas as pd

# --- Config
SHEET = "deal_time_series"
TARGET_DIR = (find_upwards(Path("ValueCreation")) / "Data")
TARGET_DIR.mkdir(parents=True, exist_ok=True)
TARGET_CSV = TARGET_DIR / "working.csv"

assert SHEET in sheets, f"Sheet '{SHEET}' not found."

# Load only the key column; no filtering, no sorting.
# The id is read as a string so the key has the same dtype here, in the
# round-trip check below, and in later steps that join on it.
usecols = ["id"]
raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, usecols=usecols, dtype={"id": str})

# Preserve order exactly as in the sheet
df = raw[["id"]]

# Persist
df.to_csv(TARGET_CSV, index=False)
print(f"Wrote {len(df):,} rows to {TARGET_CSV}")


# --- INIT check: re-read the file and compare it with what was loaded
assert TARGET_CSV.exists(), f"Missing output: {TARGET_CSV}"
check_df = pd.read_csv(TARGET_CSV, dtype={"id": str})

# 1) Columns exactly as specified and in order
assert list(check_df.columns) == ["id"], list(check_df.columns)

# 2) Row count preserved
assert len(check_df) == len(raw), f"Row count changed: raw={len(raw)} vs written={len(check_df)}"

# 3) Key integrity: non-null and unique.
#    Checked before the order comparison: a null id compares unequal to itself
#    (NaN != NaN) and would otherwise be reported as "Row order changed."
assert check_df["id"].notna().all(), "Null id found."
assert not check_df["id"].duplicated().any(), "Duplicate id values found."

# 4) Order preserved: id sequence identical pre/post write
assert check_df["id"].tolist() == raw["id"].tolist(), "Row order changed."

print("INIT check passed. Shape:", check_df.shape)


Wrote 4,956 rows to /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/ValueCreation/Data/working.csv
INIT check passed. Shape: (4956, 1)


In [32]:
from pathlib import Path
import pandas as pd
import numpy as np

# --- Config
SHEET = "deal_time_series"
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# --- Load current working file and the source sheet
working = pd.read_csv(TARGET_CSV, dtype={"id": str})

requested = [
    "deal_id", "reference_date", "enterprise_value", "net_debt", "equity",
    "revenue", "ebitda", "ownership_economic_percentage", "data_room_name"
]
src = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, usecols=["id", *requested], dtype={"id": str})

# Normalize 'reference_date' to ISO yyyy-mm-dd text (CSV-friendly)
if "reference_date" in src.columns:
    s = src["reference_date"]
    if np.issubdtype(s.dtype, np.number):
        # Numeric column: interpret values as day counts from 1899-12-30 (Excel serial dates).
        dt = pd.to_datetime(s, unit="D", origin="1899-12-30", errors="coerce")
    else:
        dt = pd.to_datetime(s, errors="coerce")
    # Values that could not be parsed were coerced to NaT and end up as missing values.
    src["reference_date"] = dt.dt.strftime("%Y-%m-%d")

# Only add columns that aren't already present (makes a re-run of this cell a no-op)
to_add = [c for c in requested if c not in working.columns]
src = src[["id", *to_add]]

# Left-join on the string id while keeping the original row order.
# - `_ord` exists only inside this expression, so `working` itself is not modified.
# - validate="one_to_one" raises pandas.errors.MergeError if `id` is duplicated on
#   either side, instead of silently multiplying rows.
out = (
    working.assign(_ord=np.arange(len(working)))
    .merge(src, on="id", how="left", validate="one_to_one")
    .sort_values("_ord", kind="stable")
    .drop(columns="_ord")
)

# A left join on a unique key must neither add nor drop rows.
assert len(out) == len(working), f"Row count changed: before={len(working)} vs after={len(out)}"

# Save
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns: {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")


Added columns: ['deal_id', 'reference_date', 'enterprise_value', 'net_debt', 'equity', 'revenue', 'ebitda', 'ownership_economic_percentage', 'data_room_name']. Wrote 4,956 rows to /Users/michael/Library/Mobile Documents/com~apple~CloudDocs/Studium TUM/Master Management and Technology/06 Master Thesis/00 Thesis/05Code/ValueCreation/Data/working.csv.
ADD_COLUMNS check passed. Shape: (4956, 10)
