In [None]:
from pathlib import Path

# ----- Read Excel file ----- #

# Location of the input workbook, relative to whichever directory
# `find_upwards` matches while searching the cwd and its parent directories.
REL_PATH = Path("InputData/CoreData.xlsx")

def find_upwards(rel_path: Path, max_up: int = 8) -> Path:
    """
    Locate `rel_path` by searching the current working directory and its parents.

    The cwd is checked first, then each parent in turn (nearest first), up to
    `max_up` levels above the cwd.

    Parameters
    ----------
    rel_path : Path
        Path that is joined onto every searched directory.
    max_up : int, default 8
        Maximum number of parent levels to climb. 0 searches the cwd only.
        Must not be negative.

    Returns
    -------
    Path
        The resolved path of the first candidate that exists.

    Raises
    ------
    ValueError
        If `max_up` is negative. A negative value would otherwise turn into a
        negative slice end and silently search an unintended set of directories.
    FileNotFoundError
        If no candidate exists; the message lists every path that was checked.
    """
    if max_up < 0:
        raise ValueError(f"max_up must be >= 0, got {max_up}")

    here = Path.cwd()
    # Build the candidate list once so the search and the diagnostics agree.
    candidates = [base / rel_path for base in [here, *here.parents][: max_up + 1]]

    for candidate in candidates:
        if candidate.exists():
            return candidate.resolve()

    # Helpful diagnostics
    raise FileNotFoundError(
        f"Couldn't locate '{rel_path.as_posix()}' from {here} by walking up {max_up} levels.\n"
        f"- Current working directory: {here}\n"
        f"- Checked: {[str(c) for c in candidates]}"
    )

import pandas as pd

# Resolve the workbook location once; INPUT_XLSX is reused by the cells below.
INPUT_XLSX = find_upwards(REL_PATH)

# Read the sheet names. The context manager closes the workbook handle as soon
# as the names are read instead of leaving the file open for the whole session.
with pd.ExcelFile(INPUT_XLSX) as xfile:
    sheets = xfile.sheet_names
print("Resolved path:", INPUT_XLSX)
print("Sheets:", sheets)

# --- Check 1: sheet names form a non-empty list of unique, non-blank strings
assert isinstance(sheets, list), "Expected a list"
assert sheets and all(isinstance(s, str) and s.strip() for s in sheets), "Sheet names must be non-empty strings"
assert len(sheets) == len(set(sheets)), "Duplicate sheet names detected"
print("Check 1 passed.")


In [None]:
import pandas as pd

#----- Quickly check whether also the sheets get read correctly -----#

SHEET = "deal_time_series"

# Guard: the sheet has to be among the names collected in `sheets`
assert SHEET in sheets, f"'{SHEET}' not found. Available sheets: {sheets}"

# Load the sheet; its first row supplies the column names
dts = pd.read_excel(
    INPUT_XLSX,
    sheet_name=SHEET,
    header=0,
)

# Show the column names and the first 5 data rows
column_names = list(dts.columns)
print("Column names:", column_names)
display(dts.head(5))

# --- Check 2: a non-empty DataFrame whose column names are non-blank strings
assert isinstance(dts, pd.DataFrame), "Expected a pandas DataFrame."
assert not dts.empty, "Sheet loaded but contains no data."
names_ok = all(isinstance(col, str) and col.strip() for col in column_names)
assert names_ok, "Invalid/empty column names."
print(f"Check 2 passed. Shape: {dts.shape}. Showing 5 data rows above.")


In [None]:
from pathlib import Path
import pandas as pd

#----- Create output CSV and take the deal_time_series_id as starter column -----#

# Config
SHEET = "deal_time_series"
TARGET_DIR = (find_upwards(Path("ValueCreation")) / "Data")
TARGET_DIR.mkdir(parents=True, exist_ok=True)
TARGET_CSV = TARGET_DIR / "working.csv"

assert SHEET in sheets, f"Sheet '{SHEET}' not found."

# Load only the key column; no filtering, no sorting.
# The id is read as text (as the later cells do) so that the Excel read and the
# CSV read-back cannot infer different types for the same values.
usecols = ["id"]
raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET, usecols=usecols, dtype={"id": str})

# Key integrity is verified BEFORE anything is written, so a bad key column
# never produces a working file.
assert raw["id"].notna().all(), "Null id found."
assert not raw["id"].duplicated().any(), "Duplicate id values found."

# Preserve order exactly as in the sheet
df = raw[["id"]]

# Persist
df.to_csv(TARGET_CSV, index=False)
print(f"Wrote {len(df):,} rows to {TARGET_CSV}")

# ----- INIT check: read the file back and compare it with what was loaded -----
assert TARGET_CSV.exists(), f"Missing output: {TARGET_CSV}"
check_df = pd.read_csv(TARGET_CSV, dtype={"id": str})

# 1) Columns exactly as specified and in order
assert list(check_df.columns) == ["id"], list(check_df.columns)

# 2) Row count preserved
assert len(check_df) == len(raw), f"Row count changed: raw={len(raw)} vs written={len(check_df)}"

# 3) Key integrity after the round trip (checked before the order comparison,
#    because a null would otherwise surface as a misleading "Row order changed.")
assert check_df["id"].notna().all(), "Null id found."
assert not check_df["id"].duplicated().any(), "Duplicate id values found."

# 4) Order preserved: id sequence identical pre/post write
assert check_df["id"].tolist() == raw["id"].tolist(), "Row order changed."

print("INIT check passed. Shape:", check_df.shape)


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# ----- Add all needed columns from deal_time_series to the output CSV -----#

SHEET = "deal_time_series"
TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load working file and source; ids are read as text so both join keys match
working = pd.read_csv(TARGET_CSV, dtype={"id": str})

requested = [
    "deal_id", "reference_date", "enterprise_value", "net_debt", "equity",
    "reporting_currency_financials", "reference_period_type_prefix",
    "reference_period_type_suffix", "revenue", "ebitda",
    "ownership_economic_percentage", "data_room_name",
]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name=SHEET,
    usecols=["id", *requested],
    dtype={"id": str},
)

# Normalize 'reference_date' to ISO yyyy-mm-dd text (CSV-friendly).
# Numeric values are interpreted as Excel serial day counts; anything else is
# handed to the general date parser. Values that cannot be parsed become blank.
if "reference_date" in src.columns:
    ref_raw = src["reference_date"]
    if np.issubdtype(ref_raw.dtype, np.number):
        ref_dt = pd.to_datetime(ref_raw, unit="D", origin="1899-12-30", errors="coerce")
    else:
        ref_dt = pd.to_datetime(ref_raw, errors="coerce")
    src["reference_date"] = ref_dt.dt.strftime("%Y-%m-%d")

# Only add columns not already present
to_add = [c for c in requested if c not in working.columns]
src = src[["id", *to_add]]

# Left-join on id. A temporary position column restores the original row order
# explicitly; it lives only inside this chain, so `working` stays unmodified.
out = (
    working.assign(_ord=np.arange(len(working)))
    .merge(src, on="id", how="left")
    .sort_values("_ord")
    .drop(columns="_ord")
)

# Validate BEFORE saving: a fan-out join (duplicate ids in the source) must not
# be allowed to overwrite the working file.
assert len(out) == len(working), (
    f"Join on 'id' changed the row count ({len(working)} -> {len(out)}); "
    f"nothing was written. Check '{SHEET}' for duplicate ids."
)
assert out["id"].tolist() == working["id"].tolist(), "Order changed."

# Save
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns: {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# ----- Read-back checks -----
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Row count preserved
assert len(after) == len(working), "Row count changed."

# Order preserved
assert after["id"].tolist() == working["id"].tolist(), "Order changed."

# Requested columns present
missing = [c for c in requested if c not in after.columns]
assert not missing, f"Missing columns: {missing}"

# reference_date parses or is blank: every non-blank value must be ISO yyyy-mm-dd
ref_text = after["reference_date"]
ref_parsed = pd.to_datetime(ref_text, format="%Y-%m-%d", errors="coerce")
unparsed = ref_text.notna() & ref_parsed.isna()
assert not unparsed.any(), f"{int(unparsed.sum())} reference_date value(s) are not ISO yyyy-mm-dd."
print("ADD_COLUMNS (deal_time_series extra fields) check passed. Shape:", after.shape)

# `after` is the file that was just written, with deal_id read as text
print("unique_deals:", after["deal_id"].nunique())


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# ----- Add all needed columns from deal to the output CSV -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

requested = [
    "fund_id", "name", "entry_date", "entry_transaction_type",
    "sourcing_type", "investment_role", "exit_date", "exit_transaction_type",
]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="deal",
    usecols=["id", *requested],
    dtype={"id": str},
)

# Only add columns that aren't already present; the sheet's own 'id' becomes
# the join key 'deal_id'
to_add = [c for c in requested if c not in working.columns]
src = src[["id", *to_add]].rename(columns={"id": "deal_id"})

# Left-join on deal_id. A temporary position column restores the original row
# order explicitly; it lives only inside this chain, so `working` stays unmodified.
out = (
    working.assign(_ord=np.arange(len(working)))
    .merge(src, on="deal_id", how="left")
    .sort_values("_ord")
    .drop(columns="_ord")
)

# Validate BEFORE saving: a fan-out join (duplicate ids in the 'deal' sheet)
# must not be allowed to overwrite the working file.
assert len(out) == len(working), (
    f"Join on 'deal_id' changed the row count ({len(working)} -> {len(out)}); "
    "nothing was written. Check sheet 'deal' for duplicate ids."
)
assert out["id"].tolist() == working["id"].tolist(), "Order changed."

# Save
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'deal': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# ----- Read-back checks -----
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})

# Row count preserved
assert len(after) == len(working), "Row count changed."

# Order preserved
assert after["id"].tolist() == working["id"].tolist(), "Order changed."

# Requested columns present. All of `requested` is checked (not only `to_add`),
# so the check stays meaningful when the cell is re-run and nothing new is added.
missing = [c for c in requested if c not in after.columns]
assert not missing, f"Missing columns after merge: {missing}"

print("ADD_COLUMNS (deal) check passed. Shape:", after.shape)

# `after` is the file that was just written, with deal_id read as text
print("unique_deals:", after["deal_id"].nunique())


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# ----- Add all needed columns from fund to the output CSV -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

# Load working file
working = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})

requested = ["name", "vintage_year", "investment_theme", "size", "fund_generation", "fund_family_generation"]
src = pd.read_excel(
    INPUT_XLSX,
    sheet_name="fund",
    usecols=["id", *requested],
    dtype={"id": str},
)

# Rename to avoid collision (the working file already carries a 'name' column
# taken from the 'deal' sheet) and to align the join key
src = src.rename(columns={"id": "fund_id", "name": "fund_name"})

# Requested columns under their post-rename names; derived from `requested`
# so the list exists in exactly one place
expected = ["fund_name" if c == "name" else c for c in requested]

# Only add columns that aren't already present
to_add = [c for c in expected if c not in working.columns]
src = src[["fund_id", *to_add]]

# Left join on fund_id. A temporary position column restores the original row
# order explicitly; it lives only inside this chain, so `working` stays unmodified.
out = (
    working.assign(_ord=np.arange(len(working)))
    .merge(src, on="fund_id", how="left")
    .sort_values("_ord")
    .drop(columns="_ord")
)

# Validate BEFORE saving: a fan-out join (duplicate ids in the 'fund' sheet)
# must not be allowed to overwrite the working file.
assert len(out) == len(working), (
    f"Join on 'fund_id' changed the row count ({len(working)} -> {len(out)}); "
    "nothing was written. Check sheet 'fund' for duplicate ids."
)
assert out["id"].tolist() == working["id"].tolist(), "Order changed."

# Save
out.to_csv(TARGET_CSV, index=False)
print(f"Added columns from 'fund': {to_add}. Wrote {len(out):,} rows to {TARGET_CSV}.")

# ----- Read-back checks -----
after = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str, "fund_id": str})

# Row count preserved
assert len(after) == len(working), "Row count changed."

# Order preserved
assert after["id"].tolist() == working["id"].tolist(), "Order changed."

# Requested columns (with rename) present
missing = [c for c in expected if c not in after.columns]
assert not missing, f"Missing columns after merge: {missing}"

print("ADD_COLUMNS (fund) check passed. Shape:", after.shape)

# `after` is the file that was just written, with deal_id read as text
print("unique_deals:", after["deal_id"].nunique())


In [None]:
from pathlib import Path
import pandas as pd

#----- Quick test to get all rows of one deal_id in the output CSV -----#

TARGET_CSV = (find_upwards(Path("ValueCreation")) / "Data" / "working.csv")

deal_id_value = "49328606-087c-4288-972a-614c19bd519e"
# Ids used for manual testing:
#   "49328606-087c-4288-972a-614c19bd519e"  NA test DEAL
#   "30f4104d-0343-4031-a729-ec81b646861a"  Visma DEAL
df = pd.read_csv(TARGET_CSV, dtype={"id": str, "deal_id": str})
subset = df.loc[df["deal_id"] == deal_id_value]

print(f"Rows for deal_id={deal_id_value}: {len(subset)}")
display(subset)

assert not subset.empty, "No rows found for the specified deal_id."
print("Filter check passed.")

# To filter by deal name instead of id, select with
#   df.loc[df["name"].astype(str).str.strip() == "Visma DEAL"]
# (this alternative used to be kept here as a no-op triple-quoted string).

# `df` already holds the working file with deal_id read as text, so the count
# is taken from it instead of resolving the path and reading the CSV again.
print("unique_deals:", df["deal_id"].nunique())
