In [57]:
import pandas as pd
import numpy as np

# Tokens regarded as invalid cell contents (Excel error markers and "=-inf").
# The set is not referenced by the cleaning below, where any non-numeric text
# is coerced to NaN; it is kept unchanged in case other cells rely on it.
EXCEL_ERRORS = {
    "#NOME?", "#DIV/0!", "#VALUE!", "#REF!",
    "#NUM!", "#N/A", "#NULL!", "=-inf"
}


def clean_objective_columns(frame, columns):
    """Return a copy of *frame* whose *columns* contain only finite numbers.

    Each selected column is converted to text and parsed with
    ``pd.to_numeric(errors="coerce")``, so every value that is not a number
    (for example ``#DIV/0!`` or ``=-inf``) becomes NaN. Values that parse to
    positive or negative infinity are then replaced by NaN as well, so that
    a subsequent ``dropna`` removes those rows.

    Args:
        frame: Input DataFrame; it is not modified.
        columns: List of column labels to clean.

    Returns:
        A new DataFrame with the same rows and columns as *frame*, in which
        the selected columns are numeric and free of infinities.
    """
    cleaned = frame.copy()

    # Going through str first keeps the handling uniform whether a column was
    # read as numbers or as text.
    numeric = cleaned[columns].astype(str).apply(pd.to_numeric, errors="coerce")

    # Mask infinities on the parsed values instead of pattern-matching the
    # text: a case-sensitive regex misses spellings such as "Inf" or
    # "-INFINITY", and dropna() does not treat +/-inf as missing.
    cleaned[columns] = numeric.replace([np.inf, -np.inf], np.nan)
    return cleaned


# The guard is true both in a notebook kernel and when run as a script, so
# the names below remain module-level globals exactly as before.
if __name__ == "__main__":
    # Load raw dataset
    df = pd.read_csv(
        "datasets/MOU2D.csv",
        sep=",",
        engine="python"
    )

    # Clean column names
    df.columns = df.columns.str.strip()

    print("Columns:", df.columns.tolist())

    # ---- DEFINE OBJECTIVE COLUMNS (FIX THIS IF NEEDED) ----
    # The objectives are taken to be the first three columns of the file.
    obj_cols = df.columns[:3].tolist()
    print("Using objectives:", obj_cols)

    # Convert objectives to finite numbers; everything else becomes NaN
    df = clean_objective_columns(df, obj_cols)

    # Drop invalid rows
    before = len(df)
    df_clean = df.dropna(subset=obj_cols)
    after = len(df_clean)

    print(f"Removed {before - after} invalid rows")
    print("Final shape:", df_clean.shape)

    # Save cleaned dataset (note: written with ";" although read with ",")
    df_clean.to_csv(
        "datasets/MOU2D_cleaned.csv",
        sep=";",
        decimal=".",
        index=False
    )


Columns: ['COP', 'CWP', 'NPV', 'nWWcat', 'nWPr05', 'nWPr11', 'nWPr12', 'nWPr13', 'nWPr18', 'nWPr19', 'nWPr20', 'nWPr24', 'nWPr25', 'nWPr26', 'nWPr27', 'nWPr29', 'nWPr30', 'nWPr31', 'nWPr32', 'nWPr33', 'nWPr34', 'nWPr35', 'nWPr36', 'nWPr37', 'nWPr38', 'nWPr39', 'nWPr40', 'nWPr41', 'nWPr42', 'nWPr43', 'nWPr44', 'nWPr45', 'nWPr46', 'nWPr47', 'nWPr48', 'nWPr49', 'nWPr50', 'nWPr51', 'nWPr52', 'nWPr53', 'nWPr54', 'nWPr55', 'nWPr56', 'nWPr57', 'nWPr58', 'nWPr59', 'nWPr60', 'nWPr61', 'nWPr62', 'nWPr63', 'nWPr64', 'nWPr65', 'nWPr66', 'nWPr67', 'nWPr68', 'nWPr69', 'nWPr70', 'nWPr72', 'nWPr73', 'nWPr74', 'nWPr75', 'nWPr76', 'nWPr79', 'nWPr80', 'nWPr81', 'nWPr82', 'nWPr83', 'nWPr84', 'nWIn04', 'nWIn05', 'nWIn10', 'nWIn11', 'nWIn12', 'nWIn13', 'nWIn17', 'nWIn18', 'nWIn19', 'nWIn20', 'nWIn24', 'nWIn25', 'nWIn26', 'nWIn27', 'nWIn29', 'nWIn30', 'nWIn31', 'nWIn32', 'nWIn33', 'nWIn34', 'nWIn35', 'nWIn36', 'nWIn38', 'nWIn39', 'nWIn40', 'nWIn41', 'nWIn43', 'nWIn44', 'nWIn45', 'nWIn46', 'nWIn47', 'nWIn48',