In [1]:
# budget_efficiency_clustering.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the project export. Header cells may carry stray whitespace, so strip
# it before any name matching is attempted.
file_path = "ongoing_projects2_clean.csv"
df = pd.read_csv(file_path).rename(columns=lambda c: c.strip())

# Minimal renames (match your file).
# Each entry is (substring to look for, canonical column name). Order matters:
# a column is renamed according to the FIRST pattern its header contains, so
# the specific cost/count patterns take priority over the broad "Cumulative".
_COLUMN_PATTERNS = (
    ("Original Cost", "Original_Cost_RsCr"),
    ("Latest Revised Cost", "Latest_Revised_Cost_RsCr"),
    ("Project Count", "Project_Count"),
    ("Cumulative", "Cumulative_Expenditure_RsCr"),
)

# Collect every rename up front and apply them in one pass instead of
# rebuilding the DataFrame once per matching column.
rename_map = {}
for col in df.columns:
    for pattern, canonical in _COLUMN_PATTERNS:
        if pattern in col:
            rename_map[col] = canonical
            break
# NOTE(review): if two source headers match the same pattern the frame ends up
# with duplicate column names; the input file is assumed not to do that.
df = df.rename(columns=rename_map)

# Feature selection: the three columns the clustering below is run on.
features = ["Project_Count", "Cumulative_Expenditure_RsCr", "Cost_Escalation_Pct"]

# Compute escalation if missing: percentage change from the original cost to
# the latest revised cost.
if "Cost_Escalation_Pct" not in df.columns:
    # Fail with a message that names the absent column(s) rather than letting
    # a missing column surface later as an opaque attribute error on None.
    missing_cost_cols = [
        name
        for name in ("Original_Cost_RsCr", "Latest_Revised_Cost_RsCr")
        if name not in df.columns
    ]
    if missing_cost_cols:
        raise KeyError(
            "Cannot derive Cost_Escalation_Pct; missing column(s): "
            + ", ".join(missing_cost_cols)
        )

    original_cost = df["Original_Cost_RsCr"]
    latest_cost = df["Latest_Revised_Cost_RsCr"]
    # Rows without a usable baseline (NaN or zero original cost) get NaN
    # instead of a meaningless or infinite percentage.
    has_baseline = original_cost.notna() & (original_cost != 0)
    df["Cost_Escalation_Pct"] = np.where(
        has_baseline,
        (latest_cost - original_cost) / original_cost * 100,
        np.nan,
    )

# Fill NaNs (median for escalation, zeros for expenditure/project_count if necessary)
df["Cost_Escalation_Pct"] = df["Cost_Escalation_Pct"].fillna(df["Cost_Escalation_Pct"].median())
df["Cumulative_Expenditure_RsCr"] = df["Cumulative_Expenditure_RsCr"].fillna(0)
df["Project_Count"] = df["Project_Count"].fillna(0)

# Standardise the features so that the column with the largest raw magnitude
# does not dominate the Euclidean distances KMeans works with.
X = df[features].to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# KMeans clustering - k=3 (you can change)
k = 3
# n_init is set explicitly: its default differs between scikit-learn releases,
# so leaving it out makes the number of restarts (and potentially the
# resulting clusters) depend on the installed version.
model = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = model.fit_predict(X_scaled)
df["Cluster"] = labels

# Cluster centers in original scale (undo the standardisation so the numbers
# are readable in the features' own units).
centers_orig = scaler.inverse_transform(model.cluster_centers_)
centers_df = pd.DataFrame(centers_orig, columns=features)
# Row i of cluster_centers_ is the centre of the cluster labelled i.
centers_df["Cluster"] = centers_df.index

print("Cluster sizes:\n", df["Cluster"].value_counts().sort_index())
print("\nCluster centers (approx original scale):\n", centers_df.to_string(index=False))

# Save cluster assignment: the full frame, including the new "Cluster" column,
# without the index. The path is named once so the confirmation message cannot
# drift from the file actually written.
output_path = "clustering_with_labels.csv"
df.to_csv(output_path, index=False)
print(f"\nSaved {output_path}")


Cluster sizes:
 Cluster
0    251
1      7
2      5
Name: count, dtype: int64

Cluster centers (approx original scale):
  Project_Count  Cumulative_Expenditure_RsCr  Cost_Escalation_Pct  Cluster
      3.163347                  6900.048845            11.050748        0
     25.142857                136581.305714            56.980389        1
      1.600000                 11503.374000           264.594328        2

Saved clustering_with_labels.csv
