In [10]:
# anomaly_detection_isolationforest.py
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

file_path = "ongoing_projects2_clean.csv"
output_path = "ongoing_projects_with_anomaly_flag.csv"

# Source header -> canonical column name.
rename_map = {
    "Original Cost": "Original_Cost_RsCr",
    "Latest Revised Cost": "Latest_Revised_Cost_RsCr",
    "Project Count": "Project_Count",
    "STATE NAME": "State",
    "Allocated To": "Allocated_To",
}

# Model inputs, in the column order fed to the scaler and the forest.
features = ["Project_Count", "Cumulative_Expenditure_RsCr", "Cost_Escalation_Pct"]

# Columns shown in the quick-inspection table; absent ones are skipped.
display_columns = ["State", "Allocated_To", "Sector", "Project_Count", "Cost_Escalation_Pct"]


def _is_serial_column(name) -> bool:
    """Return True when *name* looks like a serial-number header.

    Accepts "Sl", "SlNo" and "Sl" followed by a non-letter ("Sl. No.",
    "Sl_No", "SL 1").  Names that merely contain the letters "sl"
    ("Slippage", "Legislature") are rejected so real data is not dropped.
    """
    label = str(name).strip().lower()
    if not label.startswith("sl"):
        return False
    tail = label[2:]
    if tail.startswith("no"):
        tail = tail[2:]
    # An empty tail or a tail starting with punctuation/space/digit is a match.
    return not tail[:1].isalpha()


def _require_columns(df: pd.DataFrame, names) -> None:
    """Raise KeyError naming every column in *names* that *df* lacks."""
    missing = [name for name in names if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")


def _to_finite_numeric(series: pd.Series) -> pd.Series:
    """Coerce *series* to numbers; unparseable and infinite values become NaN."""
    numeric = pd.to_numeric(series, errors="coerce")
    return numeric.replace([np.inf, -np.inf], np.nan)


def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalised column names.

    Steps: strip whitespace from headers, rename the cumulative-expenditure
    column, apply ``rename_map`` and drop serial-number columns.
    """
    df = df.rename(columns=lambda name: str(name).strip())

    # Rename only the first "Cumulative" column, and only when the canonical
    # name is still free; renaming every match would create duplicate labels.
    target = "Cumulative_Expenditure_RsCr"
    if target not in df.columns:
        source = next((name for name in df.columns if "Cumulative" in name), None)
        if source is not None:
            df = df.rename(columns={source: target})

    # Skip mappings whose canonical name already exists, for the same reason.
    applicable = {
        old: new
        for old, new in rename_map.items()
        if old in df.columns and new not in df.columns
    }
    df = df.rename(columns=applicable)

    serial_columns = [name for name in df.columns if _is_serial_column(name)]
    return df.drop(columns=serial_columns)


def add_cost_escalation(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* that has a ``Cost_Escalation_Pct`` column.

    An existing column is kept untouched.  Otherwise the value is
    ``(revised - original) / original * 100``, and NaN where the original
    cost is missing, zero or not numeric.

    Raises:
        KeyError: if the column must be computed and a cost column is absent.
    """
    df = df.copy()
    if "Cost_Escalation_Pct" in df.columns:
        return df

    _require_columns(df, ["Original_Cost_RsCr", "Latest_Revised_Cost_RsCr"])
    original = pd.to_numeric(df["Original_Cost_RsCr"], errors="coerce")
    revised = pd.to_numeric(df["Latest_Revised_Cost_RsCr"], errors="coerce")
    # Mask zero baselines so the division yields NaN rather than +/-inf.
    baseline = original.where(original != 0)
    df["Cost_Escalation_Pct"] = (revised - baseline) / baseline * 100
    return df


def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose feature columns are finite numbers.

    Missing counts and expenditures become 0.  Missing escalation values
    take the median of the valid ones, or 0 when there are none.

    Raises:
        KeyError: if any column listed in ``features`` is absent.
    """
    _require_columns(df, features)
    df = df.copy()

    for column in ("Project_Count", "Cumulative_Expenditure_RsCr"):
        df[column] = _to_finite_numeric(df[column]).fillna(0)

    # Coerce first, then take the median: the raw column may hold text.
    escalation = _to_finite_numeric(df["Cost_Escalation_Pct"])
    fill_value = escalation.median()
    if pd.isna(fill_value):
        fill_value = 0.0
    df["Cost_Escalation_Pct"] = escalation.fillna(fill_value)
    return df


# A notebook cell and a direct script run both execute as "__main__";
# a plain import only defines the helpers above.
if __name__ == "__main__":
    # --- Load and clean data ---
    df = pd.read_csv(file_path)
    df = clean_columns(df)

    # --- Compute cost escalation percentage ---
    df = add_cost_escalation(df)

    # --- Prepare features for anomaly detection ---
    df = prepare_features(df)
    X = df[features].values

    # --- Scale & detect anomalies ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    iso = IsolationForest(contamination=0.03, random_state=42)
    anom_pred = iso.fit_predict(X_scaled)

    # IsolationForest returns -1 for anomaly, 1 for normal
    df["Anomaly_Flag"] = np.where(anom_pred == -1, 1, 0)

    # --- Output summary ---
    print("✅ Anomaly detection complete!")
    print(f"Total anomalies detected: {df['Anomaly_Flag'].sum()} out of {len(df)} records")

    # Save updated dataset
    df.to_csv(output_path, index=False)
    print(f"\nSaved: {output_path}")

    # Optional: show top anomalous rows (for quick inspection)
    shown = [column for column in display_columns if column in df.columns]
    flagged = df[df["Anomaly_Flag"] == 1]
    top_rows = flagged.sort_values("Cost_Escalation_Pct", ascending=False)[shown].head(10)
    print("\nTop anomalous records:")
    print(top_rows.to_string(index=False))


✅ Anomaly detection complete!
Total anomalies detected: 8 out of 263 records

Saved: ongoing_projects_with_anomaly_flag.csv

Top anomalous records:
         State                                           Allocated_To                 Sector  Project_Count  Cost_Escalation_Pct
Andhra Pradesh Department of Water Resources, River\nDevelopment & GR        Water Resources              3           425.335963
         Assam                                      Ministry of Power Electricity Generation              1           314.862863
       Haryana                                   Ministry of Railways               Railways              4           136.360916
     Rajasthan                                   Ministry of Railways               Railways             21            87.263963
     PAN India                       Department of Telecommunications      Telecommunication              6            86.207548
 Uttar Pradesh                                   Ministry of Railways         