In [2]:
 # === Load and Prepare Data ===
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data
df = pd.read_csv("data_for_algo2")

# Select numeric columns for analysis
num_cols = ['Expenditure', 'Progress', 'Org_Cost', 'Rev_Cost',
            'Cost_Diff', 'Cost_Ratio', 'Exp_Cost_Ratio']
X = df[num_cols].fillna(0)

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === 3. Predictive Modeling: Cost Overrun ===
df['Cost_Overrun'] = (df['Rev_Cost'] > df['Org_Cost']).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, df['Cost_Overrun'], test_size=0.2, random_state=42)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("\nCost Overrun Prediction Report:")
print(classification_report(y_test, y_pred))



Cost Overrun Prediction Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       127
           1       1.00      1.00      1.00        53

    accuracy                           1.00       180
   macro avg       1.00      1.00      1.00       180
weighted avg       1.00      1.00      1.00       180



In [3]:
df.sample(10)

Unnamed: 0,SNo,State,Expenditure,Progress,Project_Name,Agency,Dateof_Approval,Start_Date,Org_DoC,Rev_DoC,Org_Cost,Rev_Cost,Cost_Diff,Cost_Ratio,Exp_Cost_Ratio,anomaly_score,lof_score,recon_error,combined_anomaly,Cost_Overrun
180,164.0,Jammu and Kashmir,7238.73,72.0,"Pakal Dul [Drangdhuran] Hydroelectric Project,...",National Highways & Infrastructure Development...,10/2014,02/2018,04/2020,12/2026,8112.12,12728.0,4615.88,1.568817,0.892225,0,0,0.001155,0.0,1
587,516.0,Chhattisgarh,14.42,60.0,4TH LINE - DADHAPARA-BELHA [9.2 KM] [PB-72/NR/...,SECR,,05/2024,11/2026,11/2026,169.06,169.06,0.0,0.99412,0.084794,0,0,0.002156,0.0,0
807,722.0,Uttar Pradesh,2051.69,94.0,Bhatni-Aurnihar with electrification excluding...,RVNL - II,03/2018,03/2017,12/2024,10/2025,2529.0,2529.46,0.46,0.999787,0.810945,0,0,0.000263,0.0,1
624,551.0,Gujarat,787.69,76.0,Rajkot - Kanalus Doubling Project [111.2 Kms],Western Railway [WR] - II,10/2021,12/2021,10/2026,10/2026,1080.58,1080.58,0.0,0.999075,0.728277,0,0,0.000199,0.0,0
101,94.0,Maharashtra,0.02,4.0,BALLARPUR NORTH WEST OC MINE,Western Coalfields Limited [WCL],12/2022,12/2022,,,360.81,360.81,0.0,0.997236,5.5e-05,0,0,0.000377,0.0,0
564,495.0,Bihar,1490.2,35.0,Araria Supaul New BG Line,East Central Railway [ECR] - I,06/2018,03/2009,03/2024,01/2027,1605.0,2514.0,909.0,1.56538,0.927895,0,0,0.004557,0.0,1
159,147.0,Tamil Nadu,161.73,50.0,50 MW Solar PP Minedout Area,NLC India Limited [NLCIL],,02/2024,03/2025,09/2025,218.03,218.03,0.0,0.995434,0.738392,0,0,0.002442,0.0,0
296,266.0,Maharashtra,1045.32,25.0,Nagpur Metro Rail Phase II Development Project,Maharashtra Metro Rail Corporation Limited [MM...,12/2022,12/2022,,,6708.0,6708.0,0.0,0.999851,0.155809,0,0,0.003065,0.0,0
223,202.0,Karnataka,1332.21,46.0,Unnamed Project - Unknown Agency,Unknown Agency,12/2023,12/2023,12/2025,03/2026,2889.39,2889.39,0.0,0.999654,0.46091,0,0,0.001039,0.0,0
166,151.0,Jharkhand,596.94,34.0,Tubed Coal Mine,Damodar Valley Corporation,,10/2016,05/2026,05/2026,1581.0,1581.0,0.0,0.999368,0.377332,0,0,0.000788,0.0,0


In [4]:
df.to_csv("costoverrun.csv", index=False)