In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("../data/processed/cleaned_data.csv")

df.head()


Unnamed: 0,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Capacity,Landfill_Lat,Landfill_Long,Distance_to_Landfill_km,...,City_Visakhapatnam,Waste_Type_Construction,Waste_Type_E-Waste,Waste_Type_Hazardous,Waste_Type_Organic,Waste_Type_Plastic,Disposal_Method_Composting,Disposal_Method_Incineration,Disposal_Method_Landfill,Disposal_Method_Recycling
0,0.483872,68,-0.34686,0.929612,0.217581,0.674989,-0.688562,22.4265,77.4931,-1.620291,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,-1.465258,56,-0.34686,-1.394418,-0.00036,0.345348,-0.688562,22.4265,77.4931,-1.620291,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.041074,53,-0.34686,0.348604,0.479424,0.510168,-0.688562,22.4265,77.4931,-1.620291,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.316444,56,-0.34686,-1.394418,-1.003829,0.674989,-0.688562,22.4265,77.4931,-1.620291,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.082665,44,-0.34686,-0.232403,-0.437026,1.00463,-0.688562,22.4265,77.4931,-1.620291,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
model_data = df.drop("Recycling_Rate", axis=1)
y = df["Recycling_Rate"]
print("\nFeatures:", model_data.shape)
print("\nTarget Summary:\n", y.describe())


Features: (850, 53)

Target Summary:
 count    850.000000
mean      57.076471
std       16.129994
min       30.000000
25%       43.000000
50%       56.000000
75%       71.000000
max       85.000000
Name: Recycling_Rate, dtype: float64


In [4]:
#Splitting data into train and test 

X_train, X_test, y_train, y_test = train_test_split(model_data, y, test_size=0.2, random_state=42)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)


Training Set Shape: (680, 53)
Testing Set Shape: (170, 53)


In [5]:
# Training the models
from sklearn.model_selection import cross_val_score
# Define models
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Function to train and evaluate
def train_model_and_evaluate(models, X_train, y_train, X_test, y_test):
    
    df = pd.DataFrame(columns=['Model', 'RMSE', 'R2', 'CV R2'])
    
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict and evaluate
        y_pred = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)  # RMSE
        r2 = r2_score(y_test, y_pred)
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
        cv_r2_mean = cv_scores.mean()
        
        # Store results
        df.loc[len(df)] = {'Model': model_name, 'RMSE': rmse, 'R2': r2, 'CV R2': cv_r2_mean}
    
    return df.sort_values(by='R2', ascending=False).reset_index(drop=True)

# Run the evaluation
results = train_model_and_evaluate(models, X_train, y_train, X_test, y_test)
results

Unnamed: 0,Model,RMSE,R2,CV R2
0,RandomForest,17.419508,-0.11553,-0.151821
1,GradientBoosting,17.945586,-0.183927,-0.19213
2,XGBoost,19.535721,-0.403035,-0.329376


In [6]:
from sklearn.model_selection import GridSearchCV
param_grids = {
    'Random Forest': {
        'estimator': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],  # Research suggests 100-200 for balance
            'max_depth': [10, 20],  # Moderate for environmental data
            'min_samples_split': [5, 10],  # Reduces noise
            'min_samples_leaf': [2, 4]  # Prevents overfitting
        }
    },
    'XGBoost': {
        'estimator': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],  # 100-200 for waste prediction
            'max_depth': [3, 6],  # Low to moderate for regression
            'learning_rate': [0.01, 0.1],  # Low rates improve accuracy
            'subsample': [0.8, 1.0],  # Handles subsample variance
            'min_child_weight': [1, 5]  # Controls tree growth
        }
    },
    'Gradient Boosting': {
        'estimator': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],  # Balanced for RMSE
            'max_depth': [3, 6],  # Moderate for solid waste data
            'learning_rate': [0.01, 0.1]  # Low for better convergence
        }
    }
}

# Store results
tuned_models = pd.DataFrame(columns=['Model', 'RMSE', 'R2'])

# Loop through models
for model_name, config in param_grids.items():
    print(f"\n🔍 Tuning {model_name}...")
    
    grid_search = GridSearchCV(
        estimator=config['estimator'],
        param_grid=config['params'],
        cv=3,
        scoring='r2',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)  
    
    y_pred = grid_search.predict(X_test)
    
    rmse =root_mean_squared_error(y_test, y_pred)  
    r2 = r2_score(y_test, y_pred)
    
    tuned_models.loc[len(tuned_models)] = {
        'Model': model_name,
        'RMSE': rmse,
        'R2': r2
    }
    
    print(f"✅ Best Parameters: {grid_search.best_params_}")
    print(f"📊 RMSE: {rmse:.4f}, R²: {r2:.4f}")

tuned_models = tuned_models.sort_values(by='RMSE', ascending=True).reset_index(drop=True)
tuned_models


🔍 Tuning Random Forest...
✅ Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
📊 RMSE: 17.3367, R²: -0.1049

🔍 Tuning XGBoost...
✅ Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 1.0}
📊 RMSE: 16.7132, R²: -0.0269

🔍 Tuning Gradient Boosting...
✅ Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
📊 RMSE: 16.7513, R²: -0.0316


Unnamed: 0,Model,RMSE,R2
0,XGBoost,16.713175,-0.026898
1,Gradient Boosting,16.751348,-0.031595
2,Random Forest,17.336696,-0.104949


In [7]:
#From the above tuning, I have got the best model and it comes out to be XGBoost which is a very robust model and have been widely used in various regression tasks.
best_model_name = tuned_models.iloc[0]['Model']
best_model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.01, min_child_weight=5, subsample=1)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
RMSE = root_mean_squared_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)
print(f"Best Model Performance:\nRMSE: {RMSE:.4f}\nR²: {R2:.4f}")

Best Model Performance:
RMSE: 16.7132
R²: -0.0269


In [8]:
import joblib

#Saving the best model

joblib.dump(best_model, f'../models/{best_model_name.lower().replace(" ", "_")}_tuned_model.pkl')

print(f"Best model saved to ../models/{best_model_name.lower().replace(' ', '_')}_tuned_model.pkl")


Best model saved to ../models/xgboost_tuned_model.pkl


In [9]:
importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
1,Population_Density,0.118811
47,Waste_Type_Organic,0.087555
8,Distance_to_Landfill_km,0.079401
51,Disposal_Method_Landfill,0.07113
5,Landfill_Capacity,0.069153
22,City_Indore,0.068841
6,Landfill_Lat,0.068798
49,Disposal_Method_Composting,0.068464
3,Cost_of_Waste_Management,0.063611
0,Waste_Generated,0.057571
