In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[

In [2]:
import numpy as np
import pandas as pd
import os
from flaml import AutoML
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load and Merge All Datasets

# Load all datasets
energy_usage = pd.read_csv("/kaggle/input/public-sector-network/energy_usage.csv")
environment = pd.read_csv("/kaggle/input/public-sector-network/environment.csv")
maintenance = pd.read_csv("/kaggle/input/public-sector-network/maintenance.csv")
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")
procurement = pd.read_csv("/kaggle/input/public-sector-network/procurement.csv")
recommendations = pd.read_csv("/kaggle/input/public-sector-network/recommendations.csv")
traffic = pd.read_csv("/kaggle/input/public-sector-network/traffic.csv")

# Merge datasets on Node_ID
data = pd.merge(energy_usage, nodes, on="Node_ID", how="left")
data = pd.merge(data, environment, left_on="Region", right_on="Region_Name", how="left")
data = pd.merge(data, maintenance, on="Node_ID", how="left")
data = pd.merge(data, procurement, on="Node_ID", how="left")
data = pd.merge(data, recommendations, on="Node_ID", how="left")
data = pd.merge(data, traffic, on=["Node_ID", "Date"], how="left")

# Fill missing values with 0
data = data.fillna(0)

# Save the merged dataset to '/kaggle/working/'
merged_data_path = '/kaggle/working/sustainability_merged_data.csv'
data.to_csv(merged_data_path, index=False)
print(f"Merged dataset saved to {merged_data_path}")

# Step 2: Feature Engineering

# Extract month, day, and hour from the 'Date' column
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Drop irrelevant columns
data = data.drop(columns=['Region_Name', 'Region_ID', 'Log_ID', 'Deployment_ID', 'Recommendation_ID', 'Date'])

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['Energy_Source', 'Region', 'Existing_Infrastructure', 'Type', 
                                     'Issue_Type', 'Equipment_Used', 'Vendor_Details', 'Action_Type', 
                                     'Peak_Usage_Time'], drop_first=True)

# Step 3: Define Impact Scores

# Define impact scores based on energy usage, carbon emissions, and infrastructure metrics
data['Energy_Impact_Score'] = data['Energy_Usage_kWh'] / data['Population_Served']
data['Emissions_Impact_Score'] = data['Carbon_Emissions_kg_CO2'] / data['Population_Served']
data['Infrastructure_Impact_Score'] = data['Resolution_Time_Hours'] * data['Downtime_Events']

# Step 4: Split into Features (X) and Targets (y)

# Features
X = data.drop(columns=['Energy_Usage_kWh', 'Carbon_Emissions_kg_CO2', 'Energy_Impact_Score', 
                       'Emissions_Impact_Score', 'Infrastructure_Impact_Score'])

# Targets
y_energy = data['Energy_Impact_Score']
y_emissions = data['Emissions_Impact_Score']
y_infrastructure = data['Infrastructure_Impact_Score']

# Split into train and test sets
X_train, X_test, y_train_energy, y_test_energy = train_test_split(X, y_energy, test_size=0.2, random_state=42)
X_train, X_test, y_train_emissions, y_test_emissions = train_test_split(X, y_emissions, test_size=0.2, random_state=42)
X_train, X_test, y_train_infrastructure, y_test_infrastructure = train_test_split(X, y_infrastructure, test_size=0.2, random_state=42)

# Step 5: Train Regression Models Using FLAML

# Train a model for Energy Impact Score
automl_energy = AutoML()
settings = {
    "time_budget": 3600,  # 1 hour
    "metric": "rmse",  # Use RMSE for regression
    "task": "regression",
    "log_file_name": "energy_impact_model.log",
    "seed": 42,
}
automl_energy.fit(X_train=X_train, y_train=y_train_energy, **settings)

# Train a model for Emissions Impact Score
automl_emissions = AutoML()
settings = {
    "time_budget": 3600,  # 1 hour
    "metric": "rmse",  # Use RMSE for regression
    "task": "regression",
    "log_file_name": "emissions_impact_model.log",
    "seed": 42,
}
automl_emissions.fit(X_train=X_train, y_train=y_train_emissions, **settings)

# Train a model for Infrastructure Impact Score
automl_infrastructure = AutoML()
settings = {
    "time_budget": 3600,  # 1 hour
    "metric": "rmse",  # Use RMSE for regression
    "task": "regression",
    "log_file_name": "infrastructure_impact_model.log",
    "seed": 42,
}
automl_infrastructure.fit(X_train=X_train, y_train=y_train_infrastructure, **settings)

# Step 6: Evaluate Models on Test Set

# Evaluate Energy Impact Score Model
y_pred_energy = automl_energy.predict(X_test)
rmse_energy = np.sqrt(mean_squared_error(y_test_energy, y_pred_energy))
r2_energy = r2_score(y_test_energy, y_pred_energy)
print(f"Energy Impact Score Model - RMSE: {rmse_energy}, R²: {r2_energy}")

# Evaluate Emissions Impact Score Model
y_pred_emissions = automl_emissions.predict(X_test)
rmse_emissions = np.sqrt(mean_squared_error(y_test_emissions, y_pred_emissions))
r2_emissions = r2_score(y_test_emissions, y_pred_emissions)
print(f"Emissions Impact Score Model - RMSE: {rmse_emissions}, R²: {r2_emissions}")

# Evaluate Infrastructure Impact Score Model
y_pred_infrastructure = automl_infrastructure.predict(X_test)
rmse_infrastructure = np.sqrt(mean_squared_error(y_test_infrastructure, y_pred_infrastructure))
r2_infrastructure = r2_score(y_test_infrastructure, y_pred_infrastructure)
print(f"Infrastructure Impact Score Model - RMSE: {rmse_infrastructure}, R²: {r2_infrastructure}")

# Step 7: Save the Models

# Save the trained models
joblib.dump(automl_energy, '/kaggle/working/energy_impact_model.pkl')
joblib.dump(automl_emissions, '/kaggle/working/emissions_impact_model.pkl')
joblib.dump(automl_infrastructure, '/kaggle/working/infrastructure_impact_model.pkl')

# Step 8: Hardcoded User Input

def get_hardcoded_input():
    """
    Define hardcoded user input for prediction.
    """
    user_input = {
        'Node_ID': 'Node_123',  # Example Node ID
        'Population_Served': 1000,  # Example population served
        'Region': 'Russia',  # Example region
        'Energy_Source': 'Solar',  # Example energy source
        'Peak_Usage_Time': 'Afternoon',  # Example peak usage time
        'Type': 'Government Office',  # Example type of organization
        'Existing_Infrastructure': 'Yes',  # Example existing infrastructure
    }
    start_date = '2025-10-01'  # Example start date
    end_date = '2025-10-07'  # Example end date
    return user_input, start_date, end_date

# Step 9: Create a Function to Predict Impact Scores

def predict_impact_scores(input_data, start_date, end_date):
    """
    Predict impact scores based on hardcoded input for a date range.
    input_data: A dictionary containing the input features.
    start_date: Start date for prediction (YYYY-MM-DD).
    end_date: End date for prediction (YYYY-MM-DD).
    """
    # Load the saved models
    energy_model = joblib.load('/kaggle/working/energy_impact_model.pkl')
    emissions_model = joblib.load('/kaggle/working/emissions_impact_model.pkl')
    infrastructure_model = joblib.load('/kaggle/working/infrastructure_impact_model.pkl')
    
    # Generate a date range from start_date to end_date
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # Initialize lists to store predictions
    predictions = []
    
    # Iterate over each date in the range
    for date in date_range:
        # Extract month and day
        month = date.month
        day = date.day
        
        # Create a copy of the input data
        input_df = pd.DataFrame([input_data])
        
        # Add month and day
        input_df['Month'] = month
        input_df['Day'] = day
        
        # Ensure the input data has the same features as the training data
        for col in X.columns:
            if col not in input_df.columns:
                input_df[col] = 0
        
        # Reorder columns to match the training data
        input_df = input_df[X.columns]
        
        # Predict
        energy_score = energy_model.predict(input_df)
        emissions_score = emissions_model.predict(input_df)
        infrastructure_score = infrastructure_model.predict(input_df)
        
        # Append predictions to the list
        predictions.append({
            'Date': date.strftime('%Y-%m-%d'),
            'Energy_Impact_Score': energy_score[0],
            'Emissions_Impact_Score': emissions_score[0],
            'Infrastructure_Impact_Score': infrastructure_score[0]
        })
    
    # Convert predictions to a DataFrame
    predictions_df = pd.DataFrame(predictions)
    
    return predictions_df

# Step 10: Create a Function to Generate Reports

def generate_report(predictions_df):
    """
    Generate a detailed report based on the predictions.
    predictions_df: A DataFrame containing predictions for each date.
    """
    print("\n=== Sustainability Report ===")
    print(f"Date Range: {predictions_df['Date'].min()} to {predictions_df['Date'].max()}")
    
    # Summary Statistics
    print("\nSummary Statistics:")
    print(f"Average Energy Impact Score: {predictions_df['Energy_Impact_Score'].mean():.2f} kWh/person")
    print(f"Average Emissions Impact Score: {predictions_df['Emissions_Impact_Score'].mean():.2f} kg CO₂/person")
    print(f"Average Infrastructure Impact Score: {predictions_df['Infrastructure_Impact_Score'].mean():.2f}")
    
    # Detailed Predictions
    print("\nDetailed Predictions:")
    print(predictions_df)
    
    # Actionable Insights
    print("\nActionable Insights:")
    if predictions_df['Energy_Impact_Score'].mean() > 1.0:
        print("- **Energy**: Consider implementing energy-saving measures or switching to renewable energy sources.")
    else:
        print("- **Energy**: Energy usage is relatively efficient. Maintain current practices and monitor for changes.")
    
    if predictions_df['Emissions_Impact_Score'].mean() > 0.5:
        print("- **CO₂ Emissions**: Explore carbon offset programs or transition to low-emission energy sources.")
    else:
        print("- **CO₂ Emissions**: CO₂ emissions are relatively low. Continue monitoring and aim for further reductions.")
    
    if predictions_df['Infrastructure_Impact_Score'].mean() > 5.0:
        print("- **Infrastructure**: Prioritize infrastructure maintenance and consider upgrading critical systems.")
    else:
        print("- **Infrastructure**: Infrastructure is relatively stable. Continue regular maintenance and monitoring.")
    
    print("\n=== End of Report ===")

# Step 11: Run the Code with Hardcoded Input

if __name__ == "__main__":
    # Get hardcoded user input
    user_input, start_date, end_date = get_hardcoded_input()
    
    # Predict impact scores for the date range
    predictions_df = predict_impact_scores(user_input, start_date, end_date)
    
    # Generate and display the report
    generate_report(predictions_df)

Merged dataset saved to /kaggle/working/sustainability_merged_data.csv
[flaml.automl.logger: 01-25 12:46:19] {1728} INFO - task = regression
[flaml.automl.logger: 01-25 12:46:19] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 01-25 12:46:20] {1838} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 01-25 12:46:20] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-25 12:46:20] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 12:46:20] {2393} INFO - Estimated sufficient time budget=53684s. Estimated necessary time budget=464s.
[flaml.automl.logger: 01-25 12:46:20] {2442} INFO -  at 10.1s,	estimator lgbm's best error=0.6689,	best estimator lgbm's best error=0.6689
[flaml.automl.logger: 01-25 12:46:20] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 12:46:21] {2442} INFO -  at 10.2s,	estimator lgbm's best error=0

  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0



=== Sustainability Report ===
Date Range: 2025-10-01 to 2025-10-07

Summary Statistics:
Average Energy Impact Score: 0.93 kWh/person
Average Emissions Impact Score: 0.74 kg CO₂/person
Average Infrastructure Impact Score: -0.12

Detailed Predictions:
         Date  Energy_Impact_Score  Emissions_Impact_Score  \
0  2025-10-01             0.883705                0.940260   
1  2025-10-02             0.949093                0.856491   
2  2025-10-03             0.918185                0.761441   
3  2025-10-04             1.013122                0.330398   
4  2025-10-05             0.903369                0.592422   
5  2025-10-06             0.807482                0.886288   
6  2025-10-07             1.061466                0.841021   

   Infrastructure_Impact_Score  
0                    -0.113101  
1                    -0.121298  
2                    -0.121298  
3                    -0.121298  
4                    -0.120957  
5                    -0.116722  
6                    

  input_df[col] = 0
  input_df[col] = 0


In [3]:
import pandas as pd

# Load the dataset
sustainability_data = pd.read_csv("/kaggle/working/sustainability_merged_data.csv")

# Display the first few rows of the dataset
sustainability_data.head()

Unnamed: 0,Node_ID,Date,Energy_Usage_kWh,Peak_Usage_Time,Energy_Source,Carbon_Emissions_kg_CO2,Type,Region,Latitude,Longitude,...,Quantity,Recommendation_ID,Action_Type,Estimated_Savings_kWh,Estimated_Cost_Savings_USD,Implementation_Difficulty,Time,Data_Usage_GB,Peak_Usage_GB,Downtime_Events
0,Node_223,2024-01-01,1341.412353,Afternoon,Grid,347.773911,Health Center,Pacific Ocean,1.118835,-155.548343,...,1434,Rec_140,Upgrade Equipment,552.177043,5649.116745,High,0,0.0,0.0,0.0
1,Node_223,2024-01-01,1341.412353,Afternoon,Grid,347.773911,Health Center,Pacific Ocean,1.118835,-155.548343,...,1434,Rec_158,Optimize Routing,1139.442537,2717.843753,Low,0,0.0,0.0,0.0
2,Node_223,2024-01-01,1341.412353,Afternoon,Grid,347.773911,Health Center,Pacific Ocean,1.118835,-155.548343,...,1434,Rec_279,Implement Load Balancing,1465.278719,2177.269611,Low,0,0.0,0.0,0.0
3,Node_223,2024-01-01,1341.412353,Afternoon,Grid,347.773911,Health Center,Pacific Ocean,1.118835,-155.548343,...,1434,Rec_361,Optimize Routing,1456.952229,4836.9722,Low,0,0.0,0.0,0.0
4,Node_223,2024-01-01,1341.412353,Afternoon,Grid,347.773911,Health Center,Pacific Ocean,1.118835,-155.548343,...,1434,Rec_482,Optimize Routing,1246.889801,12374.544894,Low,0,0.0,0.0,0.0


In [4]:
# Display basic information about the dataset
sustainability_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447936 entries, 0 to 447935
Data columns (total 38 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Node_ID                     447936 non-null  object 
 1   Date                        447936 non-null  object 
 2   Energy_Usage_kWh            447936 non-null  float64
 3   Peak_Usage_Time             447936 non-null  object 
 4   Energy_Source               447936 non-null  object 
 5   Carbon_Emissions_kg_CO2     447936 non-null  float64
 6   Type                        447936 non-null  object 
 7   Region                      447936 non-null  object 
 8   Latitude                    447936 non-null  float64
 9   Longitude                   447936 non-null  float64
 10  Population_Served           447936 non-null  int64  
 11  Connectivity_Status         447936 non-null  object 
 12  Existing_Infrastructure     447936 non-null  object 
 13  Region_ID     