In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.0-py3-none-any.whl.metadata (9.9 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.0/158.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB

In [2]:
import numpy as np
import pandas as pd
import os
from flaml import AutoML
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load and Preprocess the Data

# Load the datasets
energy_usage = pd.read_csv("/kaggle/input/public-sector-network/energy_usage.csv")
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")

# Merge the datasets on Node_ID (pd.merge defaults to an inner join)
data = pd.merge(energy_usage, nodes, on="Node_ID")

# Save the merged dataset to '/kaggle/working/' before any feature engineering is applied
merged_data_path = '/kaggle/working/energy_usage_merged_data.csv'
data.to_csv(merged_data_path, index=False)
print(f"Merged dataset saved to {merged_data_path}")

# Feature engineering
# Extract month and day-of-month from the 'Date' column
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Keep only the model features and the two regression targets
data = data[['Node_ID', 'Month', 'Day', 'Peak_Usage_Time', 'Energy_Source',
             'Region', 'Existing_Infrastructure', 'Type', 'Population_Served',
             'Energy_Usage_kWh', 'Carbon_Emissions_kg_CO2']]

# One-hot encode categorical variables; drop_first=True removes the first level of each
# column, so that level is represented by all of its dummy columns being 0
data = pd.get_dummies(data, columns=['Energy_Source', 'Region', 'Existing_Infrastructure', 'Type'], drop_first=True)

# Split into features (X) and the two targets
X = data.drop(columns=['Energy_Usage_kWh', 'Carbon_Emissions_kg_CO2'])
y_energy = data['Energy_Usage_kWh']
y_carbon = data['Carbon_Emissions_kg_CO2']

# Split into train and test sets.
# A single call partitions X and both targets with the same row assignment, so the
# feature matrices can never drift out of alignment with either target.
(X_train, X_test,
 y_train_energy, y_test_energy,
 y_train_carbon, y_test_carbon) = train_test_split(
    X, y_energy, y_carbon, test_size=0.2, random_state=42
)

# Step 2: Train Regression Models Using FLAML

def _regression_settings(log_file_name):
    """Return the FLAML fit settings shared by both models; only the log file differs."""
    return {
        "time_budget": 600,  # 10 minutes
        "metric": "rmse",  # Use RMSE for regression
        "task": "regression",
        "log_file_name": log_file_name,
        "seed": 42,
    }

# Energy Usage regressor
automl_energy = AutoML()
settings = _regression_settings("energy_usage_model.log")
automl_energy.fit(X_train=X_train, y_train=y_train_energy, **settings)

# Carbon Emissions regressor (same search settings, separate log file)
automl_carbon = AutoML()
settings = _regression_settings("carbon_emissions_model.log")
automl_carbon.fit(X_train=X_train, y_train=y_train_carbon, **settings)

# Step 3: Evaluate the Models

def _rmse_and_r2(y_true, y_pred):
    """Return the pair (RMSE, R²) for the given true and predicted values."""
    return np.sqrt(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)

# Score the Energy Usage model on the held-out test rows
y_pred_energy = automl_energy.predict(X_test)
rmse_energy, r2_energy = _rmse_and_r2(y_test_energy, y_pred_energy)

print(f"Energy Usage Model - RMSE: {rmse_energy}, R²: {r2_energy}")

# Score the Carbon Emissions model on the same test rows
y_pred_carbon = automl_carbon.predict(X_test)
rmse_carbon, r2_carbon = _rmse_and_r2(y_test_carbon, y_pred_carbon)

print(f"Carbon Emissions Model - RMSE: {rmse_carbon}, R²: {r2_carbon}")

# Step 4: Save the Models

# Persist each fitted AutoML object as a pickle under /kaggle/working/
for fitted_model, pickle_path in (
    (automl_energy, '/kaggle/working/energy_usage_model.pkl'),
    (automl_carbon, '/kaggle/working/carbon_emissions_model.pkl'),
):
    joblib.dump(fitted_model, pickle_path)

# Step 5: Hardcoded User Input

def get_hardcoded_input():
    """
    Return the fixed example request used to demonstrate the prediction function.

    Returns a 3-tuple: the feature dictionary, the start date string and the
    end date string (both formatted YYYY-MM-DD).
    """
    example_features = dict(
        Node_ID='Node_123',
        Population_Served=5000,
        Region='Russia',
        Energy_Source='Solar',
        Peak_Usage_Time='Afternoon',
        Type='School',
        Existing_Infrastructure='Yes',
    )
    # (start, end) of the example prediction window
    example_window = ('2025-10-01', '2025-10-07')
    return (example_features, *example_window)

# Step 6: Integrate with the Prediction Function

def _build_feature_frame(input_data, dates, feature_columns):
    """Build one model-ready row per date, with columns ordered like the training features."""
    feature_columns = list(feature_columns)
    known_columns = set(feature_columns)

    # Every feature starts at 0 so one-hot columns that are not selected below stay 0.
    base_row = dict.fromkeys(feature_columns, 0)
    for key, value in input_data.items():
        if key in known_columns:
            # Feature that was passed to the model as-is (e.g. Node_ID, Population_Served).
            base_row[key] = value
            continue
        # Categorical feature that was one-hot encoded for training: switch on the
        # matching "<column>_<value>" dummy column when the training data has one.
        dummy_column = f"{key}_{value}"
        if dummy_column in known_columns:
            base_row[dummy_column] = 1
        # No matching dummy column: the value is either the level removed by
        # drop_first=True or was not present in training; both map to all-zero dummies.

    rows = [{**base_row, 'Month': day.month, 'Day': day.day} for day in dates]
    # `columns=` keeps only the training features, in training order.
    return pd.DataFrame(rows, columns=feature_columns)


def predict_energy_and_carbon(input_data, start_date, end_date,
                              feature_columns=None, energy_model=None, carbon_model=None):
    """
    Predict energy usage and carbon emissions for every day in a date range.

    input_data: A dictionary containing the input features. Categorical features are
        given as raw values (e.g. 'Region': 'Russia') and are translated here into the
        one-hot columns used for training.
    start_date: Start date for prediction (YYYY-MM-DD).
    end_date: End date for prediction (YYYY-MM-DD), inclusive.
    feature_columns: Training feature names in training order. Defaults to the
        module-level ``X.columns``.
    energy_model / carbon_model: Objects exposing ``predict(DataFrame)``. When omitted,
        the models saved under /kaggle/working/ are loaded with joblib.

    Returns a DataFrame with the columns 'Date', 'Energy_Usage_kWh' and
    'Carbon_Emissions_kg_CO2', one row per day. An empty date range yields an empty
    DataFrame with those columns.
    """
    if feature_columns is None:
        feature_columns = X.columns
    if energy_model is None:
        energy_model = joblib.load('/kaggle/working/energy_usage_model.pkl')
    if carbon_model is None:
        carbon_model = joblib.load('/kaggle/working/carbon_emissions_model.pkl')

    # Generate a date range from start_date to end_date
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')

    result_columns = ['Date', 'Energy_Usage_kWh', 'Carbon_Emissions_kg_CO2']
    if len(date_range) == 0:
        # Nothing to predict (end_date precedes start_date); skip calling the models.
        return pd.DataFrame(columns=result_columns)

    # All days share the same request, so build every row up front and predict once per model.
    features = _build_feature_frame(input_data, date_range, feature_columns)
    energy_pred = energy_model.predict(features)
    carbon_pred = carbon_model.predict(features)

    predictions_df = pd.DataFrame({
        'Date': list(date_range.strftime('%Y-%m-%d')),
        'Energy_Usage_kWh': list(energy_pred),
        'Carbon_Emissions_kg_CO2': list(carbon_pred),
    })

    return predictions_df

# Step 7: Run the Prediction with Hardcoded Input

if __name__ == "__main__":
    # Fetch the canned example request together with its prediction window
    user_input, start_date, end_date = get_hardcoded_input()

    # Run both models over every day in the window
    predictions_df = predict_energy_and_carbon(
        input_data=user_input,
        start_date=start_date,
        end_date=end_date,
    )

    # Print a header line followed by the prediction table
    print("\nPrediction Results:", predictions_df, sep="\n")

Merged dataset saved to /kaggle/working/energy_usage_merged_data.csv
[flaml.automl.logger: 01-25 00:35:10] {1728} INFO - task = regression
[flaml.automl.logger: 01-25 00:35:10] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-25 00:35:10] {1838} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 01-25 00:35:10] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-25 00:35:10] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 00:35:10] {2393} INFO - Estimated sufficient time budget=4057s. Estimated necessary time budget=35s.
[flaml.automl.logger: 01-25 00:35:10] {2442} INFO -  at 1.0s,	estimator lgbm's best error=865.9075,	best estimator lgbm's best error=865.9075
[flaml.automl.logger: 01-25 00:35:10] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 00:35:11] {2442} INFO -  at 1.4s,	estimator lgbm's best error=865.4801