# Sensitivity Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.inspection import permutation_importance
import time
import os

In [2]:
# Folder that holds the input dataset; the importance tables are written back here later on.
# NOTE(review): this is a machine-specific absolute path - adjust it before running elsewhere.
file_path = r"C:\Users\sb013698\Desktop\github\Hybrid UBEM Tool\Synthetic Data Generation\Datasets"
dataset_csv = os.path.join(file_path, "sensitivity_analysis.csv")

# Load the scenario table and print its schema (column names, non-null counts, dtypes).
mydf = pd.read_csv(dataset_csv)
mydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2048 entries, 0 to 2047
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Scenario                   2048 non-null   object 
 1   ground_r_value             2048 non-null   float64
 2   infiltration               2048 non-null   float64
 3   heating_set_point          2048 non-null   float64
 4   cooling_set_point          2048 non-null   float64
 5   heating_cop                2048 non-null   float64
 6   cooling_cop                2048 non-null   float64
 7   equipment_density          2048 non-null   int64  
 8   lighting_density           2048 non-null   int64  
 9   occupant_density           2048 non-null   float64
 10  scheduled_ventilation_ach  2048 non-null   float64
 11  dhw_flow_rate              2048 non-null   float64
 12  Total Operational Energy   2048 non-null   float64
 13  Natural Gas                2048 non-null   float64
 14  Electricity                2048 non-null   float64

In [3]:
# Preview the first five scenarios to sanity-check the loaded columns and value ranges.
mydf.head(n=5)

Unnamed: 0,Scenario,ground_r_value,infiltration,heating_set_point,cooling_set_point,heating_cop,cooling_cop,equipment_density,lighting_density,occupant_density,scheduled_ventilation_ach,dhw_flow_rate,Total Operational Energy,Natural Gas,Electricity
0,S1,3.288,0.1,20.0,23.1,0.8,4.5,5,5,0.013,0.1,1.6e-05,17032086.19,3170703.0,13861383.29
1,S2,3.288,0.1,20.0,23.1,0.8,4.5,5,5,0.013,0.1,0.000709,31739539.62,17871750.0,13867792.52
2,S3,3.288,0.1,20.0,23.1,0.8,4.5,5,5,0.013,1.0,1.6e-05,17310429.65,3447513.0,13862916.64
3,S4,3.288,0.1,20.0,23.1,0.8,4.5,5,5,0.013,1.0,0.000709,32012303.98,18158940.0,13853367.72
4,S5,3.288,0.1,20.0,23.1,0.8,4.5,5,5,0.103,0.1,1.6e-05,18432652.29,4050000.0,14382651.98


# Linear Regression

In [4]:
# Columns that must not be used as predictors: the scenario label and the three energy outputs.
excluded_columns = ["Scenario", "Total Operational Energy", "Natural Gas", "Electricity"]

# The regression target is the total operational energy; every remaining column is a predictor.
target = mydf["Total Operational Energy"]
data = mydf.drop(columns=excluded_columns)

# Hold out 20% of the scenarios; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
    test_size=0.2,
)

# Standardization statistics are learned from the training rows only and then
# re-applied to the held-out rows, so no test information leaks into the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Fit an ordinary least-squares baseline on the standardized training features.
linear = LinearRegression().fit(X_train_scaled, y_train)
preds = linear.predict(X_test_scaled)

# Score the held-out predictions.
r2_value = r2_score(y_test, preds)
mse = mean_squared_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

# One metric per output line, each rounded to two decimals.
print(
    f"R2: {r2_value:.2f}",
    f"MSE: {mse:.2f}",
    f"MAPE: {mape:.2f}",
    sep="\n",
)

R2: 0.98
MSE: 2771870368623.01
MAPE: 0.03


In [6]:
# Pearson correlation of the numeric columns with the total operational energy;
# only the first 11 entries (positional slice) are shown.
mydf.corr(numeric_only=True).loc[:, "Total Operational Energy"].iloc[:11]

ground_r_value              -0.012966
infiltration                 0.207758
heating_set_point            0.544912
cooling_set_point           -0.046285
heating_cop                 -0.069028
cooling_cop                 -0.021929
equipment_density            0.369311
lighting_density             0.371184
occupant_density             0.074459
scheduled_ventilation_ach    0.016920
dhw_flow_rate                0.595097
Name: Total Operational Energy, dtype: float64

# Calculate Feature Importances

In [7]:
def calculate_feature_importances(X, y):
    """Score every feature in ``X`` against ``y`` with four importance measures.

    All rows are used for fitting: the aim is not to generalize, but to
    describe the relationship between features and target in this specific
    dataset. As a consequence the permutation importance is also evaluated on
    the same rows the forest was trained on.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one numeric column per feature.
    y : array-like of shape (n_samples,)
        Target values, aligned row-by-row (positionally) with ``X``.

    Returns
    -------
    relative_importances : list of float
        Absolute linear-regression coefficients (fitted on standardized
        features), normalized so that they sum to 1.
    rf_feature_importances : numpy.ndarray
        ``feature_importances_`` of a random forest fitted on the raw features.
    permutation_feature_importances : numpy.ndarray
        Mean permutation importance of that forest (scored with negative mean
        absolute error, 100 repeats).
    correlations : numpy.ndarray
        Absolute Pearson correlation between each column of ``X`` and ``y``.

    Every returned sequence follows the column order of ``X``.
    """
    y = np.asarray(y)

    # Standardize so that the magnitudes of the linear coefficients are comparable.
    X_scaled = StandardScaler().fit_transform(X)

    # 1) Linear Regression: share of each coefficient in the total absolute weight.
    linear = LinearRegression()
    linear.fit(X_scaled, y)
    abs_coefficients = np.abs(linear.coef_)
    relative_importances = list(abs_coefficients / abs_coefficients.sum())

    # 2) Random Forest Regressor (tree models do not need scaled inputs).
    forest = RandomForestRegressor(
        n_estimators=20,
        criterion="squared_error",
        max_depth=10,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=8,
    )
    forest.fit(X.values, y)
    rf_feature_importances = forest.feature_importances_

    # 3) Permutation Importance of the fitted forest.
    permutation_results = permutation_importance(
        forest,
        X.values,
        y,
        scoring="neg_mean_absolute_error",
        n_repeats=100,
        random_state=8,
    )
    permutation_feature_importances = permutation_results.importances_mean

    # 4) Pearson Correlation Coefficient, computed from the arguments themselves
    #    rather than from notebook-level globals, so the result always matches
    #    the X and y that were actually passed in.
    target_series = pd.Series(y, index=X.index)
    correlations = X.corrwith(target_series).abs().to_numpy()

    return relative_importances, rf_feature_importances, permutation_feature_importances, correlations

In [8]:
# Everything except the scenario label and the three energy outputs is a candidate feature.
excluded_columns = ["Scenario", "Total Operational Energy", "Natural Gas", "Electricity"]
data = mydf.drop(columns=excluded_columns)
col_list = data.columns

# One importance table is built per energy output.
target_list = ["Total Operational Energy", "Natural Gas", "Electricity"]
target_frames = {}

for target_name in target_list:
    # Pass a copy of the features and the raw target values for this output.
    lr_share, rf_share, perm_share, abs_corr = calculate_feature_importances(
        data.copy(),
        mydf[target_name].values,
    )

    # Collect the four measures side by side, one row per feature.
    target_frames[target_name] = pd.DataFrame(
        {
            "Feature Name": col_list,
            "Relative Importance (LR)": lr_share,
            "Feature Importance (RF)": rf_share,
            "Permutation Importance (RF)": perm_share,
            "Correlation": abs_corr,
        }
    )

# Unpack the tables into individually named frames, in the order of target_list.
total_operational_energy_df, natural_gas_df, electricity_df = (
    target_frames[name] for name in target_list
)

In [9]:
# Write each importance table to its own CSV file next to the input dataset.
for frame, csv_name in (
    (total_operational_energy_df, "toe_importance.csv"),
    (natural_gas_df, "ng_importance.csv"),
    (electricity_df, "electricity_importance.csv"),
):
    frame.to_csv(os.path.join(file_path, csv_name))

In [10]:
print("Feature importances based on Total Operational Energy")
# Rank the features from the largest to the smallest value in the "Correlation" column.
total_operational_energy_df.sort_values("Correlation", ascending=False)

Feature importances based on Total Operational Energy


Unnamed: 0,Feature Name,Relative Importance (LR),Feature Importance (RF),Permutation Importance (RF),Correlation
10,dhw_flow_rate,0.255423,0.355885,6993906.0,0.595097
2,heating_set_point,0.233883,0.297573,6395717.0,0.544912
7,lighting_density,0.159317,0.140162,4302153.0,0.371184
6,equipment_density,0.158513,0.137394,4287613.0,0.369311
1,infiltration,0.089172,0.04939,2358200.0,0.207758
8,occupant_density,0.031959,0.006432,791314.1,0.074459
4,heating_cop,0.029628,0.00779,744134.8,0.069028
3,cooling_set_point,0.019866,0.002448,455105.7,0.046285
5,cooling_cop,0.009412,0.001005,202158.5,0.021929
9,scheduled_ventilation_ach,0.007262,0.000873,145057.1,0.01692


In [11]:
# Parameters retained for further work: the five highest-ranked features for
# "Total Operational Energy", listed from most to least important.
selected_params = [
    "dhw_flow_rate",
    "heating_set_point",
    "lighting_density",
    "equipment_density",
    "infiltration",
]

print(selected_params)

['dhw_flow_rate', 'heating_set_point', 'lighting_density', 'equipment_density', 'infiltration']


# END