# Bayesian Optimization: Dormitory

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
%matplotlib inline

In [None]:
# Read the optimisation results from the local data folder and drop the
# "Unnamed: 0" column (presumably a saved row index -- verify against the CSV).
data_directory = r"Local Directory"
results_path = os.path.join(data_directory, "optimization_results.csv")
mydf = pd.read_csv(results_path).drop(columns="Unnamed: 0")
mydf.head()

# Baseline Combo

In [None]:
# Pull out the row(s) of the scenario used as the baseline.
baseline_id = "S1563"
is_baseline = mydf["Scenario"] == baseline_id
baseline_combo = mydf[is_baseline]
baseline_combo

# Choosing the best parameter combination

# 1) Deterministic Calibration

Choose the scenario with the lowest error (smallest TOE_CV-RMSE).

In [None]:
# Ten scenarios with the smallest TOE_CV-RMSE, best first.
ranked_by_error = mydf.sort_values(by="TOE_CV-RMSE")
ranked_by_error.head(10)

# 2) Probabilistic Calibration: Naive Bayes Classifier

Label each scenario 1 if its TOE_CV-RMSE is below the decision boundary, and 0 otherwise.

In [None]:
# Work on a copy so the raw results in `mydf` stay untouched.
data = mydf.copy()

# Decision boundary on TOE_CV-RMSE: a scenario counts as class 1 only when its
# error is strictly below this threshold.
decision_boundary = 0.155

# Vectorised labelling. This replaces a row-by-row `data.loc[i, ...]` loop that
# only worked with a default 0..n-1 index and was slow on large frames.
# A NaN error compares as False and therefore becomes 0, exactly as it did in
# the loop's `else` branch. Labels are stored as integers (0 / 1).
data["Label"] = (data["TOE_CV-RMSE"] < decision_boundary).astype(int)

data["Label"].value_counts()

In [None]:
# Columns carried forward: scenario id, the five input parameters,
# the two error metrics and the class label.
necessary_columns = [
    'Scenario',
    'dhw_flow_rate',
    'equipment_density',
    'heating_set_point',
    'infiltration',
    'lighting_density',
    'TOE_CV-RMSE',
    'TOE_MAPE',
    'Label',
]

new_df = data.loc[:, necessary_columns]
new_df.head()

## 2.1) Choosing from available scenarios

In [None]:
# Features are the input parameters only; the identifier, both error metrics
# and the label itself are excluded. The target is the binary label.
X = new_df.drop(columns=["Scenario", "TOE_CV-RMSE", "TOE_MAPE", "Label"])
y = new_df["Label"].values

# Standardise the features before fitting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

bayes = GaussianNB()
bayes.fit(X_scaled, y)
# NOTE: predictions are made on the same data the model was fitted on, so the
# report below describes in-sample fit, not generalisation performance.
preds = bayes.predict(X_scaled)

# Count predicted 0s and 1s. `minlength=2` guarantees both entries exist even
# when the model predicts a single class (otherwise `counts[1]` raises
# IndexError when no scenario is predicted as class 1).
counts = np.bincount(preds.astype(int), minlength=2)
print("Counts of 0:", counts[0])
print("Counts of 1:", counts[1])

print("\n", classification_report(y, preds))

In [None]:
# Confusion matrix of the in-sample predictions, with class 1 listed first.
labels = [1, 0]
cm = confusion_matrix(y, preds, labels=labels)

fig, ax = plt.subplots(figsize=(6, 6), dpi=150)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, xticks_rotation='horizontal', cmap='viridis')

# Enlarge the tick labels and axis titles for readability.
ax.tick_params(axis='both', which='major', labelsize=12)
ax.set_xlabel('Predicted Label', fontsize=14)
ax.set_ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# Scenario probabilities: attach the predicted class probabilities to every
# scenario and rank the scenarios by their probability of belonging to class 1.
proba_df = pd.DataFrame(
    bayes.predict_proba(X_scaled),
    columns=['Probability_Class_0', 'Probability_Class_1'],
)
scenarios = new_df.reset_index(drop=True)
result_df = pd.concat([scenarios, proba_df], axis=1)
sorted_df = result_df.sort_values(by="Probability_Class_1", ascending=False)
sorted_df.head(10)

## 2.2) Parametric estimation: mean and variance

In [None]:
# Per-class Gaussian parameters of the fitted Naive Bayes model.
#
# `mean_list` and `var_list` were used here without being defined anywhere in
# this notebook, so a fresh run failed with NameError. They are rebuilt from
# the fitted model. Row 0 corresponds to Label==0 and row 1 to Label==1
# (scikit-learn stores classes in sorted order).
# Because `bayes` was trained on standardised features, the parameters are
# mapped back to the original units of each input parameter.
# NOTE(review): confirm that original units (not scaled units) were intended.
# `var_` requires scikit-learn >= 1.0 (older releases call it `sigma_`).
mean_list = scaler.inverse_transform(bayes.theta_)
var_list = bayes.var_ * scaler.scale_ ** 2
param_list = X.columns


def _print_expected_values(means, variances):
    """Print the expected value of each parameter's fitted normal distribution.

    Args:
        means: per-parameter means for one class, ordered like `param_list`.
        variances: per-parameter variances for the same class.
    """
    for name, mean, variance in zip(param_list, means, variances):
        # Normal distribution described by the class-conditional parameters.
        dist = norm(loc=mean, scale=np.sqrt(variance))
        expected_value = dist.mean()

        # dhw_flow_rate is printed at full precision, presumably because its
        # magnitude is too small for three decimals -- verify.
        if name == "dhw_flow_rate":
            print(f"Expected value (mean) for Parameter {name}: {expected_value}")
        else:
            print(f"Expected value (mean) for Parameter {name}: {expected_value:.3f}")


# Expected values for each parameter: Label==0
print("LABEL==0 >>>")
_print_expected_values(mean_list[0], var_list[0])

# Expected values for each parameter: Label==1
print("\nLABEL==1 >>>")
_print_expected_values(mean_list[1], var_list[1])

# Discussion: Imbalanced Data

In [None]:
# Class balance: how many scenarios carry each label.
label_counts = new_df["Label"].value_counts()
label_counts

In [None]:
# Summary statistics of TOE_CV-RMSE at the listed percentiles.
percentile_list = [0.05, 0.10, 0.15, 0.25, 0.50, 0.75, 0.95]
cv_rmse = new_df["TOE_CV-RMSE"]
cv_rmse.describe(percentiles=percentile_list)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

cv_rmse_values = new_df["TOE_CV-RMSE"]

# Kernel density estimate (PDF) of the "TOE_CV-RMSE" column.
plt.figure(figsize=(8, 4), dpi=150)
sns.kdeplot(cv_rmse_values, fill=True)

# Red dashed line at the decision boundary, annotated with the percentile of
# the observed errors that this value corresponds to.
# NOTE(review): the labelling cell uses a boundary of 0.155, this plot uses
# 0.15 -- confirm which value is intended.
decision_boundary = 0.15
percentile_x = stats.percentileofscore(cv_rmse_values, decision_boundary)
boundary_label = f'{percentile_x:.0f}th Percentile: {decision_boundary}'
plt.axvline(decision_boundary, color='red', linestyle='--', label=boundary_label)

plt.xlabel("TOE CV-RMSE")
plt.ylabel("Density")
plt.legend()
plt.grid(False)
plt.show()

# END