# Predicting the heating and cooling loads of a residential structure, as well as the inherent efficiency of the structure based on the input data. 

### How to use the model
Run the code below. When prompted, input the following data feature values: 
* Relative Compactness
* Surface Area
* Wall Area
* Roof Area
* Overall Height
* Orientation
* Glazing Area
* Glazing Area Distribution

The notebook uses a Random Forest regressor to predict the heating load and cooling load of the structure, and Logistic Regression to classify whether the structure is efficient or inefficient, that is, whether its load falls at or below (efficient) or above (inefficient) the mean load of the dataset the models were built from. In the interactive cell below, the efficiency label is obtained by comparing the Random Forest prediction with that mean load.

In [7]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    mean_squared_error, r2_score, classification_report, accuracy_score,
    precision_score, recall_score, f1_score,
)
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [8]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read the 'Heating_Load' and 'Cooling_Load' columns as targets and
# treat every remaining column as an input feature.
df = pd.read_csv('data/energy_efficiency_data.csv')

In [9]:
# Regression targets and their dataset-wide means; the means double as the
# efficiency thresholds used further down.
y_heating, y_cooling = df['Heating_Load'], df['Cooling_Load']
heating_mean, cooling_mean = y_heating.mean(), y_cooling.mean()

# Binary labels: 1 when the load is above its mean, 0 otherwise.
df['Heating_Load_Class'] = (y_heating > heating_mean).astype(int)
df['Cooling_Load_Class'] = (y_cooling > cooling_mean).astype(int)
y_heating_class = df['Heating_Load_Class']
y_cooling_class = df['Cooling_Load_Class']

# Feature matrix: everything except the two targets and the two derived labels.
X = df.drop(
    ['Heating_Load', 'Cooling_Load', 'Heating_Load_Class', 'Cooling_Load_Class'],
    axis='columns',
)

# Standardise the features using statistics computed over every row of X.
scaler = StandardScaler().fit(X)
X_normalized = scaler.transform(X)

In [10]:
# 80/20 train/test split for the heating-load labels.
(X_train_heating, X_test_heating,
 y_train_heating_class, y_test_heating_class) = train_test_split(
    X, y_heating_class, random_state=42, test_size=0.2
)

# 80/20 train/test split for the cooling-load labels (same X, size and seed).
(X_train_cooling, X_test_cooling,
 y_train_cooling_class, y_test_cooling_class) = train_test_split(
    X, y_cooling_class, random_state=42, test_size=0.2
)

# One scaler object is fitted on the heating training rows, then refitted on the
# cooling training rows; after this cell `scaler` holds the cooling-split fit.
scaler = StandardScaler()
X_train_heating_scaled = scaler.fit(X_train_heating).transform(X_train_heating)
X_test_heating_scaled = scaler.transform(X_test_heating)
X_train_cooling_scaled = scaler.fit(X_train_cooling).transform(X_train_cooling)
X_test_cooling_scaled = scaler.transform(X_test_cooling)


# Training a Logistic Regression model to predict if a structure is efficient or inefficient

In [11]:

# Fit one logistic-regression classifier per target on the scaled training rows.
log_reg_heating = LogisticRegression(random_state=42).fit(
    X_train_heating_scaled, y_train_heating_class
)
log_reg_cooling = LogisticRegression(random_state=42).fit(
    X_train_cooling_scaled, y_train_cooling_class
)

# Hold-out predictions for each target.
y_pred_heating_log_reg = log_reg_heating.predict(X_test_heating_scaled)
y_pred_cooling_log_reg = log_reg_cooling.predict(X_test_cooling_scaled)

# 5-fold cross-validation on the training rows (default scorer of the estimator).
cv_scores_heating_log_reg = cross_val_score(
    log_reg_heating, X_train_heating_scaled, y_train_heating_class, cv=5
)
cv_scores_cooling_log_reg = cross_val_score(
    log_reg_cooling, X_train_cooling_scaled, y_train_cooling_class, cv=5
)

# Try the model on a new data entry!

Run the following block of code to do manual feature entry and predictions on a new structure. 

In [15]:

# Build the train/test split inside this cell so it runs on a fresh kernel:
# the regression training arrays were previously expected to exist already but
# are not defined anywhere earlier in the notebook.
# A single call keeps the rows of every target aligned with the same features.
(X_train_reg, X_test_reg,
 y_train_heating_reg, y_test_heating_reg,
 y_train_cooling_reg, y_test_cooling_reg,
 y_train_heating_class, y_test_heating_class,
 y_train_cooling_class, y_test_cooling_class) = train_test_split(
    X, y_heating, y_cooling,
    df['Heating_Load_Class'], df['Cooling_Load_Class'],
    test_size=0.2, random_state=42,
)

# Fit the scaler on the training rows only (no test-row statistics leak in).
# predict_efficiency_and_loads() applies this same `scaler` to new inputs, so
# every model below must be trained on data scaled by it.
scaler = StandardScaler()
X_train_reg_scaled = scaler.fit_transform(X_train_reg)
X_test_reg_scaled = scaler.transform(X_test_reg)

# Train Random Forest Regressors for numeric predictions
rf_heating = RandomForestRegressor(n_estimators=100, random_state=42)
rf_heating.fit(X_train_reg_scaled, y_train_heating_reg)

rf_cooling = RandomForestRegressor(n_estimators=100, random_state=42)
rf_cooling.fit(X_train_reg_scaled, y_train_cooling_reg)

# The classifiers share the regression split and scaling, so their inputs match
# what `scaler` produces at prediction time.
X_train_class, X_test_class = X_train_reg_scaled, X_test_reg_scaled

# Train Logistic Regression models for classification
log_reg_heating = LogisticRegression(random_state=42)
log_reg_heating.fit(X_train_class, y_train_heating_class)

log_reg_cooling = LogisticRegression(random_state=42)
log_reg_cooling.fit(X_train_class, y_train_cooling_class)

def get_user_input():
    """Prompt for the eight building features and return them as a list.

    The values are read from standard input in a fixed order. Orientation and
    Glazing Area Distribution are parsed as ``int``; every other field is
    parsed as ``float``.

    Returns:
        list: the eight parsed values, in the order they were prompted for.

    Raises:
        ValueError: if an entry cannot be parsed as the expected number type.
    """
    print("Please enter the following data fields:")

    # (prompt text, converter) pairs, in the order the values are returned.
    fields = (
        ("Relative Compactness: ", float),
        ("Surface Area: ", float),
        ("Wall Area: ", float),
        ("Roof Area: ", float),
        ("Overall Height: ", float),
        ("Orientation (integer value): ", int),
        ("Glazing Area: ", float),
        ("Glazing Area Distribution (integer value): ", int),
    )
    return [convert(input(prompt)) for prompt, convert in fields]

def predict_efficiency_and_loads(new_data):
    """Predict heating/cooling loads for one structure and label its efficiency.

    The input is reshaped to a single row, scaled with the module-level
    ``scaler`` and passed to the ``rf_heating`` and ``rf_cooling`` regressors.
    Each predicted load is then compared with ``heating_mean`` or
    ``cooling_mean``: at or below the mean is 'Efficient', above it is
    'Inefficient'.

    Args:
        new_data: flat sequence of feature values for a single structure.

    Returns:
        dict: 'Predicted Heating Load', 'Predicted Cooling Load',
        'Heating Efficiency' and 'Cooling Efficiency', in that order.
    """
    def _label(load, threshold):
        # Loads equal to the threshold count as efficient.
        if load <= threshold:
            return 'Efficient'
        return 'Inefficient'

    # The scaler and the regressors expect a 2-D array with one row per structure.
    row = np.array(new_data).reshape(1, -1)
    row_scaled = scaler.transform(row)

    heating_load = rf_heating.predict(row_scaled)[0]
    cooling_load = rf_cooling.predict(row_scaled)[0]

    return {
        'Predicted Heating Load': heating_load,
        'Predicted Cooling Load': cooling_load,
        'Heating Efficiency': _label(heating_load, heating_mean),
        'Cooling Efficiency': _label(cooling_load, cooling_mean),
    }

# Collect the feature values interactively, then run them through the models.
new_data_entry = get_user_input()
prediction_results = predict_efficiency_and_loads(new_data_entry)

# Show each result on its own line as "<name>: <value>".
print("Prediction Results:")
for key in prediction_results:
    value = prediction_results[key]
    print(f"{key}: {value}")


Please enter the following data fields:
Relative Compactness: .76
Surface Area: 661.5
Wall Area: 416.5
Roof Area: 122.5
Overall Height: 7
Orientation (integer value): 4
Glazing Area: .1
Glazing Area Distribution (integer value): 4
Prediction Results:
Predicted Heating Load: 32.925100000000015
Predicted Cooling Load: 34.07199999999998
Heating Efficiency: Inefficient
Cooling Efficiency: Inefficient


## Cross validation scores on these models

In [16]:
# 5-fold cross-validated R2 of the heating-load random forest on the training rows.
cv_scores_heating_rf = cross_val_score(
    rf_heating, X_train_reg_scaled, y_train_heating_reg, scoring='r2', cv=5
)
print("Cross-validated R2 scores for Heating Load Prediction (Random Forest):", cv_scores_heating_rf)
print("Mean cross-validated R2 score for Heating Load Prediction (Random Forest):", np.mean(cv_scores_heating_rf))

# Same procedure for the cooling-load random forest.
cv_scores_cooling_rf = cross_val_score(
    rf_cooling, X_train_reg_scaled, y_train_cooling_reg, scoring='r2', cv=5
)
print("Cross-validated R2 scores for Cooling Load Prediction (Random Forest):", cv_scores_cooling_rf)
print("Mean cross-validated R2 score for Cooling Load Prediction (Random Forest):", np.mean(cv_scores_cooling_rf))

Cross-validated R2 scores for Heating Load Prediction (Random Forest): [0.99773455 0.99711065 0.99736547 0.99780473 0.99658358]
Mean cross-validated R2 score for Heating Load Prediction (Random Forest): 0.9973197960731153
Cross-validated R2 scores for Cooling Load Prediction (Random Forest): [0.97176734 0.96922604 0.97091524 0.95813164 0.96808994]
Mean cross-validated R2 score for Cooling Load Prediction (Random Forest): 0.9676260412022117


In [17]:

# 5-fold cross-validated accuracy of the heating-load classifier on the training rows.
cv_scores_heating_log_reg = cross_val_score(
    log_reg_heating, X_train_class, y_train_heating_class, scoring='accuracy', cv=5
)
print("Cross-validated accuracy for Heating Load Classification (Logistic Regression):", cv_scores_heating_log_reg)
print("Mean cross-validated accuracy for Heating Load Classification (Logistic Regression):", np.mean(cv_scores_heating_log_reg))

# Same procedure for the cooling-load classifier.
cv_scores_cooling_log_reg = cross_val_score(
    log_reg_cooling, X_train_class, y_train_cooling_class, scoring='accuracy', cv=5
)
print("Cross-validated accuracy for Cooling Load Classification (Logistic Regression):", cv_scores_cooling_log_reg)
print("Mean cross-validated accuracy for Cooling Load Classification (Logistic Regression):", np.mean(cv_scores_cooling_log_reg))


Cross-validated accuracy for Heating Load Classification (Logistic Regression): [0.98373984 0.99186992 0.95934959 0.99186992 0.99180328]
Mean cross-validated accuracy for Heating Load Classification (Logistic Regression): 0.9837265093962415
Cross-validated accuracy for Cooling Load Classification (Logistic Regression): [1.         0.98373984 0.98373984 0.99186992 0.97540984]
Mean cross-validated accuracy for Cooling Load Classification (Logistic Regression): 0.9869518859123018


In [18]:
def report_classification(target_name, y_true, y_pred):
    """Print the classification report and headline metrics for one target.

    Args:
        target_name: label used in the printed headings, e.g. "Heating Load".
        y_true: true binary class labels of the test rows.
        y_pred: class labels predicted by the logistic-regression model.
    """
    print(f"{target_name} Classification Report (Logistic Regression):")
    print(classification_report(y_true, y_pred))
    # Metrics are printed in a fixed order so the heating and cooling
    # sections read the same way.
    for metric_name, metric in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    ):
        print(f"{target_name} {metric_name} (Logistic Regression):", metric(y_true, y_pred))


# Predict and evaluate for heating load classification
y_pred_heating_log_reg = log_reg_heating.predict(X_test_class)
report_classification("Heating Load", y_test_heating_class, y_pred_heating_log_reg)

# Predict and evaluate for cooling load classification
y_pred_cooling_log_reg = log_reg_cooling.predict(X_test_class)
report_classification("Cooling Load", y_test_cooling_class, y_pred_cooling_log_reg)


Heating Load Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        75
           1       0.98      1.00      0.99        79

    accuracy                           0.99       154
   macro avg       0.99      0.99      0.99       154
weighted avg       0.99      0.99      0.99       154

Heating Load Accuracy (Logistic Regression): 0.987012987012987
Heating Load Precision (Logistic Regression): 0.9753086419753086
Heating Load Recall (Logistic Regression): 1.0
Heating Load F1-Score (Logistic Regression): 0.9875
Cooling Load Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        74
           1       0.98      1.00      0.99        80

    accuracy                           0.99       154
   macro avg       0.99      0.99      0.99       154
weighted avg       0.99      0.99      0.99       154

Coo