# Projeto: Análise e Validação Cruzada de Modelos com Dataset de Eficiência Energética ⚡

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random seed for reproducibility: seeds NumPy's legacy global random generator.
# This call needs `numpy` imported as `np`; without that import the cell fails
# with a NameError on a fresh kernel.
# NOTE(review): the estimators and splitters in the cells visible here pass their
# own hard-coded random_state, so SEED only affects code that draws from
# np.random directly.
SEED = 301
np.random.seed(SEED)


## Carregamento e Pré-processamento do Dataset de Eficiência Energética

### Dataset "Energy Efficiency"

O dataset "Energy Efficiency" foi coletado para prever as cargas de aquecimento e resfriamento dos edifícios a partir de diferentes características arquitetônicas e ambientais. Essas previsões são importantes para otimizar o consumo de energia e melhorar a eficiência energética dos edifícios.

### Colunas do Dataset:

- Relative Compactness: Uma medida de quão compacto é o edifício. Valores mais altos indicam edifícios mais compactos, que geralmente têm menos perda de calor.
- Surface Area: Área total da superfície externa do edifício, em metros quadrados.
- Wall Area: Área total das paredes externas do edifício, em metros quadrados.
- Roof Area: Área total do telhado do edifício, em metros quadrados.
- Overall Height: Altura total do edifício, em metros.
- Orientation: Orientação do edifício. Esta variável pode assumir valores de 2 a 5, representando diferentes orientações.
- Glazing Area: Área envidraçada (janelas) do edifício, expressa como fração da área de piso; no dataset os valores vão de 0 a 0,40 (ou seja, de 0% a 40%).
- Glazing Area Distribution: Distribuição da área envidraçada entre as diferentes fachadas do edifício, codificada como um número inteiro de 0 a 5 (0 nas amostras sem área envidraçada).
- Heating Load: Carga de aquecimento do edifício, em kWh/m².
- Cooling Load: Carga de resfriamento do edifício, em kWh/m².

In [37]:
# Energy Efficiency dataset: ENB2012 workbook hosted on the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"

# Descriptive labels, applied positionally to the columns of the sheet:
# the building features first, then the two load targets.
column_names = [
    'Relative Compactness',
    'Surface Area',
    'Wall Area',
    'Roof Area',
    'Overall Height',
    'Orientation',
    'Glazing Area',
    'Glazing Area Distribution',
    'Heating Load',
    'Cooling Load',
]

data = pd.read_excel(url)
data.columns = column_names

data.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [38]:
# Basic dataset information: per-column dtype and non-null count, plus memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Relative Compactness       768 non-null    float64
 1   Surface Area               768 non-null    float64
 2   Wall Area                  768 non-null    float64
 3   Roof Area                  768 non-null    float64
 4   Overall Height             768 non-null    float64
 5   Orientation                768 non-null    int64  
 6   Glazing Area               768 non-null    float64
 7   Glazing Area Distribution  768 non-null    int64  
 8   Heating Load               768 non-null    float64
 9   Cooling Load               768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [39]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307195,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090204,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [40]:
# Split the frame into the feature matrix and the two regression targets
feature_columns = [
    'Relative Compactness',
    'Surface Area',
    'Wall Area',
    'Roof Area',
    'Overall Height',
    'Orientation',
    'Glazing Area',
    'Glazing Area Distribution',
]
X = data.loc[:, feature_columns]
y_heating, y_cooling = data['Heating Load'], data['Cooling Load']

In [41]:
# Train/test split (80% / 20%).
# The features and BOTH targets are split in a single call so that every array is
# partitioned with the same row permutation. The previous version called
# train_test_split twice and overwrote X_train / X_test on the second call, which
# only stayed aligned with y_*_heating because both calls used the same random_state.
(
    X_train, X_test,
    y_train_heating, y_test_heating,
    y_train_cooling, y_test_cooling,
) = train_test_split(X, y_heating, y_cooling, test_size=0.2, random_state=42)

# Helper to train, predict with and score a model on a single hold-out split
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and return its MAE on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so any previous fit on the same object is replaced.
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : held-out features passed to ``predict`` and the true
        target values the predictions are compared against.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    return mean_absolute_error(y_test, model.predict(X_test))

In [42]:
# Baseline estimators with their default hyperparameters
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)

# Simple hold-out evaluation: each estimator is fitted and scored on the heating
# target first and then refitted and scored on the cooling target.
mae_heating_dt, mae_heating_rf = (
    evaluate_model(estimator, X_train, y_train_heating, X_test, y_test_heating)
    for estimator in (dt, rf)
)
mae_cooling_dt, mae_cooling_rf = (
    evaluate_model(estimator, X_train, y_train_cooling, X_test, y_test_cooling)
    for estimator in (dt, rf)
)

print(
    f"MAE for Heating Load (Decision Tree): {mae_heating_dt}",
    f"MAE for Heating Load (Random Forest): {mae_heating_rf}",
    f"MAE for Cooling Load (Decision Tree): {mae_cooling_dt}",
    f"MAE for Cooling Load (Random Forest): {mae_cooling_rf}",
    sep="\n",
)

MAE for Heating Load (Decision Tree): 0.42483116883116906
MAE for Heating Load (Random Forest): 0.35467402597402614
MAE for Cooling Load (Decision Tree): 1.15512987012987
MAE for Cooling Load (Random Forest): 1.0604350649350647


In [43]:
# K-fold cross-validation: 5 shuffled folds with a fixed random_state
cv = KFold(n_splits=5, shuffle=True, random_state=42)


def cross_val_model(model, X, y):
    """Return the mean cross-validated MAE of ``model`` on ``(X, y)``.

    The folds come from the module-level ``cv`` splitter, looked up when the
    function is called. Scoring uses ``'neg_mean_absolute_error'``, so the
    averaged score is negated before being returned.
    """
    mean_negated_mae = cross_val_score(
        model, X, y, cv=cv, scoring='neg_mean_absolute_error'
    ).mean()
    return -mean_negated_mae

# Cross-validated MAE of both baseline estimators, heating target first, then cooling
cv_mae_heating_dt, cv_mae_heating_rf = (
    cross_val_model(estimator, X, y_heating) for estimator in (dt, rf)
)
cv_mae_cooling_dt, cv_mae_cooling_rf = (
    cross_val_model(estimator, X, y_cooling) for estimator in (dt, rf)
)

print(
    f"Cross-validated MAE for Heating Load (Decision Tree): {cv_mae_heating_dt}",
    f"Cross-validated MAE for Heating Load (Random Forest): {cv_mae_heating_rf}",
    f"Cross-validated MAE for Cooling Load (Decision Tree): {cv_mae_cooling_dt}",
    f"Cross-validated MAE for Cooling Load (Random Forest): {cv_mae_cooling_rf}",
    sep="\n",
)

Cross-validated MAE for Heating Load (Decision Tree): 0.37623941940412553
Cross-validated MAE for Heating Load (Random Forest): 0.32230495390883607
Cross-validated MAE for Cooling Load (Decision Tree): 1.121264748323572
Cross-validated MAE for Cooling Load (Random Forest): 0.9965635794924026


In [54]:
# Fixed, manually chosen Random Forest hyperparameters (no search is performed here)
fixed_params = dict(
    n_estimators=500,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)

# Random Forest built from the fixed hyperparameters
rf_fixed = RandomForestRegressor(**fixed_params)

# Shuffled 5-fold splitter; cross_val_model reads this module-level name
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validated MAE for the heating target first, then the cooling target
cv_mae_heating_fixed, cv_mae_cooling_fixed = (
    cross_val_model(rf_fixed, X, target) for target in (y_heating, y_cooling)
)

print(
    f"Cross-validated MAE for Heating Load with fixed parameters: {cv_mae_heating_fixed}",
    f"Cross-validated MAE for Cooling Load with fixed parameters: {cv_mae_cooling_fixed}",
    sep="\n",
)

Cross-validated MAE for Heating Load with fixed parameters: 0.3652400522378088
Cross-validated MAE for Cooling Load with fixed parameters: 1.1382364558128317
