
# Consumo de Energia Residencial (UCI) — Notebook
Este caderno resolve, passo a passo, os 20 itens do trabalho usando o dataset **Individual Household Electric Power Consumption**.


## 0) Setup e carregamento

In [5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from pathlib import Path
# Locate the raw data file relative to the current working directory
# (usually the notebooks folder, one level below the repository root).
NOTEBOOK_DIR = Path.cwd().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent
DATA_FILE = PROJECT_ROOT.joinpath("data", "raw", "household_power_consumption.txt")

# The file is ';'-separated and marks missing readings with '?'.
df = pd.read_csv(
    DATA_FILE,
    sep=";",
    na_values=["?"],
    low_memory=False,
    encoding="latin-1",
)

# Build the time index from the Date and Time text columns; rows whose
# timestamp cannot be parsed become NaT and are dropped.
_stamp_text = df['Date'].astype(str) + ' ' + df['Time'].astype(str)
df['DateTime'] = pd.to_datetime(_stamp_text, format='%d/%m/%Y %H:%M:%S', errors='coerce')
df = df.dropna(subset=['DateTime'])
df = df.set_index('DateTime')
df = df.sort_index()

# Force the measurement columns to numeric dtype (unparseable values -> NaN).
num_cols = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]
df[num_cols] = df[num_cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Helper columns: calendar date as datetime and the English weekday name.
_parsed_date = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Date'] = _parsed_date
df['weekday'] = _parsed_date.dt.day_name()

# Quick sanity report: size, covered period and years present in the index.
print("Shape:", df.shape)
print("Período:", df.index.min(), "->", df.index.max())
print("Anos:", sorted(pd.unique(df.index.year)))


Shape: (2075259, 10)
Período: 2006-12-16 17:24:00 -> 2010-11-26 21:02:00
Anos: [np.int32(2006), np.int32(2007), np.int32(2008), np.int32(2009), np.int32(2010)]


### (1) 10 primeiras linhas

In [6]:
# Show the first ten rows of the indexed DataFrame.
df.iloc[:10]

Unnamed: 0_level_0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,weekday
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,2006-12-16,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,Saturday
2006-12-16 17:25:00,2006-12-16,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,Saturday
2006-12-16 17:26:00,2006-12-16,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,Saturday
2006-12-16 17:27:00,2006-12-16,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,Saturday
2006-12-16 17:28:00,2006-12-16,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,Saturday
2006-12-16 17:29:00,2006-12-16,17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0,Saturday
2006-12-16 17:30:00,2006-12-16,17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0,Saturday
2006-12-16 17:31:00,2006-12-16,17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0,Saturday
2006-12-16 17:32:00,2006-12-16,17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0,Saturday
2006-12-16 17:33:00,2006-12-16,17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0,Saturday



### (2) Diferença: `Global_active_power` x `Global_reactive_power`
- **Ativa (kW):** potência efetivamente convertida em trabalho útil (consumo real).
- **Reativa (kVAR):** potência que “vai e volta” entre a rede e cargas indutivas/capacitivas (campos magnéticos e elétricos); não realiza trabalho útil.


### (3) Valores ausentes por coluna

In [7]:
# Count missing values in every column.
df.isnull().sum()

Date                         0
Time                         0
Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
weekday                      0
dtype: int64

### (4) `Date` como datetime e coluna de dia da semana (já criados no setup)

In [None]:
# Preview the parsed date next to its weekday name (first five rows).
df.loc[:, ['Date', 'weekday']].head(5)

### (5) 2007: média diária de `Global_active_power`

In [None]:

# Restrict to calendar year 2007 (the index is sorted, so a label slice works)
# and average the active power per day.
df_2007 = df.loc['2007-01-01':'2007-12-31']
daily_mean_2007 = df_2007.resample('D')['Global_active_power'].mean()
daily_mean_2007.head(5)


### (6) Gráfico de um dia (linha) — ajuste a data se quiser

In [None]:

# Line plot of the active power over one day; change `dia` to inspect another date.
dia = "2007-02-02"
serie = df['Global_active_power'].loc[dia]

plt.figure()
serie.plot(title=f"Global_active_power — {dia}").set(xlabel="Hora", ylabel="kW")
plt.show()


### (7) Histograma de `Voltage` (olhar faixa/caudas)

In [None]:

# 50-bin histogram of Voltage to inspect its range and tails.
plt.figure()
df['Voltage'].plot.hist(
    bins=50,
    edgecolor='black',
    title='Distribuição de Voltage',
).set_xlabel('Volts')
plt.show()


### (8) Média mensal de `Global_active_power` (todo o período)

In [None]:

# Month-end mean of the active power over the whole period.
# An explicit MonthEnd offset is used instead of the 'M' string alias:
# pandas 2.2 deprecates 'M' in favor of 'ME', while older releases only accept
# 'M'. The offset object yields the same month-end bins on both.
monthly_mean = df['Global_active_power'].resample(pd.offsets.MonthEnd()).mean()
monthly_mean.head()


### (9) Dia com maior **soma diária** de `Global_active_power`

In [None]:

# Daily totals; min_count=1 keeps days without any reading as NaN instead of 0,
# so they cannot be picked as the maximum by accident.
daily_sum = df.resample('D')['Global_active_power'].sum(min_count=1)
max_day = daily_sum.idxmax()
print("Dia com maior consumo:", max_day.date(), "— soma:", daily_sum.at[max_day])


### (10) Dias úteis x fim de semana — médias

In [None]:

# dayofweek is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
is_weekend = df.index.dayofweek.isin([5, 6])
weekend_mean = df["Global_active_power"][is_weekend].mean()
weekday_mean = df["Global_active_power"][~is_weekend].mean()
print("Fim de semana:", weekend_mean, "kW")
print("Dias úteis:", weekday_mean, "kW")


### (11) Correlação entre variáveis principais

In [None]:

# Pairwise Pearson correlation between the four main electrical variables.
df.loc[
    :,
    ["Global_active_power", "Global_reactive_power", "Voltage", "Global_intensity"],
].corr(method="pearson")


### (12) `Total_Sub_metering = 1 + 2 + 3`

In [None]:

# Total of the three sub-metering channels per row.
# A plain row-wise sum skips NaN, so rows where every channel is missing would
# get a total of 0.0 and silently pull down every later mean, scaling and
# clustering step. Requiring all three values keeps those rows as NaN.
sub_cols = ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
df['Total_Sub_metering'] = df[sub_cols].sum(axis=1, min_count=len(sub_cols))
df['Total_Sub_metering'].head()


### (13) Meses em que `Total_Sub_metering` (média) > `Global_active_power` (média)

In [None]:

# Monthly means of both series, then keep only the months where the
# sub-metering total exceeds the global active power.
# An explicit MonthEnd offset replaces the 'M' alias (deprecated in pandas 2.2
# in favor of 'ME', which older releases do not accept); the bins are the same.
# NOTE(review): the UCI dataset description lists sub-metering in Wh per minute
# and Global_active_power in kW, so this compares different units as written —
# confirm whether a conversion (e.g. Global_active_power * 1000 / 60) is intended.
month_end = pd.offsets.MonthEnd()
m_total = df['Total_Sub_metering'].resample(month_end).mean()
m_gap   = df['Global_active_power'].resample(month_end).mean()
months_over = (m_total > m_gap)
months_over[months_over]


### (14) Série de `Voltage` — 2008 (se existir) ou ano disponível

In [None]:

# Use 2008 when it is present in the index; otherwise fall back to the last
# available year.
anos = sorted(pd.unique(df.index.year))
if 2008 in anos:
    year = 2008
else:
    year = anos[-1]

# Daily mean voltage for the chosen year.
mask = df.index.year == year
voltage_year = df['Voltage'][mask].resample('D').mean()

print("Ano usado no gráfico:", year)
plt.figure()
voltage_year.plot(title=f"Voltage — {year} (média diária)").set(ylabel="Volts", xlabel="Data")
plt.show()


### (15) Verão (JJA) vs Inverno (DJF) — médias de `Global_active_power`

In [None]:

# Seasonal means by calendar month: June–August vs December–February.
summer = df.loc[df.index.month.isin([6, 7, 8]), 'Global_active_power'].mean()
winter = df.loc[df.index.month.isin([12, 1, 2]), 'Global_active_power'].mean()
print("Verão (JJA):", summer, "kW")
print("Inverno (DJF):", winter, "kW")


### (16) Amostra aleatória de 1% e comparação da distribuição

In [None]:

# Draw a reproducible 1% random sample and compare its distribution with the
# full data set.
sample = df.sample(frac=0.01, random_state=42)

# Both histograms share the same bin edges and are normalized to densities.
# With raw counts the sample bars would be roughly 100x shorter than the full
# data, and independently computed bins would not line up, so the shapes could
# not be compared.
bin_edges = np.histogram_bin_edges(df['Global_active_power'].dropna(), bins=60)

plt.figure()
df['Global_active_power'].plot(kind='hist', bins=bin_edges, density=True, alpha=0.5)
sample['Global_active_power'].plot(kind='hist', bins=bin_edges, density=True, alpha=0.5)
plt.ylabel('Densidade')
plt.legend(['Base completa','Amostra 1%']); plt.show()


### (17) Normalização Min–Max (colunas numéricas principais)

In [None]:

# Columns rescaled to the [0, 1] range.
cols_scale = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
    "Total_Sub_metering",
]

# Fit the scaler on the original values, then write the scaled values into a
# copy so `df` itself stays untouched.
scaler = MinMaxScaler().fit(df[cols_scale])
df_scaled = df.copy()
df_scaled[cols_scale] = scaler.transform(df_scaled[cols_scale])
df_scaled[cols_scale].head(5)


### (18) K-means (k=3) em médias diárias — perfil dos clusters

In [None]:

# One row per day with the mean of each feature; days with any missing
# feature are discarded before clustering.
daily_features = df[
    [
        "Global_active_power",
        "Global_reactive_power",
        "Voltage",
        "Global_intensity",
        "Total_Sub_metering",
    ]
].resample("D").mean()
daily_features = daily_features.dropna()

# Scale the features to [0, 1] so no single variable dominates the distances.
scaler_d = MinMaxScaler()
X = scaler_d.fit_transform(daily_features)

# n_init is set explicitly for compatibility across scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)
daily_features["cluster"] = labels

print("Qtde de dias por cluster:")
print(daily_features["cluster"].value_counts().sort_index())

print("\nPerfil médio por cluster:")
daily_features.groupby("cluster").agg("mean").round(3)


### (19) Decomposição da série (6 meses) — opcional

In [None]:

# Optional step: runs only when statsmodels is installed and a usable window
# exists, so a missing dependency never breaks the notebook.
try:
    from statsmodels.tsa.seasonal import seasonal_decompose

    period = 7  # weekly cycle on daily observations
    daily_gap = df['Global_active_power'].resample('D').mean()

    # Use Jan–Jun of the first year that actually has data in that window.
    # Taking the first year in the index unconditionally fails when the
    # recording starts late in the year: the slice is empty and the
    # decomposition is never executed.
    six_months = None
    for first_year in sorted({int(y) for y in daily_gap.index.year}):
        window = daily_gap.loc[f'{first_year}-01':f'{first_year}-06']
        # Interpolate gaps instead of dropping days so observations stay
        # evenly spaced, which the fixed 7-day period relies on; dropna only
        # trims leading days that have nothing to interpolate from.
        window = window.interpolate().dropna()
        if len(window) >= 2 * period:  # at least two full cycles are needed
            six_months = window
            break
    if six_months is None:
        raise ValueError("nenhum ano com dados suficientes entre janeiro e junho")

    res = seasonal_decompose(six_months, model='additive', period=period)
except (ImportError, ValueError) as e:
    print("Decomposição não executada (statsmodels ausente ou outro motivo):", e)
else:
    fig = res.plot(); fig.set_size_inches(10,8)
    plt.show()


### (20) Regressão linear: prever `Global_active_power` usando `Global_intensity`

In [None]:

# Simple linear regression: predict Global_active_power from Global_intensity.
# Written to work with any scikit-learn version.

# Drop rows where either the feature or the target is missing in one step.
# Filtering only the feature and then looking the target up by index label
# would leave NaN targets in place (LinearRegression rejects them) and would
# duplicate rows if the index ever contained repeated timestamps.
reg_data = df[["Global_intensity", "Global_active_power"]].astype(float).dropna()
X = reg_data[["Global_intensity"]]
y = reg_data["Global_active_power"]

# Fixed random_state keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)  # computed by hand; the 'squared=False' argument is version-dependent
r2 = r2_score(y_test, pred)

print("Coeficiente:", lr.coef_[0])
print("Intercepto:", lr.intercept_)
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)
