# ⚡ Day 2 — Data Preprocessing & Feature Engineering
## Energy Consumption Forecasting | Claysys AI Hackathon 2026

**Date:** February 20, 2026  
**Objective:** Clean the dataset, handle missing values, resample to hourly frequency, and engineer rich features for ML/DL models.

---

## 1. Setup & Imports

In [None]:
# Put the parent directory first on sys.path so the `src.*` imports below
# resolve (presumably the project root — confirm against the repo layout).
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices — narrow the filter if those matter.
warnings.filterwarnings('ignore')

from src.data_loader import load_raw_data, resample_data, split_train_test
from src.preprocessing import (
    clean_data, remove_outliers, add_time_features,
    add_lag_features, add_rolling_features, add_energy_derived_features, normalize
)

# Shared plotting defaults applied to every figure created after this point.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.dpi': 120, 'font.size': 10})
# The status emoji was stored as mis-decoded UTF-8; restored to the real character.
print('✅ Setup complete for Day 2')

## 2. Load Raw Data

In [None]:
# Relative path to the raw dataset file handed to the project loader.
DATA_PATH = '../data/raw/household_power_consumption.txt'
# Load through the project helper; verbose=True is forwarded to it
# (presumably prints a load summary — confirm in src.data_loader).
df_raw = load_raw_data(DATA_PATH, verbose=True)

## 3. Handle Missing Values

In [None]:
# Fill missing values with the 'interpolate' strategy, which keeps the series
# continuous in time (presumably linear interpolation — confirm in
# src.preprocessing.clean_data).
df_clean = clean_data(df_raw, strategy='interpolate')

# Sanity check: total count of NaN cells left across all columns.
print(f'\nRemaining NaN after cleaning: {df_clean.isna().sum().sum()}')

## 4. Resample to Hourly Frequency

In [None]:
# Bring the minute-level readings to hourly frequency (less noise, more
# practical for forecasting) and discard any row that is still NaN afterwards.
# NOTE(review): the aggregation is whatever resample_data applies for freq='h'
# (presumably the mean — confirm in src.data_loader). Dropping rows can leave
# gaps in the hourly index, which matters for the row-based lag features
# built later.
df_hourly = resample_data(df_clean, freq='h').dropna()

print(f'\nHourly dataset shape: {df_hourly.shape}')
# Last expression of the cell: shows the first rows as the cell output.
df_hourly.head()

## 5. Outlier Detection & Removal

In [None]:
from pathlib import Path

# Side-by-side box plots of the target column before and after outlier removal.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

axes[0].boxplot(df_hourly['Global_active_power'].dropna(), vert=False)
axes[0].set_title('Before Outlier Removal', fontweight='bold')
axes[0].set_xlabel('Global Active Power (kW)')

# df_hourly is rebound to the filtered result, so re-running this cell applies
# the filter again to already-filtered data.
# NOTE(review): z_thresh=4.0 is presumably a z-score cut-off — confirm in
# src.preprocessing.remove_outliers.
df_hourly = remove_outliers(df_hourly, 'Global_active_power', z_thresh=4.0)

axes[1].boxplot(df_hourly['Global_active_power'].dropna(), vert=False)
axes[1].set_title('After Outlier Removal', fontweight='bold')
axes[1].set_xlabel('Global Active Power (kW)')

plt.tight_layout()
# savefig() does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/figures/outlier_removal.png', bbox_inches='tight')
plt.show()
print(f'Dataset shape after outlier removal: {df_hourly.shape}')

## 6. Feature Engineering

In [None]:
# Build the feature table as a single pipeline; each stage receives the frame
# returned by the previous one.
df_feat = (
    df_hourly
    # Step 1: domain-knowledge derived features
    .pipe(add_energy_derived_features)
    # Step 2: calendar + cyclical time features
    .pipe(add_time_features)
    # Step 3: lags of the target, counted in rows of the hourly frame
    # (1, 2, 3, 6, 12, 24, 48 and 168 rows; 168 hours = 1 week)
    .pipe(add_lag_features, target_col='Global_active_power',
          lags=[1, 2, 3, 6, 12, 24, 48, 168])
    # Step 4: rolling statistics of the target over several window lengths
    .pipe(add_rolling_features, target_col='Global_active_power',
          windows=[3, 6, 12, 24, 48, 168])
    # Lag/rolling features leave NaN in the earliest rows; drop those rows
    .dropna()
)

print(f'\nFinal feature set shape: {df_feat.shape}')
print(f'Total features: {df_feat.shape[1]}')
print(f'\nAll features:\n{list(df_feat.columns)}')

## 7. Feature Importance Preview (Correlation with Target)

In [None]:
from pathlib import Path

# Correlation of every numeric column with the target (DataFrame.corr()
# defaults to Pearson); the target's self-correlation is removed.
numeric_cols = df_feat.select_dtypes(include=[np.number]).columns
corr_with_target = df_feat[numeric_cols].corr()['Global_active_power'].drop('Global_active_power')
# Rank by magnitude and keep the 20 strongest.
corr_sorted = corr_with_target.abs().sort_values(ascending=False).head(20)

fig, ax = plt.subplots(figsize=(10, 6))
# Bar length is |r|; the colour keeps the sign of the original correlation
# (red when positive, blue otherwise).
colors = ['#F44336' if v > 0 else '#2196F3' for v in corr_with_target[corr_sorted.index]]
# Reverse so the strongest feature is drawn at the top of the horizontal chart.
ax.barh(corr_sorted.index[::-1], corr_sorted.values[::-1], color=colors[::-1], alpha=0.85)
ax.set_title('Top 20 Features by Correlation with Global Active Power', fontweight='bold')
ax.set_xlabel('|Pearson Correlation|')
ax.axvline(0.5, color='gray', linestyle='--', linewidth=1, label='0.5 threshold')
ax.legend()
plt.tight_layout()
# savefig() does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/figures/feature_correlation_target.png', bbox_inches='tight')
plt.show()

## 8. Train / Test Split

In [None]:
from pathlib import Path

# Hold out the last 3 months for final testing
train_df, test_df = split_train_test(df_feat, test_months=3)

print(f'\nTrain shape : {train_df.shape}')
print(f'Test shape  : {test_df.shape}')

# Visualize the split: both segments on one time axis, with a dashed line at
# the first timestamp of the hold-out set.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(train_df.index, train_df['Global_active_power'], label='Train', color='steelblue', linewidth=0.7)
ax.plot(test_df.index, test_df['Global_active_power'], label='Test (hold-out)', color='tomato', linewidth=0.7)
ax.axvline(test_df.index.min(), color='black', linestyle='--', linewidth=1.5, label='Split point')
ax.set_title('Train / Test Split', fontweight='bold')
ax.set_ylabel('Global Active Power (kW)')
ax.legend()
plt.tight_layout()
# savefig() does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/figures/train_test_split.png', bbox_inches='tight')
plt.show()

## 9. Normalize Features

In [None]:
import joblib
from pathlib import Path

# Select numeric columns for scaling. The raw calendar/indicator columns
# listed here are left unscaled; every other numeric column is scaled.
scale_cols = [c for c in df_feat.select_dtypes(include=[np.number]).columns
              if c not in ['is_weekend', 'hour', 'dayofweek', 'month', 'quarter', 'year', 'dayofyear', 'weekofyear']]

# normalize() receives train and test separately and returns the scaled frames
# plus the scaler object.
# NOTE(review): presumably the scaler is fitted on the training frame only —
# confirm in src.preprocessing.normalize.
train_scaled, test_scaled, scaler = normalize(train_df, test_df, columns=scale_cols, method='minmax')

# Save the scaler for use in later notebooks. parents=True makes directory
# creation succeed even if intermediate folders are missing.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, '../models/minmax_scaler.pkl')
# The status emoji was stored as mis-decoded UTF-8; restored to the real character.
print('💾 Scaler saved to models/minmax_scaler.pkl')

## 10. Save Processed Data

In [None]:
# The target is a nested path, so parents=True is needed for creation to
# succeed when an intermediate folder is missing.
Path('../data/processed').mkdir(parents=True, exist_ok=True)

# Save full feature set (unscaled — models can scale internally)
df_feat.to_csv('../data/processed/features_hourly.csv')
train_df.to_csv('../data/processed/train.csv')
test_df.to_csv('../data/processed/test.csv')

# The emoji and arrows below were stored as mis-decoded UTF-8; restored to the
# real characters.
print('✅ Processed datasets saved:')
print('   → data/processed/features_hourly.csv')
print('   → data/processed/train.csv')
print('   → data/processed/test.csv')
print(f'\n🎉 Day 2 Complete! {df_feat.shape[1]} features engineered.')
print('   Ready for Day 3: Baseline Statistical Models (ARIMA, Holt-Winters)')
print('   Ready for Day 3: Baseline Statistical Models (ARIMA, Holt-Winters)')