In [None]:
# 03_feature_engineering_eda.ipynb

import sys
from pathlib import Path
import pandas as pd

# --- Fix Python path so we can import from scripts/ ---
# Assumes the kernel's working directory is backend/notebooks/, so its parent
# (backend/) is the project root that holds the scripts/ package.
project_root = Path.cwd().parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from scripts.utils import add_time_features, add_lag_roll_features, add_ratio_features, save_df

# --- Paths ---
PROC = project_root / "data" / "processed"

# --- Load cleaned data ---
df = pd.read_csv(PROC / "cleaned_merged.csv", parse_dates=['date'])

# --- Feature Engineering ---
# Each helper takes the frame and returns the frame to continue with.
df = add_time_features(df, date_col='date')
df = add_lag_roll_features(df, groupby_cols=['region'], target_col='usage_cpu')
df = add_ratio_features(df)

# Handle any missing values created by lag/rolling features.
# DataFrame.fillna(method=...) is deprecated in recent pandas releases; the
# dedicated ffill()/bfill() methods perform the same forward-then-backward fill.
# NOTE(review): this fill runs over the whole frame in row order, while the
# lag/rolling features above are grouped by 'region'. A gap at the start of one
# region's rows may therefore be filled from an adjacent region's rows --
# confirm whether a per-region fill is intended.
df = df.ffill().bfill()

# --- Save ---
save_df(df, PROC / "feature_engineered.csv")
print("✅ Saved feature_engineered.csv with shape:", df.shape)


ModuleNotFoundError: No module named 'backend.scripts'

In [None]:
# EDA plots: a correlation heatmap over the numeric columns and a per-region
# boxplot of CPU usage, both drawn from the saved feature-engineered CSV.
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(PROC / "feature_engineered.csv", parse_dates=["date"])

# Correlation heatmap (numeric columns only)
num_cols = df.select_dtypes(include="number").columns
corr_matrix = df[num_cols].corr()
heat_fig, heat_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("Feature Correlation Heatmap")
plt.show()

# Region-wise boxplot of CPU usage
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="region", y="usage_cpu", data=df, ax=box_ax)
# Tilt the region labels on the x axis by 45 degrees.
plt.setp(box_ax.get_xticklabels(), rotation=45)
box_ax.set_title("Region-wise CPU usage distribution")
plt.show()
