In [1]:
import pandas as pd
from pathlib import Path

RAW = Path("data/raw/energydata_complete.csv")
PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(RAW, parse_dates=["date"], skipinitialspace=True)
print(df.shape, df.dtypes[:5])
df.head()

(19735, 29) date          datetime64[ns]
Appliances             int64
lights                 int64
T1                   float64
RH_1                 float64
dtype: object


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
df["hour"] = df["date"].dt.hour
df["day"] = df["date"].dt.day
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.dayofweek

def get_season(m):
    if m in (12, 1, 2):  return "winter"
    if m in (3, 4, 5):   return "spring"
    if m in (6, 7, 8):   return "summer"
    return "autumn"

df["season"] = df["month"].apply(get_season)

df = pd.get_dummies(df, columns=["season"], drop_first=True)

df[["date","hour","day_of_week","month"] + [c for c in df.columns if c.startswith("season_")]].head()

Unnamed: 0,date,hour,day_of_week,month,season_winter,season_winter.1,season_winter.2,season_winter.3
0,2016-01-12 17:00:00,17,1,1,True,True,True,True
1,2016-01-12 17:10:00,17,1,1,True,True,True,True
2,2016-01-12 17:20:00,17,1,1,True,True,True,True
3,2016-01-12 17:30:00,17,1,1,True,True,True,True
4,2016-01-12 17:40:00,17,1,1,True,True,True,True


In [3]:
TARGET = "Appliances"
lags = [1, 6, 24, 72, 144]
for L in lags:
    df[f"{TARGET}_lag_{L}"] = df[TARGET].shift(L)

df[f"{TARGET}_roll6_mean"] = df[TARGET].rolling(window=6).mean()    
df[f"{TARGET}_roll24_mean"] = df[TARGET].rolling(window=24).mean()  
df[f"{TARGET}_roll144_mean"] = df[TARGET].rolling(window=144).mean() 

before = len(df)
df = df.dropna().reset_index(drop=True)
after = len(df)
df.filter(regex="Appliances(_lag_|_roll)").head()

Unnamed: 0,Appliances_lag_1,Appliances_lag_6,Appliances_lag_24,Appliances_lag_72,Appliances_lag_144,Appliances_roll6_mean,Appliances_roll24_mean,Appliances_roll144_mean
0,40.0,30.0,60.0,50.0,60.0,45.0,91.25,104.097222
1,60.0,40.0,290.0,50.0,60.0,48.333333,81.666667,104.097222
2,60.0,50.0,130.0,40.0,50.0,75.0,85.0,105.208333
3,210.0,40.0,140.0,40.0,50.0,131.666667,95.0,107.5
4,380.0,40.0,240.0,40.0,60.0,186.666667,100.416667,109.652778
