In [9]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [23]:
# Load the smart-home dataset straight from the Hugging Face hub into a DataFrame.
# (A local copy named 'smart_home_dataset.csv' can be passed to pd.read_csv instead of the URL.)
url = "https://huggingface.co/datasets/panda04/smart-home-dataset/raw/main/smart_home_dataset.csv"
data2 = pd.read_csv(url)
# Preview the first ten rows.
data2.head(10)

Unnamed: 0,Unix Timestamp,Transaction_ID,Television,Dryer,Oven,Refrigerator,Microwave,Line Voltage,Voltage,Apparent Power,Energy Consumption (kWh),Month,Day of the Week,Hour of the Day,Offloading Decision
0,1577836800,1,0,0,0,1,0,237,233,1559,24.001763,January,Wednesday,0,Local
1,1577839322,2,0,1,0,0,1,232,230,1970,31.225154,January,Wednesday,0,Remote
2,1577841845,3,0,1,0,0,0,223,222,1684,70.4607,January,Wednesday,1,Remote
3,1577844368,4,1,0,1,1,0,225,224,1694,32.264043,January,Wednesday,2,Remote
4,1577846891,5,1,0,0,1,0,222,214,1889,32.728111,January,Wednesday,2,Local
5,1577849414,6,0,1,0,1,0,235,227,1503,63.93842,January,Wednesday,3,Local
6,1577851937,7,0,1,0,0,1,237,237,1739,87.59289,January,Wednesday,4,Local
7,1577854460,8,0,1,1,1,0,235,227,1996,37.796262,January,Wednesday,4,Local
8,1577856983,9,0,0,1,1,0,238,235,1549,74.55393,January,Wednesday,5,Local
9,1577859506,10,0,0,0,1,1,231,230,1771,48.763756,January,Wednesday,6,Remote


# Feature Engineering and Encoding

In [24]:
# Remove the two identifier columns from the frame.
data2 = data2.drop(columns=['Transaction_ID', 'Unix Timestamp'])

In [25]:
# Flag rows whose hour lies in 6-9 or 18-21 (both ends inclusive) with 1, every other row with 0.
hour_of_day = data2['Hour of the Day']
morning_peak = hour_of_day.between(6, 9)
evening_peak = hour_of_day.between(18, 21)
data2['is_peak_hour'] = (morning_peak | evening_peak).astype('int64')

In [26]:
# Row-wise sum of the five appliance columns.
appliance_flags = data2[['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']]
data2['total_appliance_usage'] = appliance_flags.sum(axis='columns')

In [27]:
# A row counts as "high consumption" (1) when its energy value is strictly above
# the 75th percentile of the column; otherwise it is 0.
energy_kwh = data2['Energy Consumption (kWh)']
consumption_threshold = energy_kwh.quantile(0.75)
data2['is_high_consumption'] = (energy_kwh > consumption_threshold).astype('int64')

In [28]:
# Bucket the hour into four six-hour parts of the day.
# right=False makes every bin closed on the left and open on the right:
#   [0, 6) night, [6, 12) morning, [12, 18) afternoon, [18, 24) evening.
# With pd.cut's default (right-closed bins, lowest edge excluded) hour 0 fell
# outside every bin and became NaN, and hours 6, 12 and 18 were assigned to the
# preceding part of the day.
data2['part_of_day'] = pd.cut(data2['Hour of the Day'],
                              bins=[0, 6, 12, 18, 24],
                              labels=['night', 'morning', 'afternoon', 'evening'],
                              right=False)

In [29]:
# 1 for Saturday/Sunday rows, 0 for every other day name.
weekend_days = ['Saturday', 'Sunday']
data2['is_weekend'] = data2['Day of the Week'].isin(weekend_days).astype('int64')

In [30]:
# Month names grouped by season; flattened below into a month -> season lookup.
months_by_season = {
    'Winter': ('December', 'January', 'February'),
    'Spring': ('March', 'April', 'May'),
    'Summer': ('June', 'July', 'August'),
    'Fall': ('September', 'October', 'November'),
}
season_dict = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
# Month names missing from the lookup map to NaN.
data2['Season'] = data2['Month'].map(season_dict)

In [31]:
# Cyclical encoding of the hour: project it onto a circle with a 24-hour period.
hour_angle = 2 * np.pi * data2['Hour of the Day'] / 24
data2['hour_sin'] = np.sin(hour_angle)
data2['hour_cos'] = np.cos(hour_angle)

In [32]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the four measurement columns, overwriting them in place.
# NOTE(review): the scaler is fitted on the full frame, before the train/validation/test split.
features_to_scale = ['Line Voltage', 'Voltage', 'Apparent Power', 'Energy Consumption (kWh)']
scaler = MinMaxScaler()
data2[features_to_scale] = scaler.fit(data2[features_to_scale]).transform(data2[features_to_scale])

In [33]:
appliances = ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']

# The measurement columns in data2 were min-max scaled by the previous cell, so
# their smallest row is exactly 0. Dividing by those scaled values makes the
# ratios depend on the scaling and blows them up to ~1e6 near the minimum.
# Recover the original units with the scaler fitted in the previous cell and
# build the ratios from those instead.
# This cell must run while `scaler` is still the one fitted on `features_to_scale`;
# a scaler fitted on other columns makes inverse_transform raise instead of
# silently producing wrong numbers.
raw_measurements = pd.DataFrame(
    scaler.inverse_transform(data2[features_to_scale]),
    columns=features_to_scale,
    index=data2.index,
)
raw_energy = raw_measurements['Energy Consumption (kWh)']

# Appliance on/off flag divided by the row's energy consumption.
for appliance in appliances:
    data2[f'{appliance}_efficiency_ratio'] = data2[appliance] / (raw_energy + 1e-6)  # Prevent division by zero

# Apparent power relative to the product of the two voltage readings.
data2['power_factor'] = raw_measurements['Apparent Power'] / (
    raw_measurements['Line Voltage'] * raw_measurements['Voltage'] + 1e-6
)

# Row-wise sum of the appliance columns, and energy per unit of that sum.
data2['active_appliances'] = data2[appliances].sum(axis=1)
data2['energy_per_active_appliance'] = raw_energy / (data2['active_appliances'] + 1e-6)

In [34]:
# Display the column index to check which features exist at this point.
data2.columns

Index(['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave',
       'Line Voltage', 'Voltage', 'Apparent Power', 'Energy Consumption (kWh)',
       'Month', 'Day of the Week', 'Hour of the Day', 'Offloading Decision',
       'is_peak_hour', 'total_appliance_usage', 'is_high_consumption',
       'part_of_day', 'is_weekend', 'Season', 'hour_sin', 'hour_cos',
       'Television_efficiency_ratio', 'Dryer_efficiency_ratio',
       'Oven_efficiency_ratio', 'Refrigerator_efficiency_ratio',
       'Microwave_efficiency_ratio', 'power_factor', 'active_appliances',
       'energy_per_active_appliance'],
      dtype='object')

# Categorical Encoding

In [35]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that get expanded into indicator columns.
cat_cols = ['Season', 'part_of_day', 'Day of the Week', 'Offloading Decision']

# One indicator column per category; drop_first=True omits the first category of
# each column so it acts as the baseline.
data2_encoded = pd.get_dummies(data=data2, columns=cat_cols, drop_first=True)


In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column-wise preprocessor: one-hot encode the categorical columns (first category
# dropped) and pass every remaining column through untouched.
categorical_encoder = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_encoder, cat_cols)],
    remainder='passthrough',
)


In [37]:
# Display the columns of the one-hot encoded frame.
data2_encoded.columns

Index(['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave',
       'Line Voltage', 'Voltage', 'Apparent Power', 'Energy Consumption (kWh)',
       'Month', 'Hour of the Day', 'is_peak_hour', 'total_appliance_usage',
       'is_high_consumption', 'is_weekend', 'hour_sin', 'hour_cos',
       'Television_efficiency_ratio', 'Dryer_efficiency_ratio',
       'Oven_efficiency_ratio', 'Refrigerator_efficiency_ratio',
       'Microwave_efficiency_ratio', 'power_factor', 'active_appliances',
       'energy_per_active_appliance', 'Season_Spring', 'Season_Summer',
       'Season_Winter', 'part_of_day_morning', 'part_of_day_afternoon',
       'part_of_day_evening', 'Day of the Week_Monday',
       'Day of the Week_Saturday', 'Day of the Week_Sunday',
       'Day of the Week_Thursday', 'Day of the Week_Tuesday',
       'Day of the Week_Wednesday', 'Offloading Decision_Remote'],
      dtype='object')

In [38]:
# The engineered count/ratio features below have not been normalized yet, so
# min-max scale them too. Note that this rebinds `scaler` to a new instance.
final_numeric = ['energy_per_active_appliance', 'total_appliance_usage', 'active_appliances']

scaler = MinMaxScaler().fit(data2_encoded[final_numeric])
data2_encoded[final_numeric] = scaler.transform(data2_encoded[final_numeric])


# Split Dataset (Train/Test/Validation)

In [39]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the feature matrix.
X = data2_encoded.drop('is_high_consumption', axis=1)
y = data2_encoded['is_high_consumption']

# 60% train, 20% validation, 20% test, taken as consecutive slices of the frame.
# shuffle=False is essential: create_sequences() later builds windows from
# consecutive rows of each split and predicts the row that follows, which only
# makes sense if the rows inside a split keep their original order. The default
# (shuffle=True) would mix rows from arbitrary points in time into every window.
# Assumes the file is ordered chronologically, as the sample rows suggest — TODO confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)


# Encoding Month as an integer so the data can be converted to float32

In [40]:
# Month name -> calendar month number.
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Replace month names by their number in each split.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell keeps the already-converted numbers instead of turning
# the whole column into NaN.
for split_features in (X_train, X_val, X_test):
    split_features['Month'] = split_features['Month'].map(
        lambda month: month_mapping.get(month, month)
    )


# Convert All Data to float32
TensorFlow works best with np.float32 (especially for deep learning models)

In [41]:
import numpy as np

# Cast the feature frames and the label series of every split to float32.
X_train_tf, X_val_tf, X_test_tf = (
    features.astype(np.float32) for features in (X_train, X_val, X_test)
)
y_train_tf, y_val_tf, y_test_tf = (
    labels.astype(np.float32) for labels in (y_train, y_val, y_test)
)


# Converting to tensor datasets

In [42]:
import tensorflow as tf

# One (features, label) example per row, served in batches of 32.
# These three names are assigned again further down, from the windowed arrays.
train_ds, val_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices((features.values, labels.values)).batch(32)
    for features, labels in (
        (X_train_tf, y_train_tf),
        (X_val_tf, y_val_tf),
        (X_test_tf, y_test_tf),
    )
)


In [43]:
def create_sequences(X, y, time_steps=24):
    """Turn row-wise data into sliding windows with a next-step target.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with ``y[i + time_steps]``, i.e. the label of the row right after the window.

    Parameters
    ----------
    X : array-like
        Features, one row per time step. Anything ``np.asarray`` accepts,
        including a pandas DataFrame.
    y : array-like
        Targets, aligned row by row with ``X``. Converted with ``np.asarray``
        so indexing is always positional, even for a pandas Series whose index
        is not 0..n-1.
    time_steps : int, default 24
        Number of consecutive rows per window.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Windows of shape ``(len(X) - time_steps, time_steps, *X.shape[1:])`` and
        the matching targets. When ``X`` has ``time_steps`` rows or fewer, both
        arrays are empty but keep their trailing dimensions.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    # Work on plain arrays so slicing and indexing are positional.
    X = np.asarray(X)
    y = np.asarray(y)

    n_windows = len(X) - time_steps
    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empties.
        empty_windows = np.empty((0, time_steps) + X.shape[1:], dtype=X.dtype)
        empty_targets = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_windows, empty_targets

    Xs = [X[start:start + time_steps] for start in range(n_windows)]
    ys = [y[start + time_steps] for start in range(n_windows)]
    return np.array(Xs), np.array(ys)


In [44]:
# Number of consecutive rows in each input window.
# NOTE(review): the sample rows printed after loading are ~42 minutes apart, so
# 24 rows cover less than a full day — confirm the intended window length.
TIME_STEPS = 24

# Windowed (features, next-row label) arrays for each split.
X_train_seq, y_train_seq = create_sequences(
    X_train_tf.values, y_train_tf.values, time_steps=TIME_STEPS
)
X_val_seq, y_val_seq = create_sequences(
    X_val_tf.values, y_val_tf.values, time_steps=TIME_STEPS
)
X_test_seq, y_test_seq = create_sequences(
    X_test_tf.values, y_test_tf.values, time_steps=TIME_STEPS
)


In [45]:
# tf.data pipelines over the windowed arrays, in batches of 32.
# Only the training pipeline is shuffled (buffer of 1000 examples).
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_train_seq, y_train_seq))
    .shuffle(1000)
    .batch(32)
)
val_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_val_seq, y_val_seq))
    .batch(32)
)
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_test_seq, y_test_seq))
    .batch(32)
)
