# Imports

In [102]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from tqdm.notebook import tqdm as tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Data Preparation

```
# Exploratory Data Analysis
```



In [65]:
print(torch.__version__)

2.3.1+cpu


In [73]:
df = pd.read_csv("content/household_power_consumption_household_power_consumption.csv")

In [74]:
# Only the last expression of a cell is displayed, so this head() call shows nothing here.
df.head()
df.shape  # 9 columns: 8 independent & 1 dependent variable

(260640, 9)

In [75]:
df.head(20)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,1/1/07,0:00:00,2.58,0.136,241.97,10.6,0,0,0.0
1,1/1/07,0:01:00,2.552,0.1,241.75,10.4,0,0,0.0
2,1/1/07,0:02:00,2.55,0.1,241.64,10.4,0,0,0.0
3,1/1/07,0:03:00,2.55,0.1,241.71,10.4,0,0,0.0
4,1/1/07,0:04:00,2.554,0.1,241.98,10.4,0,0,0.0
5,1/1/07,0:05:00,2.55,0.1,241.83,10.4,0,0,0.0
6,1/1/07,0:06:00,2.534,0.096,241.07,10.4,0,0,0.0
7,1/1/07,0:07:00,2.484,0.0,241.29,10.2,0,0,0.0
8,1/1/07,0:08:00,2.468,0.0,241.23,10.2,0,0,0.0
9,1/1/07,0:09:00,2.486,0.0,242.18,10.2,0,0,0.0


In [None]:
# Move the two global power columns to the end of the frame: pop() removes a
# column and returns it, and assigning it back appends it as the last column.
global_active_power, global_reactive_power = (
    df.pop(column_name)
    for column_name in ('Global_active_power', 'Global_reactive_power')
)
df['Global_active_power'] = global_active_power
df['Global_reactive_power'] = global_reactive_power

In [76]:
# df.dtypes
df.infer_objects().dtypes

Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object

In [77]:
len(df.columns)

9

In [78]:
print(df.columns)

Index(['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')


In [87]:
# Parse the calendar date (day-first), then combine it with the time-of-day
# text into one timestamp per row.
parsed_dates = pd.to_datetime(df['Date'], dayfirst=True)
df['Date'] = parsed_dates
timestamp_text = parsed_dates.dt.strftime('%Y-%m-%d') + ' ' + df['Time']
df['DateTime'] = pd.to_datetime(timestamp_text)

# Use the combined timestamp as the index; the separate Date/Time columns are
# redundant once it is in place.
df.set_index('DateTime', inplace=True)
df.drop(columns=['Date', 'Time'], inplace=True)

  df['Date'] = pd.to_datetime(df['Date'])


In [91]:
# Convert every column to a numeric dtype. With errors='coerce', values that
# cannot be parsed as numbers become NaN instead of raising.
for i, column_name in enumerate(df.columns):
    try:
        df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
    except Exception as e:
        print(f"Could not convert column {i}: {e}")
    else:
        print(f"Converted column {i} to numeric")

Converted column 2 to numeric
Converted column 3 to numeric
Converted column 4 to numeric
Converted column 5 to numeric
Converted column 6 to numeric
Converted column 7 to numeric
Converted column 8 to numeric
Converted column 9 to numeric
Converted column 10 to numeric
Converted column 11 to numeric


In [None]:
df.infer_objects().dtypes

In [93]:
df.head(500)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
0,2007-01-01,0 days 00:00:00,2.580,0.136,241.97,10.6,0.0,0.0,0.0,0,1,1,0,0
1,2007-01-01,0 days 00:01:00,2.552,0.100,241.75,10.4,0.0,0.0,0.0,0,1,1,0,1
2,2007-01-01,0 days 00:02:00,2.550,0.100,241.64,10.4,0.0,0.0,0.0,0,1,1,0,2
3,2007-01-01,0 days 00:03:00,2.550,0.100,241.71,10.4,0.0,0.0,0.0,0,1,1,0,3
4,2007-01-01,0 days 00:04:00,2.554,0.100,241.98,10.4,0.0,0.0,0.0,0,1,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2007-01-01,0 days 08:15:00,2.456,0.098,241.33,10.2,0.0,0.0,0.0,0,1,1,8,15
496,2007-01-01,0 days 08:16:00,2.444,0.094,240.72,10.0,0.0,0.0,0.0,0,1,1,8,16
497,2007-01-01,0 days 08:17:00,2.438,0.094,240.49,10.0,0.0,0.0,0.0,0,1,1,8,17
498,2007-01-01,0 days 08:18:00,2.372,0.000,240.59,9.8,0.0,0.0,0.0,0,1,1,8,18


In [94]:
# Remove the raw Date/Time columns if they are still present. The DateTime
# indexing cell above already drops them, so a plain drop() here raises KeyError
# when the notebook is run top to bottom; errors='ignore' makes this cell safe
# to run (and re-run) either way.
df.drop(columns=['Date', 'Time'], errors='ignore', inplace=True)

# Show the resulting shape (last expression, so it is actually displayed).
df.shape

In [95]:
df.infer_objects().dtypes

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dayofweek                  int32
month                      int32
dayofyear                  int32
Hour                       int64
Minute                     int64
dtype: object

In [96]:
df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
0,2.58,0.136,241.97,10.6,0.0,0.0,0.0,0,1,1,0,0
1,2.552,0.1,241.75,10.4,0.0,0.0,0.0,0,1,1,0,1
2,2.55,0.1,241.64,10.4,0.0,0.0,0.0,0,1,1,0,2
3,2.55,0.1,241.71,10.4,0.0,0.0,0.0,0,1,1,0,3
4,2.554,0.1,241.98,10.4,0.0,0.0,0.0,0,1,1,0,4


In [None]:
# Hourly subsample: keep every 60th row, starting at position 1.
# .copy() gives df_default its own data, so the columns that later cells add or
# overwrite on it are not written to a slice of `df` (avoids pandas'
# SettingWithCopyWarning and any accidental write-through to `df`).
df_default = df[1::60].copy()

In [None]:
df_default.head()

In [None]:
df_default.infer_objects().dtypes

In [None]:
tempV = df_default['Voltage']
tempV.plot()

In [None]:
tempG = df_default['Global_intensity']
tempG.plot()

In [None]:
tempP = df_default['Global_active_power']
tempP.plot()

In [None]:
tempS = df_default['Sub_metering_1']
tempS.plot()

In [None]:
# Plot the Sub_metering_2 series selected in this cell (the cell previously
# re-plotted tempS, i.e. Sub_metering_1).
tempS2 = df_default['Sub_metering_2']
tempS2.plot()

In [None]:
# Plot the Sub_metering_3 series selected in this cell (the cell previously
# re-plotted tempS, i.e. Sub_metering_1).
tempS3 = df_default['Sub_metering_3']
tempS3.plot()

In [98]:
# One frame per sub-meter: the rows where that sub-meter's reading is not zero,
# restricted to the electrical columns plus that sub-meter's own column.
df_group_1 = df_default.loc[
    df_default['Sub_metering_1'] != 0,
    ['Voltage', 'Global_intensity', 'Sub_metering_1', 'Global_active_power', 'Global_reactive_power'],
]
df_group_2 = df_default.loc[
    df_default['Sub_metering_2'] != 0,
    ['Voltage', 'Global_intensity', 'Sub_metering_2', 'Global_active_power', 'Global_reactive_power'],
]
df_group_3 = df_default.loc[
    df_default['Sub_metering_3'] != 0,
    ['Voltage', 'Global_intensity', 'Sub_metering_3', 'Global_active_power', 'Global_reactive_power'],
]

In [99]:
# Summary statistics for the Sub_metering_3 group. For the other two groups use
# df_group_1.describe() / df_group_2.describe(); the df_sub_metering_1/2 names
# that were commented out here are not defined in the cells shown.
df_group_3.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
count,86996.0,86996.0,86996.0,86996.0,86996.0,90767.0,90767.0,90767.0,90767.0,90767.0
mean,2.182119,0.137726,237.769858,9.210244,17.219355,3.007007,4.504512,120.239305,13.261626,29.494078
std,1.157982,0.123673,3.554284,4.994707,1.344115,2.038492,2.979774,86.0915,5.973373,17.355877
min,0.16,0.0,223.49,0.8,1.0,0.0,1.0,1.0,0.0,0.0
25%,1.38,0.052,235.3,5.8,17.0,1.0,2.0,50.0,9.0,14.0
50%,1.652,0.108,238.13,7.0,17.0,3.0,4.0,111.0,13.0,30.0
75%,2.696,0.208,240.33,11.4,18.0,5.0,6.0,168.0,19.0,45.0
max,10.67,1.148,249.76,46.4,20.0,6.0,12.0,340.0,23.0,59.0


In [None]:
# One scatter plot per sub-meter group: sub-meter reading on x, global active
# power on y, with the top and right spines hidden on each figure.
for group_frame, meter_column in (
    (df_group_3, 'Sub_metering_3'),
    (df_group_2, 'Sub_metering_2'),
    (df_group_1, 'Sub_metering_1'),
):
    group_frame.plot(kind='scatter', x=meter_column, y='Global_active_power', s=32, alpha=.8)
    plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
df_group_3['Voltage'].plot(kind='hist', bins=20, title='Voltage')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Histogram of Global_active_power for the Sub_metering_3 group. The title now
# names the plotted column (it was copy-pasted as 'Voltage').
df_group_3['Global_active_power'].plot(kind='hist', bins=20, title='Global_active_power')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
df_group_3.plot(figsize=(7,6))

# Data Processing

```
#Feature Extraction
```

In [104]:
# Handle missing values first, so every feature derived below is built from
# complete inputs. Fill values are the per-column means of the full frame `df`.
# The result is assigned (rather than inplace=True) so df_default is rebound to
# a new frame instead of being mutated in place.
df_default = df_default.fillna(df.mean())

# Create interaction features.
# This must come after the imputation: df.mean() has no entry for this new
# column, so any NaN produced here would never be filled.
df_default['Kitchen_laundry_interaction'] = df_default['Sub_metering_1'] * df_default['Sub_metering_2']

# Scale numerical features (the interaction column is left unscaled, as before).
scaler = StandardScaler()
numerical_cols = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
df_default[numerical_cols] = scaler.fit_transform(df_default[numerical_cols])

# Save the engineered features.
# NOTE(review): index=False leaves the index out of the file. If the DateTime
# index set earlier is still in place, the timestamps are not saved — confirm
# this is intended before the time-series models read this file.
df_default.to_csv('engineered_features.csv', index=False)

# ARIMA


# SARIMA

# LSTM

# GRU