# Imports

In [24]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from tqdm.notebook import tqdm as tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

# Data Preperation

```
# Exploratory Data Analysis
```



In [25]:
print(torch.__version__)

2.3.0+cu121


In [26]:
df = pd.read_csv("/content/household_power_consumption_household_power_consumption.csv")

In [27]:
df.head()
df.shape #9 columns: 8 independant & 1 dependant variable

(114219, 9)

In [28]:
# df.dtypes
df.infer_objects().dtypes

Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object

In [29]:
len(df.columns)

9

In [32]:
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')
df['Time'] = df['Time'].astype(str)
df['Time'] = pd.to_timedelta(df['Time'])
for i in range(2, len(df.columns)):
    try:
        df[df.columns[i]] = pd.to_numeric(df[df.columns[i]], errors='coerce')
        print(f"Converted column {i} to numeric")
    except Exception as e:
        print(f"Could not convert column {i}: {e}")

Converted column 2 to numeric
Converted column 3 to numeric
Converted column 4 to numeric
Converted column 5 to numeric
Converted column 6 to numeric
Converted column 7 to numeric
Converted column 8 to numeric


In [33]:
df.infer_objects().dtypes

Date                      datetime64[ns]
Time                     timedelta64[ns]
Global_active_power              float64
Global_reactive_power            float64
Voltage                          float64
Global_intensity                 float64
Sub_metering_1                   float64
Sub_metering_2                   float64
Sub_metering_3                   float64
dtype: object

In [36]:
df.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2007-01-01,0 days 00:00:00,2.58,0.136,241.97,10.6,0.0,0.0,0.0
1,2007-01-01,0 days 00:01:00,2.552,0.1,241.75,10.4,0.0,0.0,0.0
2,2007-01-01,0 days 00:02:00,2.55,0.1,241.64,10.4,0.0,0.0,0.0
3,2007-01-01,0 days 00:03:00,2.55,0.1,241.71,10.4,0.0,0.0,0.0
4,2007-01-01,0 days 00:04:00,2.554,0.1,241.98,10.4,0.0,0.0,0.0


In [37]:
columns = list(df.columns)
columns.remove('Global_active_power')
columns.remove('Global_reactive_power')
columns.extend(['Global_active_power', 'Global_reactive_power'])

df = df[columns]

In [38]:
df.head()

Unnamed: 0,Date,Time,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Global_active_power,Global_reactive_power
0,2007-01-01,0 days 00:00:00,241.97,10.6,0.0,0.0,0.0,2.58,0.136
1,2007-01-01,0 days 00:01:00,241.75,10.4,0.0,0.0,0.0,2.552,0.1
2,2007-01-01,0 days 00:02:00,241.64,10.4,0.0,0.0,0.0,2.55,0.1
3,2007-01-01,0 days 00:03:00,241.71,10.4,0.0,0.0,0.0,2.55,0.1
4,2007-01-01,0 days 00:04:00,241.98,10.4,0.0,0.0,0.0,2.554,0.1


In [40]:
df_group_1 = df[df['Sub_metering_1'] != 0][['Date', 'Time', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Global_active_power', 'Global_reactive_power']]
df_group_2 = df[df['Sub_metering_2'] != 0][['Date', 'Time', 'Voltage', 'Global_intensity', 'Sub_metering_2', 'Global_active_power', 'Global_reactive_power']]
df_group_3 = df[df['Sub_metering_3'] != 0][['Date', 'Time', 'Voltage', 'Global_intensity', 'Sub_metering_3', 'Global_active_power', 'Global_reactive_power']]

In [41]:
# df_sub_metering_1.describe()
# df_sub_metering_2.describe()
df_group_3.describe()

Unnamed: 0,Date,Time,Voltage,Global_intensity,Sub_metering_3,Global_active_power,Global_reactive_power
count,44368,44368,44364.0,44363.0,44363.0,44364.0,44364.0
mean,2007-04-15 01:25:33.213126400,0 days 13:52:35.183465560,238.986213,10.001145,17.410612,2.379899,0.136574
min,2007-01-01 00:00:00,0 days 00:00:00,23.0,1.0,1.0,0.248,0.0
25%,2007-01-28 00:00:00,0 days 09:30:00,237.08,6.0,17.0,1.436,0.046
50%,2007-02-28 00:00:00,0 days 13:50:00,239.09,7.8,18.0,1.876,0.11
75%,2007-07-02 00:00:00,0 days 18:52:00,241.1,12.8,18.0,3.046,0.21
max,2007-12-03 00:00:00,0 days 23:59:00,249.76,46.4,20.0,10.67,0.862
std,,0 days 05:46:08.219326181,3.251082,5.309725,1.253787,1.230224,0.124969


# Data Processing

# ARIMA


# SARIMA

# LSTM

In [None]:
def move_sliding_window(data, window_size, inputs_cols_indices, label_col_index):

    inputs = np.zeros((len(data) - window_size, window_size, len(inputs_cols_indices)))
    labels = np.zeros(len(data) - window_size)

    for i in range(window_size, len(data)):
        inputs[i - window_size] = data[i - window_size : i, inputs_cols_indices]
        labels[i - window_size] = data[i, label_col_index]
    inputs = inputs.reshape(-1, window_size, len(inputs_cols_indices))
    labels = labels.reshape(-1, 1)
    print(inputs.shape, labels.shape)

    return inputs, labels

In [None]:
label_col_index = 0
inputs_cols_indices = range(7)

window_size = 90

label_scalers = {}

train_x = []
test_x = {}
test_y = {}


# GRU