# Imports

In [151]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from tqdm.notebook import tqdm as tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

# Data Preperation

```
# Exploratory Data Analysis
```



In [152]:
print(torch.__version__)

2.3.0+cu121


In [153]:
df = pd.read_csv("/content/household_power_consumption_household_power_consumption.csv")

In [154]:
df.head()
df.shape #9 columns: 8 independant & 1 dependant variable

(260640, 9)

In [157]:
df.head(20)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,1/1/07,0:00:00,2.58,0.136,241.97,10.6,0,0,0.0
1,1/1/07,0:01:00,2.552,0.1,241.75,10.4,0,0,0.0
2,1/1/07,0:02:00,2.55,0.1,241.64,10.4,0,0,0.0
3,1/1/07,0:03:00,2.55,0.1,241.71,10.4,0,0,0.0
4,1/1/07,0:04:00,2.554,0.1,241.98,10.4,0,0,0.0
5,1/1/07,0:05:00,2.55,0.1,241.83,10.4,0,0,0.0
6,1/1/07,0:06:00,2.534,0.096,241.07,10.4,0,0,0.0
7,1/1/07,0:07:00,2.484,0.0,241.29,10.2,0,0,0.0
8,1/1/07,0:08:00,2.468,0.0,241.23,10.2,0,0,0.0
9,1/1/07,0:09:00,2.486,0.0,242.18,10.2,0,0,0.0


In [158]:
# df.dtypes
df.infer_objects().dtypes

Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object

In [159]:
len(df.columns)

9

In [160]:
print(df.columns)

Index(['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')


In [165]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
df['dayofweek'] = df['Date'].dt.dayofweek
df['month'] = df['Date'].dt.month
df['dayofyear'] = df['Date'].dt.dayofyear


In [166]:
df.infer_objects().dtypes

Date                     datetime64[ns]
Time                             object
Global_active_power              object
Global_reactive_power            object
Voltage                          object
Global_intensity                 object
Sub_metering_1                   object
Sub_metering_2                   object
Sub_metering_3                  float64
dayofweek                         int32
month                             int32
dayofyear                         int32
dtype: object

In [172]:
df['Time'] = df['Time'].astype(str)
df['Time'] = pd.to_timedelta(df['Time'])

for i in range(2, len(df.columns)):
    try:
        df[df.columns[i]] = pd.to_numeric(df[df.columns[i]], errors='coerce')
        print(f"Converted column {i} to numeric")
    except Exception as e:
        print(f"Could not convert column {i}: {e}")


Converted column 2 to numeric
Converted column 3 to numeric
Converted column 4 to numeric
Converted column 5 to numeric
Converted column 6 to numeric
Converted column 7 to numeric
Converted column 8 to numeric
Converted column 9 to numeric
Converted column 10 to numeric
Converted column 11 to numeric


In [179]:
df['Time'] = pd.to_timedelta(df['Time'])
df['Hour'] = df['Time'].dt.components['hours']
df['Minute'] = df['Time'].dt.components['minutes']

In [180]:
df.head(500)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
0,2007-01-01,0 days 00:00:00,2.580,0.136,241.97,10.6,0.0,0.0,0.0,0,1,1,0,0
1,2007-01-01,0 days 00:01:00,2.552,0.100,241.75,10.4,0.0,0.0,0.0,0,1,1,0,1
2,2007-01-01,0 days 00:02:00,2.550,0.100,241.64,10.4,0.0,0.0,0.0,0,1,1,0,2
3,2007-01-01,0 days 00:03:00,2.550,0.100,241.71,10.4,0.0,0.0,0.0,0,1,1,0,3
4,2007-01-01,0 days 00:04:00,2.554,0.100,241.98,10.4,0.0,0.0,0.0,0,1,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2007-01-01,0 days 08:15:00,2.456,0.098,241.33,10.2,0.0,0.0,0.0,0,1,1,8,15
496,2007-01-01,0 days 08:16:00,2.444,0.094,240.72,10.0,0.0,0.0,0.0,0,1,1,8,16
497,2007-01-01,0 days 08:17:00,2.438,0.094,240.49,10.0,0.0,0.0,0.0,0,1,1,8,17
498,2007-01-01,0 days 08:18:00,2.372,0.000,240.59,9.8,0.0,0.0,0.0,0,1,1,8,18


In [182]:
df.shape
df.drop(['Date', 'Time'], axis=1, inplace=True)

In [183]:
df.infer_objects().dtypes

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dayofweek                  int32
month                      int32
dayofyear                  int32
Hour                       int64
Minute                     int64
dtype: object

In [184]:
df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
0,2.58,0.136,241.97,10.6,0.0,0.0,0.0,0,1,1,0,0
1,2.552,0.1,241.75,10.4,0.0,0.0,0.0,0,1,1,0,1
2,2.55,0.1,241.64,10.4,0.0,0.0,0.0,0,1,1,0,2
3,2.55,0.1,241.71,10.4,0.0,0.0,0.0,0,1,1,0,3
4,2.554,0.1,241.98,10.4,0.0,0.0,0.0,0,1,1,0,4


In [185]:
df_group_1 = df[df['Sub_metering_1'] != 0][['Global_active_power','Global_reactive_power',	'Voltage',	'Global_intensity',	'Sub_metering_1', 'dayofweek',	'month',	'dayofyear',	'Hour',	'Minute']]
df_group_2 = df[df['Sub_metering_2'] != 0][['Global_active_power','Global_reactive_power',	'Voltage',	'Global_intensity',	'Sub_metering_2', 'dayofweek',	'month',	'dayofyear',	'Hour',	'Minute']]
df_group_3 = df[df['Sub_metering_3'] != 0][['Global_active_power','Global_reactive_power',	'Voltage',	'Global_intensity',	'Sub_metering_3', 'dayofweek',	'month',	'dayofyear',	'Hour',	'Minute']]

In [186]:
# df_sub_metering_1.describe()
# df_sub_metering_2.describe()
df_group_3.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_3,dayofweek,month,dayofyear,Hour,Minute
count,86996.0,86996.0,86996.0,86996.0,86996.0,90767.0,90767.0,90767.0,90767.0,90767.0
mean,2.182119,0.137726,237.769858,9.210244,17.219355,3.108454,3.245794,83.234667,13.261626,29.494078
std,1.157982,0.123673,3.554284,4.994707,1.344115,2.051392,1.678125,51.104524,5.973373,17.355877
min,0.16,0.0,223.49,0.8,1.0,0.0,1.0,1.0,0.0,0.0
25%,1.38,0.052,235.3,5.8,17.0,1.0,2.0,37.0,9.0,14.0
50%,1.652,0.108,238.13,7.0,17.0,3.0,3.0,81.0,13.0,30.0
75%,2.696,0.208,240.33,11.4,18.0,5.0,5.0,126.0,19.0,45.0
max,10.67,1.148,249.76,46.4,20.0,6.0,6.0,181.0,23.0,59.0


# Data Processing

# ARIMA


# SARIMA

# LSTM

In [None]:
def move_sliding_window(data, window_size, inputs_cols_indices, label_col_index):

    inputs = np.zeros((len(data) - window_size, window_size, len(inputs_cols_indices)))
    labels = np.zeros(len(data) - window_size)

    for i in range(window_size, len(data)):
        inputs[i - window_size] = data[i - window_size : i, inputs_cols_indices]
        labels[i - window_size] = data[i, label_col_index]
    inputs = inputs.reshape(-1, window_size, len(inputs_cols_indices))
    labels = labels.reshape(-1, 1)
    print(inputs.shape, labels.shape)

    return inputs, labels

In [None]:
from tqdm.notebook import tqdm

label_col_index = 0
inputs_cols_indices = range(7)

window_size = 90

label_scalers = {}

train_x = []
test_x = {}
test_y = {}

num_files_for_dataset = 3  # Using 3 dataframes

for df_group in tqdm([df_group_1, df_group_2, df_group_3]):
  # Convert 'Date' and 'Time' to datetime and timedelta format respectively
  df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')
  df['Time'] = pd.to_timedelta(df['Time'])

  # Combine 'Date' and 'Time' into 'Datetime'
  df['Datetime'] = df['Date'] + df['Time']

  # Extract relevant features from 'Datetime'
  df['hour'] = df['Datetime'].dt.hour
  df['dayofweek'] = df['Datetime'].dt.dayofweek
  df['month'] = df['Datetime'].dt.month

  # Scaling the input data
  sc = MinMaxScaler()
  data = sc.fit_transform(df.values)
  print(data)


# GRU