In [1]:
!pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo # Used to import the dataset
import pandas as pd
import numpy as np

from Models import *

# fetch dataset
dataset = fetch_ucirepo(id=235)

# data (as pandas dataframes)
X_data = dataset.data.features
y_data = dataset.data.targets

# metadata
print(dataset.metadata)

# variable information
print(dataset.variables)

  df = pd.read_csv(data_url)


{'uci_id': 235, 'name': 'Individual Household Electric Power Consumption', 'repository_url': 'https://archive.ics.uci.edu/dataset/235/individual+household+electric+power+consumption', 'data_url': 'https://archive.ics.uci.edu/static/public/235/data.csv', 'abstract': 'Measurements of electric power consumption in one household with a one-minute sampling rate over a period of almost 4 years. Different electrical quantities and some sub-metering values are available.', 'area': 'Physics and Chemistry', 'tasks': ['Regression', 'Clustering'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 2075259, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': None, 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2006, 'last_updated': 'Fri Mar 08 2024', 'dataset_doi': '10.24432/C58K54', 'creators': ['Georges Hebrail', 'Alice Berard'], 'intro_paper': None, 'additional_info': {'summary': 'This archiv

In [3]:
X = X_data.copy(deep=True)
categories = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
X[categories] = X[categories].apply(pd.to_numeric, errors='coerce')
for cat in categories:
    X[cat] = X[cat].interpolate()

np.sum(np.isnan(X['Global_active_power']))

0

In [4]:
# Identify where the values are NaN
is_nan = np.isnan(X['Global_active_power'])

# Find where the NaN sequences start and end
nan_runs = np.diff(np.concatenate(([0], is_nan.astype(int), [0])))
start_indices = np.where(nan_runs == 1)[0]
end_indices = np.where(nan_runs == -1)[0]

# Calculate the lengths of each run
nan_lengths = end_indices - start_indices

# Filter runs with more than 5 NaNs
long_nan_runs = [(start, length) for start, length in zip(start_indices, nan_lengths) if length > 5]

print(long_nan_runs)

X['Date'][190497+3723], X['Date'][1309386]
X['Date'][0], X['Date'][len(X)-1]

[]


('16/12/2006', '26/11/2010')

In [5]:
PF = np.cos(np.arctan(X['Global_reactive_power'] / X['Global_active_power']))
X.insert(4, 'Power_factor', PF, True)
X.describe(include='all')

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Power_factor,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259,2075259,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0
unique,1442,1440,,,,,,,,
top,6/12/2008,17:24:00,,,,,,,,
freq,1440,1442,,,,,,,,
mean,,,1.09028,0.1236492,0.9638261,240.8328,4.621481,1.109485,1.289229,6.442386
std,,,1.052628,0.1124194,0.05786603,3.237763,4.424361,6.115843,5.786613,8.41586
min,,,0.076,0.0,0.5558553,223.2,0.2,0.0,0.0,0.0
25%,,,0.31,0.048,0.9519304,238.99,1.4,0.0,0.0,0.0
50%,,,0.614,0.1,0.9934134,241.0,2.751585,0.0,0.0,1.0
75%,,,1.528,0.194,0.9997095,242.87,6.4,0.0,1.0,17.0


In [6]:
X['Power_factor'].quantile(q=0.01)

0.7568230077691596

In [7]:
import matplotlib.pyplot as plt
import datetime
from matplotlib.ticker import MaxNLocator

# Adjust Date_Time column for sensible plots
DateTime = X['Date'].str.cat(X['Time'].values.astype(str), sep=' ')
X.insert(0, 'Date_Time', DateTime, True) #includes Date_time variable
X = X.drop('Date', axis=1) #removes date column
X = X.drop('Time', axis=1) #removes time column
X.describe(include='all')

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Power_factor,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0
unique,2075259,,,,,,,,
top,16/12/2006 17:24:00,,,,,,,,
freq,1,,,,,,,,
mean,,1.09028,0.1236492,0.9638261,240.8328,4.621481,1.109485,1.289229,6.442386
std,,1.052628,0.1124194,0.05786603,3.237763,4.424361,6.115843,5.786613,8.41586
min,,0.076,0.0,0.5558553,223.2,0.2,0.0,0.0,0.0
25%,,0.31,0.048,0.9519304,238.99,1.4,0.0,0.0,0.0
50%,,0.614,0.1,0.9934134,241.0,2.751585,0.0,0.0,1.0
75%,,1.528,0.194,0.9997095,242.87,6.4,0.0,1.0,17.0


In [8]:
GAE = X['Global_active_power']*(1000/60) - X['Sub_metering_1'] - X['Sub_metering_2'] - X['Sub_metering_3']
X.insert(1, 'GAE', GAE, True)

In [9]:
X.describe(include='all')

Unnamed: 0,Date_Time,GAE,Global_active_power,Global_reactive_power,Power_factor,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0
unique,2075259,,,,,,,,,
top,16/12/2006 17:24:00,,,,,,,,,
freq,1,,,,,,,,,
mean,,9.330226,1.09028,0.1236492,0.9638261,240.8328,4.621481,1.109485,1.289229,6.442386
std,,9.546528,1.052628,0.1124194,0.05786603,3.237763,4.424361,6.115843,5.786613,8.41586
min,,-2.4,0.076,0.0,0.5558553,223.2,0.2,0.0,0.0,0.0
25%,,3.833333,0.31,0.048,0.9519304,238.99,1.4,0.0,0.0,0.0
50%,,5.533333,0.614,0.1,0.9934134,241.0,2.751585,0.0,0.0,1.0
75%,,10.46667,1.528,0.194,0.9997095,242.87,6.4,0.0,1.0,17.0


In [None]:
# Define categories for each figure
fig1_categories = ['Global_active_power', 'Global_reactive_power', 'Power_factor']
fig2_categories = ['GAE']
fig3_categories = ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
fig4_categories = ['Voltage', 'Global_intensity']

# Helper function to create and format each figure
def create_figure(categories, fig_title):
    fig, axes = plt.subplots(len(categories), figsize=(15, 6 * len(categories)))
    fig.suptitle(fig_title, fontsize=16)

    for i, category in enumerate(categories):
        ax = axes[i] if len(categories) > 1 else axes
        ax.plot(X['Date_Time'], X[category], label=category)
        ax.set_title(category)
        ax.set_xlabel('Time')
        ax.set_ylabel(category)
        ax.legend()

        # Set the number of x-axis ticks to 12 and rotate labels
        ax.xaxis.set_major_locator(MaxNLocator(13))
        ax.tick_params(axis='x', rotation=45)

        # Set the number of y-axis ticks to 5
        ax.yaxis.set_major_locator(MaxNLocator(5))

    # Adjust layout to prevent overlap
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

# Create each figure based on the planned categories
create_figure(fig1_categories, 'Figure 1: Global Active Power, Global Reactive Power, Power Factor')
create_figure(fig2_categories, 'Figure 2: Global Active Energy (GAE)')
create_figure(fig3_categories, 'Figure 3: Sub Meterings 1-3')
create_figure(fig4_categories, 'Figure 4: Voltage and Current (Global Intensity)')

In [None]:
# Create plots for each category
fig, axes = plt.subplots(8, figsize=(15, 20))
fig.suptitle('First 3 Days of Each Category vs Time [Reduced Dataset]', fontsize=16)

category = ['GAE', 'Global_active_power', 'Global_reactive_power','Power_factor', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']


# Plot each category
for i, category in enumerate(category):
    ax = axes[i]
    ax.plot(X['Date_Time'], X[category], label=category)
    ax.set_title(category)
    ax.set_xlabel('Time')
    ax.set_ylabel(category)
    ax.legend()

    # Limit the number of x-axis ticks to avoid clutter
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))  # Adjust the number as needed
    ax.yaxis.set_major_locator(plt.MaxNLocator(5))

# Adjust layout to prevent overlap
plt.tight_layout(rect=[0, 0, 1, 0.97])

# Display the plot
plt.show()

In [20]:
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dropout
from tensorflow.keras.metrics import MeanAbsoluteError

from sklearn.metrics import r2_score as sklearn_r2_score
from sklearn.preprocessing import StandardScaler

def r2_score(y_true, y_pred):
    # Cast y_true to float32 to ensure type consistency
    y_true = K.cast(y_true, dtype='float32')
    ss_res = K.sum(K.square(y_true - y_pred))
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - ss_res / (ss_tot + K.epsilon())

# Preprocessing function for Date_Time
def preprocess_datetime(data):
    """
    Converts 'Date_Time' strings to Unix timestamps.
    Adjust the format string to match your datetime format.
    """
    return np.array([
        datetime.strptime(dt, '%d/%m/%Y %H:%M:%S').timestamp() for dt in data
    ])

# Split the data
def train_test_split(data, categories: list, predictors:str, split=0.8):
    """
    Splits the given data into train and test sets.
    """
    split_int = int(len(data) * split)
    X_train, X_test = np.array(data[categories][:split_int]), np.array(data[categories][split_int:])
    y_train, y_test = np.array(data[predictors][:split_int]), np.array(data[predictors][split_int:])
    return X_train, X_test, y_train, y_test

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, ['Voltage', 'Global_intensity', 'Power_factor'],
                                                    ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'], 0.8)

# Scale the data
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train)
X_test_processed = scaler.transform(X_test)

# Generator for windowed data
def create_windowed_batches(X_data, y_data, window_size, batch_size, stride=1):
    """
    Generator to create batches of windowed data.
    """
    total_windows = (len(X_data) - window_size) // stride
    while True:
        for i in range(0, total_windows, batch_size):
            X_batch, y_batch = [], []
            for j in range(i, min(i + batch_size, total_windows)):
                start = j * stride
                X_batch.append(X_data[start:start + window_size])
                y_batch.append(y_data[start:start + window_size])
            yield np.array(X_batch), np.array(y_batch)

# Define parameters
window_size = 256
batch_size = 8192
stride = 1

# Create training and validation generators
train_gen = create_windowed_batches(X_train_processed, y_train, window_size, batch_size, stride)
val_gen = create_windowed_batches(X_test_processed, y_test, window_size, batch_size, stride)

# Calculate the number of steps per epoch
train_steps_per_epoch = (len(X_train_processed) - window_size) // stride // batch_size
val_steps_per_epoch = (len(X_test_processed) - window_size) // stride // batch_size

input_shape = (window_size, X_train_processed.shape[1])

#Define LSTM model
# model = Sequential([
#     LSTM(128, input_shape=input_shape, return_sequences=True),
#     Dropout(0.1),
#     Dense(32, activation='relu'),
#     Dense(3)
# ])

model = Sequential([
  LSTM(128, input_shape=input_shape, return_sequences=True),
  Dropout(0.1),
  GRU(128, return_sequences=True),
  Dropout(0.1),
  Dense(32, activation='relu'),
  Dense(3)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=[MeanAbsoluteError(), r2_score])

# Train the model
history = model.fit(
    train_gen,
    steps_per_epoch=train_steps_per_epoch,
    epochs=25,
    validation_data=val_gen,
    validation_steps=val_steps_per_epoch,
    verbose=1
)

# Evaluate the model on the test set
def create_windows(X_data, y_data, window_size, stride=1):
    """
    Create fixed-size windows for evaluation.
    """
    X, y = [], []
    for i in range(0, len(X_data) - window_size, stride):
        X.append(X_data[i:i + window_size])
        y.append(y_data[i:i + window_size])
    return np.array(X), np.array(y)

X_test_windowed, y_test_windowed = create_windows(X_test_processed, y_test, window_size, stride=1)
test_loss, test_mae, test_r2 = model.evaluate(X_test_windowed, y_test_windowed, verbose=1)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test R2: {test_r2}")

# Plot training & validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot MAE
plt.plot(history.history['mean_absolute_error'], label='Training MAE')
plt.plot(history.history['val_mean_absolute_error'], label='Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()
plt.show()

# Plot R2
plt.plot(history.history['r2'], label='Training MAE')
plt.plot(history.history['val_mean_absolute_error'], label='Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()
plt.show()

Epoch 1/25


  super().__init__(**kwargs)


[1m 41/202[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1:49[0m 679ms/step - loss: 63.0308 - mean_absolute_error: 3.2794 - r2_score: -0.0127

KeyboardInterrupt: 