In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
import time
import math
import warnings
warnings.filterwarnings("ignore")
import glob

In [2]:
def print_heads_tails(df, h=True,t=True):
    print(f'Data has shape: {df.shape}')
    if h:
        display(df.head(2))

    print('...')

    if t:
        display(df.tail(2))

In [3]:
def read_labels():
    labels = {}
    for house in range(1, 6):
        file_path = f'data/ukdale/house_{house}/labels.dat'
        labels[house] = {}
        with open(file_path) as f:
            for line in f:
                split_line = line.split(' ')
                channel = int(split_line[0])
                device_name = split_line[1].strip()
                prefix = 'ft_'

                if device_name == 'aggregate':
                    prefix = ''
                    device_name = 'aggregate_apparent'

                labels[house][channel] = prefix + device_name
        labels[house]['mains'] = 'mains'
    return labels
file_labels = read_labels()
# for i in range(1,3):
#     print('House {}: '.format(i), labels[i], '\n')
for house in range(2,3):
    print(f'House {house}: file_labels[i]')

House 2: file_labels[i]


In [4]:
def get_house_path(house):
    return f'data/ukdale/house_{house}/'
    
def get_chan_path(house, channel):
    if channel == 'mains':
        return get_house_path(house) + f'mains.dat'

    return get_house_path(house) + f'channel_{channel}.dat'

def get_num_apps(house):
    return len(glob.glob(get_house_path(house) + 'channel_*[0-9].dat'))

In [5]:
def read_file(house, channel):
    print(f'reading house {house}; channel {channel}');
    file = get_chan_path(house, channel)
    
    df = pd.read_table(file, sep = ' ', names = ['unix_time', file_labels[house][channel]], 
                                       dtype = {'unix_time': 'int64', file_labels[house][channel]:'float64'}) 

    return df

def read_mains_file(house):
    
    file = get_chan_path(house, 'mains')
    
    df = pd.read_table(file, sep = ' ', names = ['unix_time', 'mains_active', 'mains_apparent', 'mains_rms' ], 
                                       dtype = {'unix_time': 'float64', 'mains_active':'float64',
                                                'mains_apparent': 'float64', 'mains_rms': 'float64'}) 

    return df

In [6]:
def parse_data(df, sort_index = True, drop_duplicates = True):
    df['timestamp'] = df['unix_time'].astype("datetime64[s]")
    df.set_index(df['timestamp'].values, inplace=True)
    df.drop(['unix_time'], axis=1, inplace=True)

#     tz = 'US/Eastern'
#     # Convert the integer index column to timezone-aware datetime 
#     df.index = pd.to_datetime(df.index.values, unit='s', utc=True)
#     df = df.tz_convert(tz)

    if sort_index:
        df = df.sort_index() # if using REDD: raw data might not be sorted
        
    if drop_duplicates:
        dups_in_index = df.index.duplicated(keep='first')
        if dups_in_index.any():
#             print('Data has duplicates', house, channel, dups_in_index[dups_in_index == True])
            df = df[~dups_in_index]

    return df

In [7]:
def get_timeframe(df):
    start = df.index[0]
    end = df.index[-1]
    
    return start, end

In [8]:
def get_feature_columns(df):
    return list(filter(lambda x: x.startswith('ft_'), df.columns.tolist()))

In [9]:
def sub_sample_mains(df):
    print('Before sub sampling')
    print_heads_tails(df)
    df = df.resample('6S').max()
    print('After sub sampling')
    print_heads_tails(df)
    
    return df

In [10]:
def fill_na(df, dry_run = False):
    for label in df.columns:
        null_count = df[label].isnull().sum()
        zero_count = df[label].isin([0]).sum()
        print(f'[fill_na] checked NaN count for {label}; result is {null_count}')
        print(f'[fill_na] checked zero count for {label}; result is {zero_count}')
        if not dry_run and null_count > 0:
            df[label].interpolate(method='linear', inplace=True)
            df[label].fillna(0.0, inplace=True)
            print(f'[fill_na] post filling - NaN count for {label}; result is {null_count}')
            print(f'[fill_na] post filling - checked zero count for {label}; result is {zero_count}')

In [11]:
DATA_SIZE_LIMIT = 1500000

In [12]:
def read_mains(house):
    df = read_mains_file(house)
    df = parse_data(df)
    
    return df

In [13]:
mains_df = {}
for house in range(2,3):
    mains_df[house] = read_mains(house)
    fill_na(mains_df[house], dry_run=True)

[fill_na] checked NaN count for mains_active; result is 0
[fill_na] checked zero count for mains_active; result is 38716
[fill_na] checked NaN count for mains_apparent; result is 0
[fill_na] checked zero count for mains_apparent; result is 0
[fill_na] checked NaN count for mains_rms; result is 0
[fill_na] checked zero count for mains_rms; result is 0
[fill_na] checked NaN count for timestamp; result is 0
[fill_na] checked zero count for timestamp; result is 0


In [14]:
for house in range(2,3):
    mains_df[house] = sub_sample_mains(mains_df[house])
    fill_na(mains_df[house], dry_run=True)

Before sub sampling
Data has shape: (12043072, 4)


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp
2013-04-16 20:45:16,0.0,421.53,237.38,2013-04-16 20:45:16
2013-04-16 20:45:17,0.0,421.77,237.71,2013-04-16 20:45:17


...


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp
2013-10-10 05:15:59,105.75,146.46,243.53,2013-10-10 05:15:59
2013-10-10 05:16:00,105.12,145.91,243.0,2013-10-10 05:16:00


After sub sampling
Data has shape: (2539509, 4)


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp
2013-04-16 20:45:12,0.0,421.77,237.71,2013-04-16 20:45:17
2013-04-16 20:45:18,0.0,421.67,237.9,2013-04-16 20:45:23


...


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp
2013-10-10 05:15:54,106.98,148.29,243.62,2013-10-10 05:15:59
2013-10-10 05:16:00,105.12,145.91,243.0,2013-10-10 05:16:00


[fill_na] checked NaN count for mains_active; result is 512793
[fill_na] checked zero count for mains_active; result is 6520
[fill_na] checked NaN count for mains_apparent; result is 512793
[fill_na] checked zero count for mains_apparent; result is 0
[fill_na] checked NaN count for mains_rms; result is 512793
[fill_na] checked zero count for mains_rms; result is 0
[fill_na] checked NaN count for timestamp; result is 512793
[fill_na] checked zero count for timestamp; result is 0


### Interpolate missing values and re-add timestamp

pandas interpolate drops non numerical columns, so we need to re-add the timestamp column

In [15]:
for house in range(2,3):
    fill_na(mains_df[house])
    mains_df[house]['timestamp'] = mains_df[house].index
    print('')
    fill_na(mains_df[house], dry_run=True)

[fill_na] checked NaN count for mains_active; result is 512793
[fill_na] checked zero count for mains_active; result is 6520
[fill_na] post filling - NaN count for mains_active; result is 512793
[fill_na] post filling - checked zero count for mains_active; result is 6520
[fill_na] checked NaN count for mains_apparent; result is 512793
[fill_na] checked zero count for mains_apparent; result is 0
[fill_na] post filling - NaN count for mains_apparent; result is 512793
[fill_na] post filling - checked zero count for mains_apparent; result is 0
[fill_na] checked NaN count for mains_rms; result is 512793
[fill_na] checked zero count for mains_rms; result is 0
[fill_na] post filling - NaN count for mains_rms; result is 512793
[fill_na] post filling - checked zero count for mains_rms; result is 0
[fill_na] checked NaN count for timestamp; result is 512793
[fill_na] checked zero count for timestamp; result is 0
[fill_na] post filling - NaN count for timestamp; result is 512793
[fill_na] post fi

In [16]:
def read_merge_data(house):
#     df = read_file(house, 1)
#     df = parse_data(df)

    df = mains_df[house]
    print(f'read house {house}; mains; df.shape is {df.shape}')

    num_apps = get_num_apps(house)
    for i in range(1, num_apps + 1):
        data = read_file(house, i)
        if data.shape[0] >= DATA_SIZE_LIMIT:
            print(f'read house {house}; channel {i}; df.shape is {df.shape}; data.shape is {data.shape}')
            data = parse_data(data)

            start_x, end_x = get_timeframe(data)
            start_y, end_y = get_timeframe(df)

            start = start_x if start_x > start_y else start_y
            end = end_x if end_x < end_y else end_y

            data = data[(data.index >= start) &
                                   (data.index <= end)]
            df = df[(df.index >= start) &
                                   (df.index <= end)]

            df = pd.merge_asof(df, data, on = 'timestamp', tolerance=pd.Timedelta('6s'))
            df.set_index(df['timestamp'].values, inplace=True)
        else:
            print(f'skipping house {house}; channel {i}; df.shape is {df.shape}; data.shape is {data.shape}')
#         df = pd.merge(df, data, how = 'inner', on = 'unix_time')

    return df

df = {}
for i in range(2,3):
    df[i] = read_merge_data(i)

read house 2; mains; df.shape is (2539509, 4)
reading house 2; channel 1
read house 2; channel 1; df.shape is (2539509, 4); data.shape is (2780373, 2)
reading house 2; channel 2
read house 2; channel 2; df.shape is (2539507, 5); data.shape is (2804685, 2)
reading house 2; channel 3
read house 2; channel 3; df.shape is (2539505, 6); data.shape is (2805646, 2)
reading house 2; channel 4
read house 2; channel 4; df.shape is (2539505, 7); data.shape is (2801065, 2)
reading house 2; channel 5
read house 2; channel 5; df.shape is (2539505, 8); data.shape is (2806036, 2)
reading house 2; channel 6
read house 2; channel 6; df.shape is (2539505, 9); data.shape is (2795349, 2)
reading house 2; channel 7
read house 2; channel 7; df.shape is (2539501, 10); data.shape is (2094586, 2)
reading house 2; channel 8
read house 2; channel 8; df.shape is (2539171, 11); data.shape is (2094523, 2)
reading house 2; channel 9
read house 2; channel 9; df.shape is (2539171, 12); data.shape is (2080995, 2)
readin

In [17]:
for house in range(2,3):
    fill_na(df[house])

[fill_na] checked NaN count for mains_active; result is 0
[fill_na] checked zero count for mains_active; result is 0
[fill_na] checked NaN count for mains_apparent; result is 0
[fill_na] checked zero count for mains_apparent; result is 0
[fill_na] checked NaN count for mains_rms; result is 0
[fill_na] checked zero count for mains_rms; result is 0
[fill_na] checked NaN count for timestamp; result is 0
[fill_na] checked zero count for timestamp; result is 0
[fill_na] checked NaN count for aggregate_apparent; result is 329100
[fill_na] checked zero count for aggregate_apparent; result is 0
[fill_na] post filling - NaN count for aggregate_apparent; result is 329100
[fill_na] post filling - checked zero count for aggregate_apparent; result is 0
[fill_na] checked NaN count for ft_laptop; result is 327433
[fill_na] checked zero count for ft_laptop; result is 1045579
[fill_na] post filling - NaN count for ft_laptop; result is 327433
[fill_na] post filling - checked zero count for ft_laptop; re

In [18]:
for house in range(2,3):
    print_heads_tails(df[house])

Data has shape: (2049466, 23)


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp,aggregate_apparent,ft_laptop,ft_monitor,ft_speakers,ft_server,ft_router,...,ft_running_machine,ft_laptop2,ft_washing_machine,ft_dish_washer,ft_fridge,ft_microwave,ft_toaster,ft_playstation,ft_modem,ft_cooker
2013-05-20 21:28:42,239.6,289.24,243.82,2013-05-20 21:28:42,252.0,18.0,61.0,11.0,13.0,7.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-05-20 21:28:48,234.51,281.16,243.97,2013-05-20 21:28:48,254.0,18.0,61.0,11.0,13.0,6.0,...,1.0,0.0,3.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0


...


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp,aggregate_apparent,ft_laptop,ft_monitor,ft_speakers,ft_server,ft_router,...,ft_running_machine,ft_laptop2,ft_washing_machine,ft_dish_washer,ft_fridge,ft_microwave,ft_toaster,ft_playstation,ft_modem,ft_cooker
2013-10-10 05:15:06,106.01,146.1,243.7,2013-10-10 05:15:06,131.0,0.0,0.0,3.0,14.0,6.0,...,1.0,0.0,3.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0
2013-10-10 05:15:12,106.73,147.67,243.68,2013-10-10 05:15:12,130.0,0.0,0.0,3.0,15.0,6.0,...,1.0,0.0,4.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0


### Convert apparent to real

In [19]:
power_factors = {}
apparent_labels = {
    1: ['aggregate_apparent', 'ft_boiler', 'ft_solar_thermal_pump', 'ft_kitchen_lights', 'ft_lighting_circuit'],
    2: ['aggregate_apparent']
}
for house in range(2,3):
    mains_active_sample = df[house]['mains_active'].iloc[0]
    mains_apparent_sample = df[house]['mains_apparent'].iloc[0]
    power_factors[house] = mains_active_sample / mains_apparent_sample
    for apparent_label in apparent_labels[house]:
        if apparent_label in df[house].columns:
            print(f'Converting apparent to real for {apparent_label}')
            if apparent_label == 'aggregate_apparent':
                df[house]['aggregate_active'] = df[house][apparent_label] * power_factors[house]
            else:
                df[house][apparent_label] = df[house][apparent_label] * power_factors[house]
        else:
            print(f'Cannot convert apparent to real for {apparent_label}, since it is not present in df')

Converting apparent to real for aggregate_apparent


In [20]:
for house in range(2,3):
    print_heads_tails(df[house])

Data has shape: (2049466, 24)


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp,aggregate_apparent,ft_laptop,ft_monitor,ft_speakers,ft_server,ft_router,...,ft_laptop2,ft_washing_machine,ft_dish_washer,ft_fridge,ft_microwave,ft_toaster,ft_playstation,ft_modem,ft_cooker,aggregate_active
2013-05-20 21:28:42,239.6,289.24,243.82,2013-05-20 21:28:42,252.0,18.0,61.0,11.0,13.0,7.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208.75121
2013-05-20 21:28:48,234.51,281.16,243.97,2013-05-20 21:28:48,254.0,18.0,61.0,11.0,13.0,6.0,...,0.0,3.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0,210.407966


...


Unnamed: 0,mains_active,mains_apparent,mains_rms,timestamp,aggregate_apparent,ft_laptop,ft_monitor,ft_speakers,ft_server,ft_router,...,ft_laptop2,ft_washing_machine,ft_dish_washer,ft_fridge,ft_microwave,ft_toaster,ft_playstation,ft_modem,ft_cooker,aggregate_active
2013-10-10 05:15:06,106.01,146.1,243.7,2013-10-10 05:15:06,131.0,0.0,0.0,3.0,14.0,6.0,...,0.0,3.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0,108.517494
2013-10-10 05:15:12,106.73,147.67,243.68,2013-10-10 05:15:12,130.0,0.0,0.0,3.0,15.0,6.0,...,0.0,4.0,1.0,11.0,0.0,0.0,1.0,9.0,0.0,107.689116


### Drop Low Power Devices

In [21]:
averages = {}
def get_devices_mean_power(df):
    for house in range(2,3):
        averages[house] = {}
        num_apps = get_num_apps(i)
        for label in get_feature_columns(df[house]):
            mean = df[house][label].mean()
            averages[house][label] = mean

get_devices_mean_power(df)

In [22]:
for house in range(2,3):
    sorted_averages = sorted(averages[house].items(), key=lambda x: x[1])
    for chan, avg in reversed(sorted_averages):
        print(chan, (25 - len(chan)) * " ",  avg)

ft_fridge                  46.967158762331266
ft_dish_washer             35.63921382447916
ft_kettle                  32.55037653710772
ft_monitor                 24.961451665946154
ft_server                  13.82414760723037
ft_laptop                  10.693298888588542
ft_washing_machine         10.22656413914649
ft_modem                   8.989940062435776
ft_router                  6.17826570433469
ft_speakers                6.093178174217089
ft_microwave               5.541322471316919
ft_laptop2                 3.4968662568688624
ft_rice_cooker             2.92283331365341
ft_running_machine         2.727125748853604
ft_server_hdd              1.2615361757648087
ft_playstation             0.9531202274153365
ft_toaster                 0.6479085283678774
ft_cooker                  0.17668968404452673


In [23]:
def drop_low_power_devices(df, labels, averages):
    print('drop_low_power_devices :: initial shape', df.shape)
    for label in labels:
        mean = averages[label]

        if mean < 1.0:
            print(f'dropping low power feature - {label} with mean - {mean} for house - {house} at chan - {chan}')
            df.drop([label], axis=1, inplace=True)

    print('drop_low_power_devices :: final shape', df.shape)

In [24]:
# def label_pattern_checkpoins(df, label):
#     window_start = None
#     current = 0
#     new_values = []
#     for value in df[label].values:
#         if window_start = None
#             window_start = 
        
        

### Re-add unix time

It's easier to parse the unix_time than datetime from csv, so we want to re-add this before dumping to csv

In [25]:
def re_add_unix_time_feature(df, inplace=True):
    print('re-adding unix timestamp')    
    df['unix_time'] = df.timestamp.map(pd.Timestamp.timestamp)
    df.set_index(df['unix_time'].values, inplace=True)
    df.drop(['timestamp'], axis=1, inplace=True)

### Dump Data into new CSVs

In [26]:
import pathlib

newChunkDataDir = 'data/ukdale-parsed-chunks';
pathlib.Path(newChunkDataDir).mkdir(parents=True, exist_ok=True) 

for house in range(2,3):
    houseDir = f"{newChunkDataDir}/house_{house}"
    pathlib.Path(houseDir).mkdir(parents=True, exist_ok=True) 

In [27]:
def df_chunk_to_csv(house, chunk, df):
    file_name = f"{newChunkDataDir}/house_{house}/chunk_{chunk}.dat"
    df.to_csv(file_name, sep='\t', header=True, index=False)

In [28]:
import math

def save_by_chunks(df):
    file_row_size = 4500000
    for house in range(2,3):
        df_row_size = df[house].shape[0]
        no_of_chunks = math.ceil(df_row_size / file_row_size)
        print(f'Total row size - {df_row_size}; Number of chunks - {no_of_chunks}')
        drop_low_power_devices(df[house], get_feature_columns(df[house]), averages[house])
        re_add_unix_time_feature(df[house], inplace=False)
        for n in range(no_of_chunks):
            start_index = n * file_row_size
            end_index = start_index + file_row_size
            end_index = end_index if end_index <= df_row_size else df_row_size

            split_df = df[house].iloc[start_index:end_index]
            print(f'saving chunk - {n+1}, with row range - {start_index} to {end_index} and shape - {split_df.shape}')
            
            df_chunk_to_csv(house, n+1, split_df)

In [29]:
save_by_chunks(df)

Total row size - 2049466; Number of chunks - 1
drop_low_power_devices :: initial shape (2049466, 24)
dropping low power feature - ft_toaster with mean - 0.6479085283678774 for house - 2 at chan - ft_cooker
dropping low power feature - ft_playstation with mean - 0.9531202274153365 for house - 2 at chan - ft_cooker
dropping low power feature - ft_cooker with mean - 0.17668968404452673 for house - 2 at chan - ft_cooker
drop_low_power_devices :: final shape (2049466, 21)
re-adding unix timestamp
saving chunk - 1, with row range - 0 to 2049466 and shape - (2049466, 21)
