In [1]:
%load_ext autoreload
%autoreload 1
%aimport cooking_session
%run cooking_session.py

In [30]:
import pandas as pd
import numpy as np

In [32]:
df_raw = pd.read_csv('dataframe_raw_jan14.csv', sep=',')
min_size_of_spikes=0.0
df_spikes = df_raw.loc[(df_raw.energy > df_raw.energy.shift(-1) + min_size_of_spikes) &
                   (df_raw.meter_number == df_raw.meter_number.shift(-1))]
print('The data has this many spikes: ' + str(df_spikes.timestamp.count()))

The data has this many spikes: 234


In [26]:
# Source file
time_resolution = 5

df_raw = pd.read_csv('dataframe_raw_jan14.csv', sep=',' , index_col=False)

df_raw = resolve_spikes(df_raw)
params = {'time_resolution': 5,
         't_between': 15,
         'error_margin': 0.04,
         'min_size_of_spikes': 1,
         'min_cooking_event': 0.05,
         'power_mean_min': 0.05,
         'event_min_current': 0,
         'time_resolution': 5,
         'min_active_load': 0.15,
         'power_capacity': 1}

df_processed = extract_cooking_events(df_raw, **params)

df_epc = df_processed.copy()
df_epc.reset_index(inplace=True)

# Make an intermediate Cooking Event Count
df_epc['event_count'] = 0
df_epc.loc[(df_epc.cooking_event.diff()
            != 0), 'event_count'] += 1
df_epc.event_count = df_epc['event_count'].cumsum()
df_epc.loc[
    (df_epc.cooking_event.isnull()),
    'event_count'] = np.nan

# Check start of events
start_of_event = df_epc.copy()
start_of_event = start_of_event.groupby(
    ['meter_number', 'event_count']).head(1)
start_of_event.loc[
    ((start_of_event['energy'] -
      error_margin <= start_of_event['energy'].shift()) & (
        start_of_event.event_count.isnull() == False) & (
        start_of_event.meter_number == start_of_event.meter_number.shift())),
    'timestamp_issue'] = True

# Make an indication of timestamp issue at start of cooking event.
df_epc['timestamp_issue'] = df_epc.event_count.map(
    start_of_event.set_index('event_count')['timestamp_issue'].to_dict())

# Check end of events
end_of_event = df_epc.copy()
end_of_event = end_of_event.groupby(
    ['meter_number', 'event_count']).tail(1)
end_of_event.loc[
    ((end_of_event['energy'] -
      error_margin <= end_of_event['energy'].shift()) & (
        end_of_event.event_count.isnull() == False) & (
        end_of_event.meter_number == end_of_event.meter_number.shift())),
    'timestamp_issue'] = True

# Make an indication of timestamp issue at end of cooking event.
df_epc['timestamp_issue'] = df_epc.event_count.map(
    end_of_event.set_index('event_count')['timestamp_issue'].to_dict())

# Discard recordings that are part of duplicated cooking events,
# only leaving the cooking event's first occurance.
df_timestamp_issue = df_epc.loc[(df_epc['timestamp_issue'] == True)]
#df_epc.drop(df_epc[(df_epc['timestamp_issue'] == 1)].index, inplace=True)

# Update the cooking event count
df_epc['cooking_event'] = 0
df_epc.loc[((df_epc.event_count.diff() != 0) & (
    df_epc.event_count.isnull() == False)), 'cooking_event'] += 1
df_epc.cooking_event = df_epc['cooking_event'].cumsum()

# Set timestamp in index to facilitate plotting with timeseries on the
# x-axis.
df_epc.set_index('timestamp', inplace=True)
#df_timestamp_issue = df_epc.loc[(df_epc['timestamp_issue'] == True)]
# Drop rows that do no longer have any function
#df_epc.drop(['event_count', 'timestamp_issue'], axis=1, inplace=True)
df_timestamp_issue.reset_index(inplace=True)
print('The data has this many timestamp issues: ' + str(df_timestamp_issue.timestamp.count()))

The data has this many timestamp issues: 358


In [27]:
# Source file
df_raw = pd.read_csv('dataframe_raw_jan14.csv', sep=',' , index_col=False)

df_epc = extract_cooking_events(df_raw, time_resolution=5)
df_epc.reset_index(inplace=True)
df_epc.loc[
            (
                (df_epc.energy.shift(-1) - df_epc.energy > 0.0)
                & (df_epc.meter_number == df_epc.meter_number.shift(-1))
                  & (df_epc.cooking_event != df_epc.cooking_event.shift(-1))
            ), 'energy_gap_to_next'] = df_epc.energy.shift(-1) - df_epc.energy
df_epc.set_index('timestamp', inplace=True)
print('The data has this much missing energy consumption [kWh] before adding start and end of cooking events: ' + str(df_epc.energy_gap_to_next.sum()))

The data has this much missing energy consumption [kWh] before adding start and end of cooking events: 1275.46


In [28]:
# Source file
df_raw = pd.read_csv('dataframe_raw_jan14.csv', sep=',' , index_col=False)

# Outputs
df_epc = preprocess_epc_data(df_raw, time_resolution=5)
df_only_events = create_only_event_df(df_epc)
df_epc = clean_data_set(df_epc)

# Create a column of the energy gaps that still exist after cleaning the data
df_epc.reset_index(inplace=True)
df_epc.loc[
            (
                (df_epc.energy.shift(-1) - df_epc.energy > 0.0)
                & (df_epc.meter_number == df_epc.meter_number.shift(-1))
                  & (df_epc.event_count != df_epc.event_count.shift(-1))
            ), 'energy_gap_to_next'] = df_epc.energy.shift(-1) - df_epc.energy
df_epc.set_index('timestamp', inplace=True)
print('The data has this much missing energy consumption [kWh] after adding start and end of cooking events: ' + str(df_epc.energy_gap_to_next.sum()))

The data has this much missing energy consumption [kWh] after adding start and end of cooking events: 923.0966666666665


#### Energy gaps to next = kWh between two measurements

In [38]:
# Create a column of the energy gaps that still exist after cleaning the data
df_epc.reset_index(inplace=True)

df_gaps = df_epc.copy()
df_gaps['energy_lost'] = df_gaps['energy_gap_to_next']
df_gaps.loc[(df_gaps['timestamp']-df_gaps['timestamp'].shift() > pd.to_timedelta(1, unit='m'))&\
        (df_gaps['meter_number'] == df_gaps['meter_number'].shift()), 'time_gap2'] = df_gaps['timestamp']-df_gaps['timestamp'].shift()
df_gaps['hour_gap'] = df_gaps['time_gap2']/np.timedelta64(1, 'h')
df_gaps = df_gaps.groupby(pd.cut(df_gaps['energy_gap_to_next'],
                    [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1000])).agg({'energy_lost':'sum', 'timezone_region':'count', 'hour_gap' : 'sum', 'meter_number' :'nunique'})
df_epc.set_index('timestamp', inplace=True)

df_gaps.rename({'timezone_region':'count'},axis=1,inplace=True)
df_gaps

Unnamed: 0_level_0,energy_lost,count,hour_gap,meter_number
energy_gap_to_next,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.1, 0.2]",26.166667,198,3863.666667,54
"(0.2, 0.3]",15.083333,62,629.0,31
"(0.3, 0.4]",12.493333,36,102.583333,18
"(0.4, 0.5]",10.58,24,40.333333,19
"(0.5, 0.6]",7.8,14,34.0,12
"(0.6, 0.7]",3.81,6,5.833333,6
"(0.7, 0.8]",2.906667,4,12.666667,3
"(0.8, 0.9]",1.7,2,0.166667,2
"(0.9, 1.0]",3.75,4,0.333333,2
"(1.0, 1000.0]",448.036667,65,605.5,32


#### Largest energy gaps

In [32]:
df_epc.reset_index(inplace=True)
df_epc.set_index('meter_number', inplace=True)
print(df_epc.energy_gap_to_next.nlargest(60))
df_epc.reset_index(inplace=True)
df_epc.set_index('timestamp', inplace=True)

meter_number
546296    31.600000
546296    31.520000
546354    28.460000
546347    27.580000
546307    25.676667
546321    20.150000
546297    19.326667
546356    18.470000
546338    14.060000
546300    13.580000
546359    13.530000
546296    11.236667
546376    10.686667
546353    10.590000
546344    10.560000
546344     9.580000
546296     8.090000
546334     7.840000
546321     7.610000
546374     7.370000
546341     7.120000
546353     6.736667
546335     5.060000
546375     5.050000
546353     4.950000
546307     4.860000
546375     4.686667
546321     4.650000
546321     4.506667
546283     4.500000
546366     4.290000
546377     3.996667
546322     3.740000
546332     3.730000
546290     3.020000
546376     3.016667
546377     3.006667
546299     2.740000
546307     2.546667
546323     2.186667
546347     2.150000
546312     2.130000
546375     2.060000
546375     1.790000
546375     1.690000
546333     1.676667
546375     1.520000
546375     1.500000
546307     1.476667
546291 