Skip to content

Commit

Permalink
Implemented proportion_of_energy_submetered(). #31. Also:
Browse files Browse the repository at this point in the history
* stub for `average_energy()`
* stub for `average_energy_per_appliance()`
* use consts for DEFAULT_MAX_DROPOUT_RATE and
  DEFAULT_ON_POWER_THRESHOLD
* changed names to get_sample_period and get_dropout_rate; and both
  of these functions can now take either a DataFrame or Series or
  DatetimeIndex
* added `periods_with_sufficient_samples()`
* added `_get_index(data)`
  • Loading branch information
JackKelly committed Dec 20, 2013
1 parent 62f91a3 commit b349746
Show file tree
Hide file tree
Showing 2 changed files with 187 additions and 21 deletions.
107 changes: 107 additions & 0 deletions nilmtk/stats/electricity/building.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Statistics for applying to an entire building"""

from __future__ import print_function, division
from single import DEFAULT_MAX_DROPOUT_RATE, usage_per_period
import numpy as np



def proportion_of_energy_submetered(electricity,
                                    max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE,
                                    require_matched_measurements=True):
    """Reports the proportion of energy in a building that is submetered.

    Parameters
    ----------
    electricity : nilmtk.sensors.electricity.Electricity
    max_dropout_rate : float [0,1], optional
        Days whose dropout rate exceeds this are ignored.
    require_matched_measurements : boolean, optional, default=True
        If True then raise an exception if there is not at least one shared
        Measurement (e.g. ('power', 'active')) across all channels.
        If False then continue even if measurements do not match.

    Returns
    -------
    float
        0 = no energy submetered
        1 = all energy submetered
        >1 = more energy submetered than is recorded on the mains channels!
        NaN if there are no channels or no days good across all channels.
    """

    # TODO: Handle circuits.
    # TODO: Check if all channels share at least one Measurement
    #       (e.g. ('power', 'active')) and handle
    #       `require_matched_measurements`
    # TODO: handle dataframes with more than one column (don't use df.icol(0))

    # For each channel, find the set of 'good days' where
    # dropout_rate < max_dropout_rate.  Populated as a side effect of
    # get_kwh_per_day_per_chan() below.
    good_days_list = []

    def get_kwh_per_day_per_chan(dictionary):
        """Helper function. Returns a list of pd.Series of kWh per day."""
        chan_kwh_per_day = []
        # .items() (not the Python-2-only .iteritems()) so this runs on
        # both Python 2 and 3.
        for label, df in dictionary.items():
            kwh_per_day = usage_per_period(df.icol(0), freq='D',
                                           max_dropout_rate=max_dropout_rate)['kwh']
            # Days with too many dropouts come back as NaN; drop them so
            # they never enter the per-channel 'good days' set.
            kwh_per_day = kwh_per_day.dropna()
            chan_kwh_per_day.append(kwh_per_day)
            good_days_list.append(set(kwh_per_day.index))
        return chan_kwh_per_day

    mains_kwh_per_day = get_kwh_per_day_per_chan(electricity.mains)
    appliances_kwh_per_day = get_kwh_per_day_per_chan(electricity.appliances)

    if not good_days_list:
        # No channels at all: proportion is undefined.
        return np.nan

    # Find the days which are good across *every* channel.
    good_days_set = set.intersection(*good_days_list)

    # For each day good across all channels, compare submetered kWh
    # against mains kWh.
    proportion_per_day = []
    for good_day in good_days_set:
        mains_kwh = sum(kwh_per_day[good_day]
                        for kwh_per_day in mains_kwh_per_day)
        appliances_kwh = sum(kwh_per_day[good_day]
                             for kwh_per_day in appliances_kwh_per_day)
        proportion_per_day.append(appliances_kwh / mains_kwh)

    if not proportion_per_day:
        # No day is good across all channels; avoid np.mean([]) warning.
        return np.nan

    return np.mean(proportion_per_day)


def average_energy(electricity,
                   max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE):
    """Compute the average energy used by a whole building, in kWh per day.

    Parameters
    ----------
    electricity : nilmtk.sensors.electricity.Electricity
    max_dropout_rate : float [0,1], optional
        Presumably days whose dropout rate exceeds this will be ignored,
        matching proportion_of_energy_submetered() -- TODO confirm once
        implemented.

    Returns
    -------
    float
        Average energy usage for this building in kWh per day.
    """
    # Stub: not yet implemented.
    raise NotImplementedError


def average_energy_per_appliance(electricity,
                                 max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE):
    """Reports the average energy consumed by each appliance.

    For each appliance, we ignore any days which have a dropout rate
    above `max_dropout_rate`.

    Parameters
    ----------
    electricity : nilmtk.sensors.electricity.Electricity
    max_dropout_rate : float [0,1], optional
        Days whose dropout rate exceeds this are ignored.

    Returns
    -------
    av_energy : pd.Series
        Each element of the index is an ApplianceName
        Values are average energy in kWh per day
    """
    # Stub: not yet implemented.
    raise NotImplementedError

101 changes: 80 additions & 21 deletions nilmtk/stats/electricity/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from matplotlib.dates import SEC_PER_HOUR
import copy

def sample_period(data):
DEFAULT_MAX_DROPOUT_RATE = 0.4 # [0,1]
DEFAULT_ON_POWER_THRESHOLD = 5 # watts


def get_sample_period(data):
"""Estimate the sample period in seconds.
Find the sample period by finding the stats.mode of the
Expand All @@ -25,25 +29,23 @@ def sample_period(data):
period : float
Sample period in seconds.
"""
if isinstance(data, (pd.DataFrame, pd.Series)):
index = data.index
elif isinstance(data, pd.DatetimeIndex):
index = data
else:
raise TypeError('wrote type for `data`.')

index = _get_index(data)
fwd_diff = np.diff(index.values[:100]).astype(np.float)
mode_fwd_diff = stats.mode(fwd_diff)[0][0]
period = mode_fwd_diff / 1E9
return period


def dropout_rate(df):
def get_dropout_rate(data, sample_period=None):
"""The proportion of samples that have been lost.
Parameters
----------
df : pandas.DataFrame
data : pandas.DataFrame or Series or DatetimeIndex
sample_period : int or float, optional
Sample period in seconds. If not provided then will
calculate it.
Returns
-------
Expand All @@ -52,12 +54,17 @@ def dropout_rate(df):
1 means that all samples have been lost and
0 means that no samples have been lost.
"""
duration = df.index[-1] - df.index[0]
n_expected_samples = duration.total_seconds() / sample_period(df)
return 1 - (df.index.size / n_expected_samples)
if sample_period is None:
sample_period = get_sample_period(data)

index = _get_index(data)
duration = index[-1] - index[0]
n_expected_samples = duration.total_seconds() / sample_period
return 1 - (index.size / n_expected_samples)


def hours_on(series, on_power_threshold=5, max_sample_period=None):
def hours_on(series, on_power_threshold=DEFAULT_ON_POWER_THRESHOLD,
max_sample_period=None):
"""Returns a float representing the number of hours this channel
has been above threshold.
Expand Down Expand Up @@ -148,8 +155,10 @@ def energy(series, max_sample_period=None, unit='kwh'):
return _energy


def usage_per_period(series, freq, tz_convert=None, on_power_threshold=5,
max_dropout_rate=0.4, verbose=False,
def usage_per_period(series, freq,
on_power_threshold=DEFAULT_ON_POWER_THRESHOLD,
max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE,
verbose=False,
energy_unit='kwh', max_sample_period=None):
"""Calculate the usage (hours on and kwh) per time period.
Expand Down Expand Up @@ -295,7 +304,7 @@ def usage_per_period(series, freq, tz_convert=None, on_power_threshold=5,
energy_series = pd.Series(index=period_range, dtype=np.float,
name=name+' '+energy_unit)

MAX_SAMPLES_PER_PERIOD = _secs_per_period_alias(freq) / sample_period(series)
MAX_SAMPLES_PER_PERIOD = _secs_per_period_alias(freq) / get_sample_period(series)
MIN_SAMPLES_PER_PERIOD = (MAX_SAMPLES_PER_PERIOD *
(1-max_dropout_rate))

Expand Down Expand Up @@ -332,8 +341,8 @@ def usage_per_period(series, freq, tz_convert=None, on_power_threshold=5,
energy_unit: energy_series})


def activity_distribution(series, on_power_threshold=5, bin_size='T',
timespan='D'):
def activity_distribution(series, on_power_threshold=DEFAULT_ON_POWER_THRESHOLD,
bin_size='T', timespan='D'):
"""Returns a distribution describing when this appliance was turned
on over repeating timespans. For example, if you want to see
which times of day this appliance was used, on average, then use
Expand Down Expand Up @@ -397,7 +406,8 @@ def activity_distribution(series, on_power_threshold=5, bin_size='T',
return distribution


def on(series, max_sample_period=None, on_power_threshold=5):
def on(series, max_sample_period=None,
on_power_threshold=DEFAULT_ON_POWER_THRESHOLD):
"""Returns pd.Series with Boolean values indicating whether the
appliance is on (True) or off (False). Adds an 'off' entry if data
is lost for more than self.max_sample_period.
Expand Down Expand Up @@ -540,6 +550,33 @@ def durations(on_series, on_or_off, ignore_n_off_samples=None,
return durations


def periods_with_sufficient_samples(datetime_index, freq,
                                    max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE,
                                    use_local_time=True):
    """Find periods where the dropout rate is less than max_dropout_rate.

    Returns
    -------
    set of Periods
    """
    acceptable_periods = set()
    all_periods, period_boundaries = _indicies_of_periods(
        datetime_index, freq=freq, use_local_time=use_local_time)
    # Compute the sample period once; get_dropout_rate() would otherwise
    # re-estimate it for every period.
    secs_per_sample = get_sample_period(datetime_index)
    for period in all_periods:
        # Periods with no data have no entry in period_boundaries.
        if period not in period_boundaries:
            continue
        start_i, end_i = period_boundaries[period]
        rate = get_dropout_rate(datetime_index[start_i:end_i],
                                secs_per_sample)
        if rate < max_dropout_rate:
            acceptable_periods.add(period)
    return acceptable_periods



#------------------------ HELPER FUNCTIONS -------------------------

def _secs_per_period_alias(alias):
Expand Down Expand Up @@ -579,6 +616,7 @@ def _indicies_of_periods(datetime_index, freq, use_local_time=True):
Each key is a pd.tseries.period.Period
Each value is a tuple of ints:
(<start index into `datetime_index` for period>, <end index>)
Periods for which no data exists will not have a key.
Examples
--------
Expand Down Expand Up @@ -626,7 +664,7 @@ def _indicies_of_periods(datetime_index, freq, use_local_time=True):
# during the loop...

# Find the minimum sample period.
MIN_SAMPLE_PERIOD = int(sample_period(datetime_index))
MIN_SAMPLE_PERIOD = int(get_sample_period(datetime_index))
MAX_SAMPLES_PER_PERIOD = int(_secs_per_period_alias(freq) / MIN_SAMPLE_PERIOD)
MAX_SAMPLES_PER_2_PERIODS = MAX_SAMPLES_PER_PERIOD * 2
n_rows_processed = 0
Expand Down Expand Up @@ -681,3 +719,24 @@ def _tz_to_naive(datetime_index):

# Now convert to naive DatetimeIndex
return pd.DatetimeIndex(datetime_index.values + tz_offset_td64)


def _get_index(data):
"""
Parameters
----------
data : pandas.DataFrame or Series or DatetimeIndex
Returns
-------
index : the index for the DataFrame or Series
"""

if isinstance(data, (pd.DataFrame, pd.Series)):
index = data.index
elif isinstance(data, pd.DatetimeIndex):
index = data
else:
raise TypeError('wrote type for `data`.')
return index

0 comments on commit b349746

Please sign in to comment.