Commit
Refactored some of Nipun's awesome stats functions. #160
JackKelly committed Dec 2, 2014
1 parent 16efe53 commit ecfc46e
Showing 5 changed files with 234 additions and 216 deletions.
182 changes: 1 addition & 181 deletions nilmtk/elecmeter.py
@@ -7,12 +7,6 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.spatial as ss
from scipy import fft
from pandas.tools.plotting import lag_plot, autocorrelation_plot
from scipy.special import digamma,gamma
from math import log,pi
import numpy.random as nr
import random
from .preprocessing import Clip
from .stats import TotalEnergy, GoodSections, DropoutRate
@@ -359,181 +353,7 @@ def power_series(self, **kwargs):
series = chunk.icol(0).dropna()
series.timeframe = getattr(chunk, 'timeframe', None)
series.look_ahead = getattr(chunk, 'look_ahead', None)
yield series

def plot_lag(self, lag=1):
"""
Plots a lag plot of power data
http://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
lag_plot(power, lag, ax = ax)
return ax

def plot_spectrum(self):
"""
Plots the spectrum of the power data
http://www.itl.nist.gov/div898/handbook/eda/section3/spectrum.htm
Code borrowed from:
http://glowingpython.blogspot.com/2011/08/how-to-plot-frequency-spectrum-with.html
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
Fs = 1.0/self.device.get('sample_period')
for power in self.power_series():
n = len(power.values) # length of the signal
k = np.arange(n)
T = n/Fs
frq = k/T # two sides frequency range
frq = frq[range(n//2)] # one side frequency range

Y = fft(power)/n # fft computing and normalization
Y = Y[range(n//2)]

ax.plot(frq,abs(Y)) # plotting the spectrum

ax.set_xlabel('Freq (Hz)')
ax.set_ylabel('|Y(freq)|')
return ax


def plot_autocorrelation(self):
"""
Plots autocorrelation of power data
Reference:
http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
autocorrelation_plot(power, ax = ax)
return ax


def switch_times(self, threshold=40):
"""
Returns a list of timestamps at which a switch occurs, as defined by `threshold`
Parameters
----------
threshold: int, threshold in watts between successive readings
for the difference to count as an appliance state change
"""

datetime_switches = []
for power in self.power_series():
delta_power = power.diff()
delta_power_absolute = delta_power.abs()
datetime_switches.append(delta_power_absolute[(delta_power_absolute>threshold)].index.values.tolist())
return flatten(datetime_switches)

def entropy(self, k=3, base=2):
"""
This implementation is provided courtesy of the NPEET toolbox;
the authors kindly allowed us to use their code directly.
Please consider citing their paper if you use this function.
This fails if there is a very large number of records; we need to
ask the authors how best to handle that.
The classic K-L (Kozachenko-Leonenko) k-nearest-neighbour continuous entropy estimator.
x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
"""
def kdtree_entropy(z):
assert k <= len(z)-1, "Set k smaller than num. samples - 1"
d = len(z[0])
N = len(z)
intens = 1e-10 #small noise to break degeneracy, see doc.
z = [list(p + intens*nr.rand(len(z[0]))) for p in z]
tree = ss.cKDTree(z)
nn = [tree.query(point,k+1,p=float('inf'))[0][k] for point in z]
const = digamma(N)-digamma(k) + d*log(2)
return (const + d*np.mean(map(log,nn)))/log(base)

out = []
for power in self.power_series():
x = power.values
num_elements = len(x)
x = x.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:

splits = num_elements/MAX_SIZE_ENTROPY + 1
y = np.array_split(x, splits)
for z in y:
out.append(kdtree_entropy(z))
else:
out.append(kdtree_entropy(x))
return sum(out)/len(out)

def mutual_information(self, other, k=3, base=2):
"""
Mutual information of two ElecMeters
x,y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
Parameters
----------
other : ElecMeter or MeterGroup
"""
def kdtree_mi(x, y, k, base):
intens = 1e-10 #small noise to break degeneracy, see doc.
x = [list(p + intens*nr.rand(len(x[0]))) for p in x]
y = [list(p + intens*nr.rand(len(y[0]))) for p in y]
points = zip2(x,y)
#Find nearest neighbors in joint space, p=inf means max-norm
tree = ss.cKDTree(points)
dvec = [tree.query(point,k+1,p=float('inf'))[0][k] for point in points]
a,b,c,d = avgdigamma(x,dvec), avgdigamma(y,dvec), digamma(k), digamma(len(x))
return (-a-b+c+d)/log(base)

def zip2(*args):
#zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space
#E.g. zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]]
return [sum(sublist,[]) for sublist in zip(*args)]

def avgdigamma(points,dvec):
#This part finds number of neighbors in some radius in the marginal space
#returns expectation value of <psi(nx)>
N = len(points)
tree = ss.cKDTree(points)
avg = 0.
for i in range(N):
dist = dvec[i]
#subtlety: we don't include the boundary point,
#but we implicitly add 1 relative to the Kraskov definition because the center point is included
num_points = len(tree.query_ball_point(points[i],dist-1e-15,p=float('inf')))
avg += digamma(num_points)/N
return avg

out = []
for power_x, power_y in izip(self.power_series(), other.power_series()):
power_x_val = power_x.values
power_y_val = power_y.values
num_elements = len(power_x_val)
power_x_val = power_x_val.reshape((num_elements, 1))
power_y_val = power_y_val.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:
splits = num_elements/MAX_SIZE_ENTROPY + 1
x_split = np.array_split(power_x_val, splits)
y_split = np.array_split(power_y_val, splits)
for x, y in izip(x_split, y_split):
out.append(kdtree_mi(x, y, k, base))
else:
out.append(kdtree_mi(power_x_val, power_y_val, k, base))
return sum(out)/len(out)

def dry_run_metadata(self):
return self.metadata
180 changes: 180 additions & 0 deletions nilmtk/electric.py
@@ -2,10 +2,18 @@
import numpy as np
from collections import Counter
from warnings import warn
import scipy.spatial as ss
from scipy import fft
from pandas.tools.plotting import lag_plot, autocorrelation_plot
from scipy.special import digamma,gamma
from math import log,pi
import numpy.random as nr

from .timeframe import TimeFrame
from .measurement import select_best_ac_type
from nilmtk.utils import offset_alias_to_seconds


class Electric(object):
"""Common implementations of methods shared by ElecMeter and MeterGroup.
"""
@@ -224,6 +232,178 @@ def correlation(self, other):
corr = numerator*1.0/denominator
return corr

def plot_lag(self, lag=1):
"""
Plots a lag plot of power data
http://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
lag_plot(power, lag, ax = ax)
return ax

def plot_spectrum(self):
"""
Plots the spectrum of the power data
http://www.itl.nist.gov/div898/handbook/eda/section3/spectrum.htm
Code borrowed from:
http://glowingpython.blogspot.com/2011/08/how-to-plot-frequency-spectrum-with.html
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
Fs = 1.0/self.sample_period()
for power in self.power_series():
n = len(power.values) # length of the signal
k = np.arange(n)
T = n/Fs
frq = k/T # two sides frequency range
frq = frq[range(n//2)] # one side frequency range

Y = fft(power)/n # fft computing and normalization
Y = Y[range(n//2)]

ax.plot(frq,abs(Y)) # plotting the spectrum

ax.set_xlabel('Freq (Hz)')
ax.set_ylabel('|Y(freq)|')
return ax
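
The frequency-axis construction and normalisation above can be sanity-checked on synthetic data: a 2 Hz sine sampled at 100 Hz should produce a single dominant peak at 2 Hz. A minimal standalone sketch (using numpy's FFT, which behaves identically here; the sample rate and signal are illustrative only):

import numpy as np

Fs = 100.0                             # assumed sample rate, Hz
t = np.arange(0, 10, 1.0 / Fs)         # 10 seconds of samples
signal = np.sin(2 * np.pi * 2.0 * t)   # pure 2 Hz sine

n = len(signal)
k = np.arange(n)
T = n / Fs
frq = (k / T)[:n // 2]                 # one-sided frequency axis
Y = (np.fft.fft(signal) / n)[:n // 2]  # normalised one-sided spectrum

print(frq[np.argmax(np.abs(Y))])       # expect ~2.0 Hz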

def plot_autocorrelation(self):
"""
Plots autocorrelation of power data
Reference:
http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
autocorrelation_plot(power, ax = ax)
return ax
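
plot_lag and plot_autocorrelation are thin wrappers around pandas' plotting helpers; a standalone sketch of those underlying calls on a synthetic series (the pandas.tools.plotting import path matches the pandas versions this code targets; newer pandas moved these helpers to pandas.plotting):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import lag_plot, autocorrelation_plot

# Synthetic 'power' series: a slow cycle plus noise
index = pd.date_range('2014-12-01', periods=500, freq='6S')
power = pd.Series(200 + 50 * np.sin(np.arange(500) / 20.0)
                  + 5 * np.random.randn(500), index=index)

fig, (ax1, ax2) = plt.subplots(2)
lag_plot(power, lag=1, ax=ax1)       # points cluster near the diagonal for correlated data
autocorrelation_plot(power, ax=ax2)  # slow decay reflects the cyclic component
plt.show()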


def switch_times(self, threshold=40):
"""
Returns a list of timestamps at which a switch occurs, as defined by `threshold`
Parameters
----------
threshold: int, threshold in watts between successive readings
for the difference to count as an appliance state change
"""

datetime_switches = []
for power in self.power_series():
delta_power = power.diff()
delta_power_absolute = delta_power.abs()
datetime_switches.append(delta_power_absolute[(delta_power_absolute>threshold)].index.values.tolist())
return flatten(datetime_switches)
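
Switch detection is just a first difference followed by an absolute-value threshold; a minimal sketch of that core step on a hypothetical power series (the values and index below are illustrative only):

import pandas as pd

# Hypothetical readings in watts, one every 6 seconds
power = pd.Series([100, 102, 101, 2050, 2048, 2051, 98, 99],
                  index=pd.date_range('2014-12-01', periods=8, freq='6S'))

threshold = 40
delta = power.diff().abs()
switches = delta[delta > threshold].index.tolist()
print(switches)   # reports the ~2 kW step up and the step back down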

def entropy(self, k=3, base=2):
"""
This implementation is provided courtesy of the NPEET toolbox;
the authors kindly allowed us to use their code directly.
Please consider citing their paper if you use this function.
This fails if there is a very large number of records; we need to
ask the authors how best to handle that.
The classic K-L (Kozachenko-Leonenko) k-nearest-neighbour continuous entropy estimator.
x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
"""
def kdtree_entropy(z):
assert k <= len(z)-1, "Set k smaller than num. samples - 1"
d = len(z[0])
N = len(z)
intens = 1e-10 #small noise to break degeneracy, see doc.
z = [list(p + intens*nr.rand(len(z[0]))) for p in z]
tree = ss.cKDTree(z)
nn = [tree.query(point,k+1,p=float('inf'))[0][k] for point in z]
const = digamma(N)-digamma(k) + d*log(2)
return (const + d*np.mean(map(log,nn)))/log(base)

out = []
for power in self.power_series():
x = power.values
num_elements = len(x)
x = x.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:

splits = num_elements/MAX_SIZE_ENTROPY + 1
y = np.array_split(x, splits)
for z in y:
out.append(kdtree_entropy(z))
else:
out.append(kdtree_entropy(x))
return sum(out)/len(out)
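
The estimator wrapped by kdtree_entropy is the Kozachenko-Leonenko k-nearest-neighbour estimate, which can be sanity-checked against the analytic differential entropy of a standard Gaussian, 0.5*log2(2*pi*e) ≈ 2.05 bits. A standalone sketch of the same calculation on synthetic samples (not a call into nilmtk):

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma
from math import log

def knn_entropy(samples, k=3, base=2):
    # samples: (N, d) array of continuous observations
    samples = np.asarray(samples, dtype=float)
    N, d = samples.shape
    samples = samples + 1e-10 * np.random.rand(N, d)   # break ties between identical values
    tree = ss.cKDTree(samples)
    # distance to the k-th nearest neighbour (index 0 is the point itself), max-norm
    nn = [tree.query(pt, k + 1, p=float('inf'))[0][k] for pt in samples]
    const = digamma(N) - digamma(k) + d * log(2)
    return (const + d * np.mean([log(dist) for dist in nn])) / log(base)

x = np.random.randn(2000, 1)   # standard normal samples
print(knn_entropy(x))          # expect roughly 2.05 bits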

def mutual_information(self, other, k=3, base=2):
"""
Mutual information of two ElecMeters
x,y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
Parameters
----------
other : ElecMeter or MeterGroup
"""
def kdtree_mi(x, y, k, base):
intens = 1e-10 #small noise to break degeneracy, see doc.
x = [list(p + intens*nr.rand(len(x[0]))) for p in x]
y = [list(p + intens*nr.rand(len(y[0]))) for p in y]
points = zip2(x,y)
#Find nearest neighbors in joint space, p=inf means max-norm
tree = ss.cKDTree(points)
dvec = [tree.query(point,k+1,p=float('inf'))[0][k] for point in points]
a,b,c,d = avgdigamma(x,dvec), avgdigamma(y,dvec), digamma(k), digamma(len(x))
return (-a-b+c+d)/log(base)

def zip2(*args):
#zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space
#E.g. zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]]
return [sum(sublist,[]) for sublist in zip(*args)]

def avgdigamma(points,dvec):
#This part finds number of neighbors in some radius in the marginal space
#returns expectation value of <psi(nx)>
N = len(points)
tree = ss.cKDTree(points)
avg = 0.
for i in range(N):
dist = dvec[i]
#subtlety: we don't include the boundary point,
#but we implicitly add 1 relative to the Kraskov definition because the center point is included
num_points = len(tree.query_ball_point(points[i],dist-1e-15,p=float('inf')))
avg += digamma(num_points)/N
return avg

out = []
for power_x, power_y in izip(self.power_series(), other.power_series()):
power_x_val = power_x.values
power_y_val = power_y.values
num_elements = len(power_x_val)
power_x_val = power_x_val.reshape((num_elements, 1))
power_y_val = power_y_val.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:
splits = num_elements/MAX_SIZE_ENTROPY + 1
x_split = np.array_split(power_x_val, splits)
y_split = np.array_split(power_y_val, splits)
for x, y in izip(x_split, y_split):
out.append(kdtree_mi(x, y, k, base))
else:
out.append(kdtree_mi(power_x_val, power_y_val, k, base))
return sum(out)/len(out)
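
A hypothetical usage sketch of the refactored statistics, assuming a converted dataset at /data/redd.h5 and that the appliance labels below exist in building 1 (the path and labels are illustrative, not taken from this commit):

from nilmtk import DataSet

redd = DataSet('/data/redd.h5')          # hypothetical path to a converted HDF5 dataset
elec = redd.buildings[1].elec            # MeterGroup for building 1

fridge = elec['fridge']                  # illustrative appliance labels
washer = elec['washer dryer']

print(fridge.entropy())                  # entropy of the fridge power series, in bits (base=2)
print(fridge.mutual_information(washer)) # mutual information between the two meters, in bits

switches = fridge.switch_times(threshold=40)   # timestamps of >40 W jumps between readings
ax = fridge.plot_autocorrelation()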


# def activity_distribution(self):
# * activity distribution:
