Commit
Refactored some of Nipun's awesome stats functions. #160
JackKelly committed Dec 2, 2014
1 parent 16efe53 commit ecfc46e
Showing 5 changed files with 234 additions and 216 deletions.
182 changes: 1 addition & 181 deletions nilmtk/elecmeter.py
@@ -7,12 +7,6 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.spatial as ss
from scipy import fft
from pandas.tools.plotting import lag_plot, autocorrelation_plot
from scipy.special import digamma,gamma
from math import log,pi
import numpy.random as nr
import random
from .preprocessing import Clip
from .stats import TotalEnergy, GoodSections, DropoutRate
@@ -359,181 +353,7 @@ def power_series(self, **kwargs):
series = chunk.icol(0).dropna()
series.timeframe = getattr(chunk, 'timeframe', None)
series.look_ahead = getattr(chunk, 'look_ahead', None)
yield series

def plot_lag(self, lag=1):
"""
Plots a lag plot of power data
http://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
lag_plot(power, lag, ax = ax)
return ax

def plot_spectrum(self):
"""
Plots the spectrum of the power data
http://www.itl.nist.gov/div898/handbook/eda/section3/spectrum.htm
Code borrowed from:
http://glowingpython.blogspot.com/2011/08/how-to-plot-frequency-spectrum-with.html
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
Fs = 1.0/self.device.get('sample_period')
for power in self.power_series():
n = len(power.values) # length of the signal
k = np.arange(n)
T = n/Fs
frq = k/T # two sides frequency range
frq = frq[range(n//2)] # one side frequency range

Y = fft(power)/n # fft computing and normalization
Y = Y[range(n//2)]

ax.plot(frq,abs(Y)) # plotting the spectrum

ax.set_xlabel('Freq (Hz)')
ax.set_ylabel('|Y(freq)|')
return ax


def plot_autocorrelation(self):
"""
Plots autocorrelation of power data
Reference:
http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
autocorrelation_plot(power, ax = ax)
return ax


def switch_times(self, threshold=40):
"""
Returns a list of timestamps at which a switch occurs, as defined by `threshold`
Parameters
----------
threshold: int, threshold in watts between successive readings
for the difference to count as an appliance state change
"""

datetime_switches = []
for power in self.power_series():
delta_power = power.diff()
delta_power_absolute = delta_power.abs()
datetime_switches.append(delta_power_absolute[(delta_power_absolute>threshold)].index.values.tolist())
return flatten(datetime_switches)

def entropy(self, k=3, base=2):
"""
This implementation is provided courtesy of the NPEET toolbox;
the authors kindly allowed us to use their code directly.
Please consider citing their paper if you use this function.
This fails if there is a very large number of records; we need to
ask the authors how best to handle that.
The classic K-L (Kozachenko-Leonenko) k-nearest-neighbour continuous entropy estimator.
x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
"""
def kdtree_entropy(z):
assert k <= len(z)-1, "Set k smaller than num. samples - 1"
d = len(z[0])
N = len(z)
intens = 1e-10 #small noise to break degeneracy, see doc.
z = [list(p + intens*nr.rand(len(z[0]))) for p in z]
tree = ss.cKDTree(z)
nn = [tree.query(point,k+1,p=float('inf'))[0][k] for point in z]
const = digamma(N)-digamma(k) + d*log(2)
return (const + d*np.mean(map(log,nn)))/log(base)

out = []
for power in self.power_series():
x = power.values
num_elements = len(x)
x = x.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:

splits = num_elements/MAX_SIZE_ENTROPY + 1
y = np.array_split(x, splits)
for z in y:
out.append(kdtree_entropy(z))
else:
out.append(kdtree_entropy(x))
return sum(out)/len(out)

def mutual_information(self, other, k=3, base=2):
"""
Mutual information of two ElecMeters
x,y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
Parameters
----------
other : ElecMeter or MeterGroup
"""
def kdtree_mi(x, y, k, base):
intens = 1e-10 #small noise to break degeneracy, see doc.
x = [list(p + intens*nr.rand(len(x[0]))) for p in x]
y = [list(p + intens*nr.rand(len(y[0]))) for p in y]
points = zip2(x,y)
#Find nearest neighbors in joint space, p=inf means max-norm
tree = ss.cKDTree(points)
dvec = [tree.query(point,k+1,p=float('inf'))[0][k] for point in points]
a,b,c,d = avgdigamma(x,dvec), avgdigamma(y,dvec), digamma(k), digamma(len(x))
return (-a-b+c+d)/log(base)

def zip2(*args):
#zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space
#E.g. zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]]
return [sum(sublist,[]) for sublist in zip(*args)]

def avgdigamma(points,dvec):
#This part finds number of neighbors in some radius in the marginal space
#returns expectation value of <psi(nx)>
N = len(points)
tree = ss.cKDTree(points)
avg = 0.
for i in range(N):
dist = dvec[i]
#subtlety: we don't include the boundary point,
#but we implicitly add 1 relative to the Kraskov definition because the center point is included
num_points = len(tree.query_ball_point(points[i],dist-1e-15,p=float('inf')))
avg += digamma(num_points)/N
return avg

out = []
for power_x, power_y in izip(self.power_series(), other.power_series()):
power_x_val = power_x.values
power_y_val = power_y.values
num_elements = len(power_x_val)
power_x_val = power_x_val.reshape((num_elements, 1))
power_y_val = power_y_val.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:
splits = num_elements/MAX_SIZE_ENTROPY + 1
x_split = np.array_split(power_x_val, splits)
y_split = np.array_split(power_y_val, splits)
for x, y in izip(x_split, y_split):
out.append(kdtree_mi(x, y, k, base))
else:
out.append(kdtree_mi(power_x_val, power_y_val, k, base))
return sum(out)/len(out)

def dry_run_metadata(self):
return self.metadata
180 changes: 180 additions & 0 deletions nilmtk/electric.py
@@ -2,10 +2,18 @@
import numpy as np
from collections import Counter
from warnings import warn
import scipy.spatial as ss
from scipy import fft
from pandas.tools.plotting import lag_plot, autocorrelation_plot
from scipy.special import digamma,gamma
from math import log,pi
import numpy.random as nr

from .timeframe import TimeFrame
from .measurement import select_best_ac_type
from nilmtk.utils import offset_alias_to_seconds


class Electric(object):
"""Common implementations of methods shared by ElecMeter and MeterGroup.
"""
@@ -224,6 +232,178 @@ def correlation(self, other):
corr = numerator*1.0/denominator
return corr

def plot_lag(self, lag=1):
"""
Plots a lag plot of power data
http://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
lag_plot(power, lag, ax = ax)
return ax

def plot_spectrum(self):
"""
Plots the spectrum of the power data
http://www.itl.nist.gov/div898/handbook/eda/section3/spectrum.htm
Code borrowed from:
http://glowingpython.blogspot.com/2011/08/how-to-plot-frequency-spectrum-with.html
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
Fs = 1.0/self.sample_period()
for power in self.power_series():
n = len(power.values) # length of the signal
k = np.arange(n)
T = n/Fs
frq = k/T # two sides frequency range
frq = frq[range(n//2)] # one side frequency range

Y = fft(power)/n # fft computing and normalization
Y = Y[range(n//2)]

ax.plot(frq,abs(Y)) # plotting the spectrum

ax.set_xlabel('Freq (Hz)')
ax.set_ylabel('|Y(freq)|')
return ax
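
The frequency-axis construction and normalisation above can be sanity-checked on synthetic data: a 2 Hz sine sampled at 100 Hz should produce a single dominant peak at 2 Hz. A minimal standalone sketch (using numpy's FFT, which behaves identically here; the sample rate and signal are illustrative only):

import numpy as np

Fs = 100.0                             # assumed sample rate, Hz
t = np.arange(0, 10, 1.0 / Fs)         # 10 seconds of samples
signal = np.sin(2 * np.pi * 2.0 * t)   # pure 2 Hz sine

n = len(signal)
k = np.arange(n)
T = n / Fs
frq = (k / T)[:n // 2]                 # one-sided frequency axis
Y = (np.fft.fft(signal) / n)[:n // 2]  # normalised one-sided spectrum

print(frq[np.argmax(np.abs(Y))])       # expect ~2.0 Hz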

def plot_autocorrelation(self):
"""
Plots autocorrelation of power data
Reference:
http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm
Returns
-------
matplotlib.axis
"""
fig, ax = plt.subplots()
for power in self.power_series():
autocorrelation_plot(power, ax = ax)
return ax
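
plot_lag and plot_autocorrelation are thin wrappers around pandas' plotting helpers; a standalone sketch of those underlying calls on a synthetic series (the pandas.tools.plotting import path matches the pandas versions this code targets; newer pandas moved these helpers to pandas.plotting):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import lag_plot, autocorrelation_plot

# Synthetic 'power' series: a slow cycle plus noise
index = pd.date_range('2014-12-01', periods=500, freq='6S')
power = pd.Series(200 + 50 * np.sin(np.arange(500) / 20.0)
                  + 5 * np.random.randn(500), index=index)

fig, (ax1, ax2) = plt.subplots(2)
lag_plot(power, lag=1, ax=ax1)       # points cluster near the diagonal for correlated data
autocorrelation_plot(power, ax=ax2)  # slow decay reflects the cyclic component
plt.show()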


def switch_times(self, threshold=40):
"""
Returns a list of timestamps at which a switch occurs, as defined by `threshold`
Parameters
----------
threshold: int, threshold in watts between successive readings
for the difference to count as an appliance state change
"""

datetime_switches = []
for power in self.power_series():
delta_power = power.diff()
delta_power_absolute = delta_power.abs()
datetime_switches.append(delta_power_absolute[(delta_power_absolute>threshold)].index.values.tolist())
return flatten(datetime_switches)
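
Switch detection is just a first difference followed by an absolute-value threshold; a minimal sketch of that core step on a hypothetical power series (the values and index below are illustrative only):

import pandas as pd

# Hypothetical readings in watts, one every 6 seconds
power = pd.Series([100, 102, 101, 2050, 2048, 2051, 98, 99],
                  index=pd.date_range('2014-12-01', periods=8, freq='6S'))

threshold = 40
delta = power.diff().abs()
switches = delta[delta > threshold].index.tolist()
print(switches)   # reports the ~2 kW step up and the step back down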

def entropy(self, k=3, base=2):
"""
This implementation is provided courtesy of the NPEET toolbox;
the authors kindly allowed us to use their code directly.
Please consider citing their paper if you use this function.
This fails if there is a very large number of records; we need to
ask the authors how best to handle that.
The classic K-L (Kozachenko-Leonenko) k-nearest-neighbour continuous entropy estimator.
x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
"""
def kdtree_entropy(z):
assert k <= len(z)-1, "Set k smaller than num. samples - 1"
d = len(z[0])
N = len(z)
intens = 1e-10 #small noise to break degeneracy, see doc.
z = [list(p + intens*nr.rand(len(z[0]))) for p in z]
tree = ss.cKDTree(z)
nn = [tree.query(point,k+1,p=float('inf'))[0][k] for point in z]
const = digamma(N)-digamma(k) + d*log(2)
return (const + d*np.mean(map(log,nn)))/log(base)

out = []
for power in self.power_series():
x = power.values
num_elements = len(x)
x = x.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:

splits = num_elements/MAX_SIZE_ENTROPY + 1
y = np.array_split(x, splits)
for z in y:
out.append(kdtree_entropy(z))
else:
out.append(kdtree_entropy(x))
return sum(out)/len(out)
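
The estimator wrapped by kdtree_entropy is the Kozachenko-Leonenko k-nearest-neighbour estimate, which can be sanity-checked against the analytic differential entropy of a standard Gaussian, 0.5*log2(2*pi*e) ≈ 2.05 bits. A standalone sketch of the same calculation on synthetic samples (not a call into nilmtk):

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma
from math import log

def knn_entropy(samples, k=3, base=2):
    # samples: (N, d) array of continuous observations
    samples = np.asarray(samples, dtype=float)
    N, d = samples.shape
    samples = samples + 1e-10 * np.random.rand(N, d)   # break ties between identical values
    tree = ss.cKDTree(samples)
    # distance to the k-th nearest neighbour (index 0 is the point itself), max-norm
    nn = [tree.query(pt, k + 1, p=float('inf'))[0][k] for pt in samples]
    const = digamma(N) - digamma(k) + d * log(2)
    return (const + d * np.mean([log(dist) for dist in nn])) / log(base)

x = np.random.randn(2000, 1)   # standard normal samples
print(knn_entropy(x))          # expect roughly 2.05 bits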

def mutual_information(self, other, k=3, base=2):
"""
Mutual information of two ElecMeters
x,y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
if x is a one-dimensional scalar and we have four samples
Parameters
----------
other : ElecMeter or MeterGroup
"""
def kdtree_mi(x, y, k, base):
intens = 1e-10 #small noise to break degeneracy, see doc.
x = [list(p + intens*nr.rand(len(x[0]))) for p in x]
y = [list(p + intens*nr.rand(len(y[0]))) for p in y]
points = zip2(x,y)
#Find nearest neighbors in joint space, p=inf means max-norm
tree = ss.cKDTree(points)
dvec = [tree.query(point,k+1,p=float('inf'))[0][k] for point in points]
a,b,c,d = avgdigamma(x,dvec), avgdigamma(y,dvec), digamma(k), digamma(len(x))
return (-a-b+c+d)/log(base)

def zip2(*args):
#zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space
#E.g. zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]]
return [sum(sublist,[]) for sublist in zip(*args)]

def avgdigamma(points,dvec):
#This part finds number of neighbors in some radius in the marginal space
#returns expectation value of <psi(nx)>
N = len(points)
tree = ss.cKDTree(points)
avg = 0.
for i in range(N):
dist = dvec[i]
#subtlety: we don't include the boundary point,
#but we implicitly add 1 relative to the Kraskov definition because the center point is included
num_points = len(tree.query_ball_point(points[i],dist-1e-15,p=float('inf')))
avg += digamma(num_points)/N
return avg

out = []
for power_x, power_y in izip(self.power_series(), other.power_series()):
power_x_val = power_x.values
power_y_val = power_y.values
num_elements = len(power_x_val)
power_x_val = power_x_val.reshape((num_elements, 1))
power_y_val = power_y_val.reshape((num_elements, 1))
if num_elements>MAX_SIZE_ENTROPY:
splits = num_elements/MAX_SIZE_ENTROPY + 1
x_split = np.array_split(power_x_val, splits)
y_split = np.array_split(power_y_val, splits)
for x, y in izip(x_split, y_split):
out.append(kdtree_mi(x, y, k, base))
else:
out.append(kdtree_mi(power_x_val, power_y_val, k, base))
return sum(out)/len(out)
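
A hypothetical usage sketch of the refactored statistics, assuming a converted dataset at /data/redd.h5 and that the appliance labels below exist in building 1 (the path and labels are illustrative, not taken from this commit):

from nilmtk import DataSet

redd = DataSet('/data/redd.h5')          # hypothetical path to a converted HDF5 dataset
elec = redd.buildings[1].elec            # MeterGroup for building 1

fridge = elec['fridge']                  # illustrative appliance labels
washer = elec['washer dryer']

print(fridge.entropy())                  # entropy of the fridge power series, in bits (base=2)
print(fridge.mutual_information(washer)) # mutual information between the two meters, in bits

switches = fridge.switch_times(threshold=40)   # timestamps of >40 W jumps between readings
ax = fridge.plot_autocorrelation()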


# def activity_distribution(self):
# * activity distribution:
