Fixed bug for regression test matplotlib#1181 in scipy unit tests; ks…

…density is now referred to as gaussian_kde and exists as a class in mlab. Fixed list comp position bug and updated examples
neggert · May 24, 2014 · e6f1b38 · e6f1b38
1 parent 3c4c619
commit e6f1b38
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 122 deletions.
diff --git a/examples/statistics/violinplot_demo.py b/examples/statistics/violinplot_demo.py
@@ -8,7 +8,7 @@
 
 # fake data
 fs = 10 # fontsize
-pos = range(5)
+pos = [1,2,4,5,7,8]
 data = [np.random.normal(size=100) for i in pos]
 
 # TODO: future customizability dicts go here
@@ -25,22 +25,28 @@
 
 fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6,6))
 
-axes[0, 0].violinplot(data, pos, width=0.1)
+axes[0, 0].violinplot(data, pos, points=20, widths=0.1, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 0].set_title('Custom violinplot 1', fontsize=fs)
 
-axes[0, 1].violinplot(data, pos, width=0.3)
+axes[0, 1].violinplot(data, pos, points=40, widths=0.3, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 1].set_title('Custom violinplot 2', fontsize=fs)
 
-axes[0, 2].violinplot(data, pos, width=0.5)
+axes[0, 2].violinplot(data, pos, points=60, widths=0.5, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 2].set_title('Custom violinplot 3', fontsize=fs)
 
-axes[1, 0].violinplot(data, pos, width=0.7)
+axes[1, 0].violinplot(data, pos, points=80, vert=False, widths=0.7,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 0].set_title('Custom violinplot 4', fontsize=fs)
 
-axes[1, 1].violinplot(data, pos, width=0.9)
+axes[1, 1].violinplot(data, pos, points=100, vert=False, widths=0.9,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 1].set_title('Custom violinplot 5', fontsize=fs)
 
-axes[1, 2].violinplot(data, pos, width=1.1)
+axes[1, 2].violinplot(data, pos, points=200, vert=False, widths=1.1,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 2].set_title('Custom violinplot 6', fontsize=fs)
 
 for ax in axes.flatten():

diff --git a/lib/matplotlib/axes/_axes.py b/lib/matplotlib/axes/_axes.py
@@ -6725,7 +6725,7 @@ def matshow(self, Z, **kwargs):
                                                  integer=True))
         return im
 
-    def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=False,
+    def violinplot(self, dataset, positions=None, points=100, vert=True, widths=0.5, showmeans=False,
                    showextrema=True, showmedians=False):
         """
         Make a violin plot.
@@ -6748,6 +6748,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
           positions : array-like, default = [1, 2, ..., n]
             Sets the positions of the violins. The ticks and limits are
             automatically set to match the positions.
+          
+          points: array-like, default = 100
+            Number of points to evaluate pdf estimation for Gaussian kernel
 
           vert : bool, default = True.
             If true, creates vertical violin plot
@@ -6806,6 +6809,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
         cbars = None
         cmedians = None
 
+        datashape_message = ("List of violinplot statistics and `{0}` "
+                             "values must have same the length")
+
         # Validate positions
         if positions == None:
             positions = range(1, len(dataset) + 1)
@@ -6830,13 +6836,14 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
         # Render violins
         for d,p,w in zip(dataset,positions,widths):            
             # Calculate the kernel density
-            kde = mlab.ksdensity(d)
-            m = kde['xmin']
-            M = kde['xmax']
-            mean = kde['mean']
-            median = kde['median']
-            v = kde['result']
-            coords = np.arange(m,M,(M-m)/100.)
+            kde = mlab.gaussian_kde(d)
+            m = kde.dataset.min()
+            M = kde.dataset.max()
+            mean = np.mean(kde.dataset)
+            median = np.median(kde.dataset)
+            coords = np.arange(m,M,(M-m)/float(points))
+
+            v = kde.evaluate(coords)
 
             # Since each data point p is plotted from v-p to v+p,
             # we need to scale it by an additional 0.5 factor so that we get
@@ -6846,10 +6853,10 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
             # create vertical violin plot
             if vert:
                 bodies += [self.fill_betweenx(coords,
-                                          -v+p,
-                                          v+p,
-                                          facecolor='y',
-                                          alpha=0.3)]
+                                              -v+p,
+                                              v+p,
+                                              facecolor='y',
+                                              alpha=0.3)]
             # create horizontal violin plot
             else:
                 bodies += [self.fill_between(coords,
@@ -6895,10 +6902,6 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
             if showmedians:
                 cmedians = self.vlines(medians, pmins, pmaxes, colors='r')
 
-
-
-
-
         # Reset hold
         self.hold(holdStatus)
 

diff --git a/lib/matplotlib/mlab.py b/lib/matplotlib/mlab.py
@@ -3656,12 +3656,12 @@ def stineman_interp(xi,x,y,yp=None):
                                   1/(dy1+dy2),))
     return yi
 
-def ksdensity(dataset, bw_method=None):
+class gaussian_kde(object):    
     """
     Representation of a kernel-density estimate using Gaussian kernels.
 
     Call signature::
-    kde_dict = ksdensity(dataset, 'silverman')
+    kde = gaussian_kde(dataset, 'silverman')
 
     Parameters
     ----------
@@ -3676,10 +3676,10 @@ def ksdensity(dataset, bw_method=None):
     Attributes
     ----------
     dataset : ndarray
-        The dataset with which `ksdensity` was initialized.
-    d : int
+        The dataset with which `gaussian_kde` was initialized.
+    dim : int
         Number of dimensions.
-    n : int
+    num_dp : int
         Number of datapoints.
     factor : float
         The bandwidth factor, obtained from `kde.covariance_factor`, with which
@@ -3690,117 +3690,135 @@ def ksdensity(dataset, bw_method=None):
     inv_cov : ndarray
         The inverse of `covariance`.
 
-    Returns
+    Methods
     -------
-    A dictionary mapping each various aspects of the computed KDE.
-    The dictionary has the following keys:
-
-        xmin : number
-            The min of the input dataset
-        xmax : number
-            The max of the input dataset
-        mean : number
-            The mean of the result
-        median: number
-            The median of the result
-        result: (# of points,)-array
-            The array of the evaluated PDF estimation
-
-    Raises
-    ------
-    ValueError : if the dimensionality of the input points is different than
-                 the dimensionality of the KDE.
+    kde.evaluate(points) : ndarray
+        Evaluate the estimated pdf on a provided set of points.
+    kde(points) : ndarray
+        Same as kde.evaluate(points)
+    kde.set_bandwidth(bw_method='scott') : None
+        Computes the bandwidth, i.e. the coefficient that multiplies the data
+        covariance matrix to obtain the kernel covariance matrix.
+        .. versionadded:: 0.11.0
+    kde.covariance_factor : float
+        Computes the coefficient (`kde.factor`) that multiplies the data
+        covariance matrix to obtain the kernel covariance matrix.
+        The default is `scotts_factor`.  A subclass can overwrite this method
+        to provide a different method, or set it through a call to
+        `kde.set_bandwidth`.
 
     """
 
     # This implementation with minor modification was too good to pass up.
     # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
 
-    dataset = np.array(np.atleast_2d(dataset))
-    xmin = dataset.min()
-    xmax = dataset.max()
+    def __init__(self, dataset, bw_method=None):
+        self.dataset = np.atleast_2d(dataset)
+        if not self.dataset.size > 1:
+            raise ValueError("`dataset` input should have multiple elements.")
 
-    if not dataset.size > 1:
-        raise ValueError("`dataset` input should have multiple elements.")
+        self.dim, self.num_dp = self.dataset.shape
+        self.set_bandwidth(bw_method=bw_method)
 
-    dim, num_dp = dataset.shape
+    def scotts_factor(self):
+        return np.power(self.num_dp, -1./(self.dim+4))
 
-    # ----------------------------------------------
-    # Set Bandwidth, defaulted to Scott's Factor
-    # ----------------------------------------------
-    scotts_factor = lambda: np.power(num_dp, -1./(dim+4))
-    silverman_factor = lambda: np.power(num_dp*(dim+2.0)/4.0, -1./(dim+4))
+    def silverman_factor(self):
+        return np.power(self.num_dp*(self.dim+2.0)/4.0, -1./(self.dim+4))
 
-    # Default method to calculate bandwidth, can be overwritten by subclass
+    #  Default method to calculate bandwidth, can be overwritten by subclass
     covariance_factor = scotts_factor
 
-    if bw_method is None:
-        pass
-    elif bw_method == 'scott':
-        covariance_factor = scotts_factor
-    elif bw_method == 'silverman':
-        covariance_factor = silverman_factor
-    elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
-        covariance_factor = lambda: bw_method
-    else:
-        msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
-        raise ValueError(msg)
-
-    # ---------------------------------------------------------------
-    # Computes covariance matrix for each Gaussian kernel with factor
-    # ---------------------------------------------------------------
-    factor = covariance_factor()
-
-    # Cache covariance and inverse covariance of the data
-    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
-    data_inv_cov = np.linalg.inv(data_covariance)
-
-    covariance = data_covariance * factor**2
-    inv_cov = data_inv_cov / factor**2
-    norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * num_dp
-
-    # ----------------------------------------------
-    # Evaluate the estimated pdf on a set of points.
-    # ----------------------------------------------
-    points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.))
-
-    dim_pts, num_dp_pts = np.array(points).shape
-    if dim_pts != dim:
-        if dim_pts == 1 and num_dp_pts == num_dp:
-            # points was passed in as a row vector
-            points = np.reshape(points, (dim, 1))
-            num_dp_pts = 1
+    def set_bandwidth(self, bw_method=None):
+        if bw_method is None:
+            pass
+        elif bw_method == 'scott':
+            self.covariance_factor = self.scotts_factor
+        elif bw_method == 'silverman':
+            self.covariance_factor = self.silverman_factor
+        elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
+            self._bw_method = 'use constant'
+            self.covariance_factor = lambda: bw_method
+        elif callable(bw_method):
+            self._bw_method = bw_method
+            self.covariance_factor = lambda: self._bw_method(self)
         else:
-            msg = "points have dimension %s,\
-                   dataset has dimension %s" % (dim_pts, dim)
+            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
+                  "or a callable."
             raise ValueError(msg)
 
-    result = np.zeros((num_dp_pts,), dtype=np.float)
+        self._compute_covariance()
 
-    if num_dp_pts >= num_dp:
-        # there are more points than data, so loop over data
-        for i in range(num_dp):
-            diff = dataset[:, i, np.newaxis] - points
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff*tdiff, axis=0) / 2.0
-            result = result + np.exp(-energy)
-    else:
-        # loop over points
-        for i in range(num_dp_pts):
-            diff = dataset - points[:, i, np.newaxis]
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff * tdiff, axis=0) / 2.0
-            result[i] = np.sum(np.exp(-energy), axis=0)
-
-    result = result / norm_factor
-
-    return {
-        'xmin': xmin,
-        'xmax': xmax,
-        'mean': np.mean(dataset),
-        'median': np.median(dataset),
-        'result': result
-    }
+    def _compute_covariance(self):
+        """Computes the covariance matrix for each Gaussian kernel using
+        covariance_factor().
+        """
+        self.factor = self.covariance_factor()
+        # Cache covariance and inverse covariance of the data
+        if not hasattr(self, '_data_inv_cov'):
+            self._data_covariance = np.atleast_2d(np.cov(self.dataset, rowvar=1,
+                                               bias=False))
+            self._data_inv_cov = np.linalg.inv(self._data_covariance)
+
+        self.covariance = self._data_covariance * self.factor**2
+        self.inv_cov = self._data_inv_cov / self.factor**2
+        self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.num_dp        
+
+    def evaluate(self, points):
+        """Evaluate the estimated pdf on a set of points.
+
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
+
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
+
+        Raises
+        ------
+        ValueError : if the dimensionality of the input points is different than
+                     the dimensionality of the KDE.
+
+        """
+        points = np.atleast_2d(points)
+
+        d, m = points.shape
+        if d != self.dim:
+            if d == 1 and m == self.dim:
+                # points was passed in as a row vector
+                points = np.reshape(points, (self.dim, 1))
+                m = 1
+            else:
+                msg = "points have dimension %s, dataset has dimension %s" % (d,
+                    self.dim)
+                raise ValueError(msg)
+
+        result = np.zeros((m,), dtype=np.float)
+
+        if m >= self.num_dp:
+            # there are more points than data, so loop over data
+            for i in range(self.num_dp):
+                diff = self.dataset[:, i, np.newaxis] - points
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff*tdiff,axis=0) / 2.0
+                result = result + np.exp(-energy)
+        else:
+            # loop over points
+            for i in range(m):
+                diff = self.dataset - points[:, i, np.newaxis]
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff * tdiff, axis=0) / 2.0
+                result[i] = np.sum(np.exp(-energy), axis=0)
+
+        result = result / self._norm_factor
+
+        return result
+
+    __call__ = evaluate
 
 ##################################################
 # Code related to things in and around polygons