From e6f1b3828350868a754987cc775b3bb07cfb1d09 Mon Sep 17 00:00:00 2001 From: Kevin Chan Date: Tue, 1 Apr 2014 20:42:18 -0400 Subject: [PATCH] Fixed bug for regression test #1181 in scipy unit tests; ksdensity is now referred to as gaussian_kde and exists as a class in mlab. Fixed list comp position bug and updated examples --- examples/statistics/violinplot_demo.py | 20 ++- lib/matplotlib/axes/_axes.py | 35 ++-- lib/matplotlib/mlab.py | 216 +++++++++++++------------ 3 files changed, 149 insertions(+), 122 deletions(-) diff --git a/examples/statistics/violinplot_demo.py b/examples/statistics/violinplot_demo.py index 840715ae9537..93aa50e44e5f 100644 --- a/examples/statistics/violinplot_demo.py +++ b/examples/statistics/violinplot_demo.py @@ -8,7 +8,7 @@ # fake data fs = 10 # fontsize -pos = range(5) +pos = [1,2,4,5,7,8] data = [np.random.normal(size=100) for i in pos] # TODO: future customizability dicts go here @@ -25,22 +25,28 @@ fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6,6)) -axes[0, 0].violinplot(data, pos, width=0.1) +axes[0, 0].violinplot(data, pos, points=20, widths=0.1, showmeans=True, + showextrema=True, showmedians=True) axes[0, 0].set_title('Custom violinplot 1', fontsize=fs) -axes[0, 1].violinplot(data, pos, width=0.3) +axes[0, 1].violinplot(data, pos, points=40, widths=0.3, showmeans=True, + showextrema=True, showmedians=True) axes[0, 1].set_title('Custom violinplot 2', fontsize=fs) -axes[0, 2].violinplot(data, pos, width=0.5) +axes[0, 2].violinplot(data, pos, points=60, widths=0.5, showmeans=True, + showextrema=True, showmedians=True) axes[0, 2].set_title('Custom violinplot 3', fontsize=fs) -axes[1, 0].violinplot(data, pos, width=0.7) +axes[1, 0].violinplot(data, pos, points=80, vert=False, widths=0.7, + showmeans=True, showextrema=True, showmedians=True) axes[1, 0].set_title('Custom violinplot 4', fontsize=fs) -axes[1, 1].violinplot(data, pos, width=0.9) +axes[1, 1].violinplot(data, pos, points=100, vert=False, widths=0.9, + showmeans=True, showextrema=True, showmedians=True) axes[1, 1].set_title('Custom violinplot 5', fontsize=fs) -axes[1, 2].violinplot(data, pos, width=1.1) +axes[1, 2].violinplot(data, pos, points=200, vert=False, widths=1.1, + showmeans=True, showextrema=True, showmedians=True) axes[1, 2].set_title('Custom violinplot 6', fontsize=fs) for ax in axes.flatten(): diff --git a/lib/matplotlib/axes/_axes.py b/lib/matplotlib/axes/_axes.py index fb0e21b54362..e7834b5c7b5a 100644 --- a/lib/matplotlib/axes/_axes.py +++ b/lib/matplotlib/axes/_axes.py @@ -6725,7 +6725,7 @@ def matshow(self, Z, **kwargs): integer=True)) return im - def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=False, + def violinplot(self, dataset, positions=None, points=100, vert=True, widths=0.5, showmeans=False, showextrema=True, showmedians=False): """ Make a violin plot. @@ -6748,6 +6748,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F positions : array-like, default = [1, 2, ..., n] Sets the positions of the violins. The ticks and limits are automatically set to match the positions. + + points: array-like, default = 100 + Number of points to evaluate pdf estimation for Gaussian kernel vert : bool, default = True. If true, creates vertical violin plot @@ -6806,6 +6809,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F cbars = None cmedians = None + datashape_message = ("List of violinplot statistics and `{0}` " + "values must have same the length") + # Validate positions if positions == None: positions = range(1, len(dataset) + 1) @@ -6830,13 +6836,14 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F # Render violins for d,p,w in zip(dataset,positions,widths): # Calculate the kernel density - kde = mlab.ksdensity(d) - m = kde['xmin'] - M = kde['xmax'] - mean = kde['mean'] - median = kde['median'] - v = kde['result'] - coords = np.arange(m,M,(M-m)/100.) + kde = mlab.gaussian_kde(d) + m = kde.dataset.min() + M = kde.dataset.max() + mean = np.mean(kde.dataset) + median = np.median(kde.dataset) + coords = np.arange(m,M,(M-m)/float(points)) + + v = kde.evaluate(coords) # Since each data point p is plotted from v-p to v+p, # we need to scale it by an additional 0.5 factor so that we get @@ -6846,10 +6853,10 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F # create vertical violin plot if vert: bodies += [self.fill_betweenx(coords, - -v+p, - v+p, - facecolor='y', - alpha=0.3)] + -v+p, + v+p, + facecolor='y', + alpha=0.3)] # create horizontal violin plot else: bodies += [self.fill_between(coords, @@ -6895,10 +6902,6 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F if showmedians: cmedians = self.vlines(medians, pmins, pmaxes, colors='r') - - - - # Reset hold self.hold(holdStatus) diff --git a/lib/matplotlib/mlab.py b/lib/matplotlib/mlab.py index af8cc9a79734..bf5a41a1fba9 100644 --- a/lib/matplotlib/mlab.py +++ b/lib/matplotlib/mlab.py @@ -3656,12 +3656,12 @@ def stineman_interp(xi,x,y,yp=None): 1/(dy1+dy2),)) return yi -def ksdensity(dataset, bw_method=None): +class gaussian_kde(object): """ Representation of a kernel-density estimate using Gaussian kernels. Call signature:: - kde_dict = ksdensity(dataset, 'silverman') + kde = gaussian_kde(dataset, 'silverman') Parameters ---------- @@ -3676,10 +3676,10 @@ def ksdensity(dataset, bw_method=None): Attributes ---------- dataset : ndarray - The dataset with which `ksdensity` was initialized. - d : int + The dataset with which `gaussian_kde` was initialized. + dim : int Number of dimensions. - n : int + num_dp : int Number of datapoints. factor : float The bandwidth factor, obtained from `kde.covariance_factor`, with which @@ -3690,117 +3690,135 @@ def ksdensity(dataset, bw_method=None): inv_cov : ndarray The inverse of `covariance`. - Returns + Methods ------- - A dictionary mapping each various aspects of the computed KDE. - The dictionary has the following keys: - - xmin : number - The min of the input dataset - xmax : number - The max of the input dataset - mean : number - The mean of the result - median: number - The median of the result - result: (# of points,)-array - The array of the evaluated PDF estimation - - Raises - ------ - ValueError : if the dimensionality of the input points is different than - the dimensionality of the KDE. + kde.evaluate(points) : ndarray + Evaluate the estimated pdf on a provided set of points. + kde(points) : ndarray + Same as kde.evaluate(points) + kde.set_bandwidth(bw_method='scott') : None + Computes the bandwidth, i.e. the coefficient that multiplies the data + covariance matrix to obtain the kernel covariance matrix. + .. versionadded:: 0.11.0 + kde.covariance_factor : float + Computes the coefficient (`kde.factor`) that multiplies the data + covariance matrix to obtain the kernel covariance matrix. + The default is `scotts_factor`. A subclass can overwrite this method + to provide a different method, or set it through a call to + `kde.set_bandwidth`. """ # This implementation with minor modification was too good to pass up. # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py - dataset = np.array(np.atleast_2d(dataset)) - xmin = dataset.min() - xmax = dataset.max() + def __init__(self, dataset, bw_method=None): + self.dataset = np.atleast_2d(dataset) + if not self.dataset.size > 1: + raise ValueError("`dataset` input should have multiple elements.") - if not dataset.size > 1: - raise ValueError("`dataset` input should have multiple elements.") + self.dim, self.num_dp = self.dataset.shape + self.set_bandwidth(bw_method=bw_method) - dim, num_dp = dataset.shape + def scotts_factor(self): + return np.power(self.num_dp, -1./(self.dim+4)) - # ---------------------------------------------- - # Set Bandwidth, defaulted to Scott's Factor - # ---------------------------------------------- - scotts_factor = lambda: np.power(num_dp, -1./(dim+4)) - silverman_factor = lambda: np.power(num_dp*(dim+2.0)/4.0, -1./(dim+4)) + def silverman_factor(self): + return np.power(self.num_dp*(self.dim+2.0)/4.0, -1./(self.dim+4)) - # Default method to calculate bandwidth, can be overwritten by subclass + # Default method to calculate bandwidth, can be overwritten by subclass covariance_factor = scotts_factor - if bw_method is None: - pass - elif bw_method == 'scott': - covariance_factor = scotts_factor - elif bw_method == 'silverman': - covariance_factor = silverman_factor - elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types): - covariance_factor = lambda: bw_method - else: - msg = "`bw_method` should be 'scott', 'silverman', or a scalar" - raise ValueError(msg) - - # --------------------------------------------------------------- - # Computes covariance matrix for each Gaussian kernel with factor - # --------------------------------------------------------------- - factor = covariance_factor() - - # Cache covariance and inverse covariance of the data - data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False)) - data_inv_cov = np.linalg.inv(data_covariance) - - covariance = data_covariance * factor**2 - inv_cov = data_inv_cov / factor**2 - norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * num_dp - - # ---------------------------------------------- - # Evaluate the estimated pdf on a set of points. - # ---------------------------------------------- - points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.)) - - dim_pts, num_dp_pts = np.array(points).shape - if dim_pts != dim: - if dim_pts == 1 and num_dp_pts == num_dp: - # points was passed in as a row vector - points = np.reshape(points, (dim, 1)) - num_dp_pts = 1 + def set_bandwidth(self, bw_method=None): + if bw_method is None: + pass + elif bw_method == 'scott': + self.covariance_factor = self.scotts_factor + elif bw_method == 'silverman': + self.covariance_factor = self.silverman_factor + elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types): + self._bw_method = 'use constant' + self.covariance_factor = lambda: bw_method + elif callable(bw_method): + self._bw_method = bw_method + self.covariance_factor = lambda: self._bw_method(self) else: - msg = "points have dimension %s,\ - dataset has dimension %s" % (dim_pts, dim) + msg = "`bw_method` should be 'scott', 'silverman', a scalar " \ + "or a callable." raise ValueError(msg) - result = np.zeros((num_dp_pts,), dtype=np.float) + self._compute_covariance() - if num_dp_pts >= num_dp: - # there are more points than data, so loop over data - for i in range(num_dp): - diff = dataset[:, i, np.newaxis] - points - tdiff = np.dot(inv_cov, diff) - energy = np.sum(diff*tdiff, axis=0) / 2.0 - result = result + np.exp(-energy) - else: - # loop over points - for i in range(num_dp_pts): - diff = dataset - points[:, i, np.newaxis] - tdiff = np.dot(inv_cov, diff) - energy = np.sum(diff * tdiff, axis=0) / 2.0 - result[i] = np.sum(np.exp(-energy), axis=0) - - result = result / norm_factor - - return { - 'xmin': xmin, - 'xmax': xmax, - 'mean': np.mean(dataset), - 'median': np.median(dataset), - 'result': result - } + def _compute_covariance(self): + """Computes the covariance matrix for each Gaussian kernel using + covariance_factor(). + """ + self.factor = self.covariance_factor() + # Cache covariance and inverse covariance of the data + if not hasattr(self, '_data_inv_cov'): + self._data_covariance = np.atleast_2d(np.cov(self.dataset, rowvar=1, + bias=False)) + self._data_inv_cov = np.linalg.inv(self._data_covariance) + + self.covariance = self._data_covariance * self.factor**2 + self.inv_cov = self._data_inv_cov / self.factor**2 + self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.num_dp + + def evaluate(self, points): + """Evaluate the estimated pdf on a set of points. + + Parameters + ---------- + points : (# of dimensions, # of points)-array + Alternatively, a (# of dimensions,) vector can be passed in and + treated as a single point. + + Returns + ------- + values : (# of points,)-array + The values at each point. + + Raises + ------ + ValueError : if the dimensionality of the input points is different than + the dimensionality of the KDE. + + """ + points = np.atleast_2d(points) + + d, m = points.shape + if d != self.dim: + if d == 1 and m == self.dim: + # points was passed in as a row vector + points = np.reshape(points, (self.dim, 1)) + m = 1 + else: + msg = "points have dimension %s, dataset has dimension %s" % (d, + self.dim) + raise ValueError(msg) + + result = np.zeros((m,), dtype=np.float) + + if m >= self.num_dp: + # there are more points than data, so loop over data + for i in range(self.num_dp): + diff = self.dataset[:, i, np.newaxis] - points + tdiff = np.dot(self.inv_cov, diff) + energy = np.sum(diff*tdiff,axis=0) / 2.0 + result = result + np.exp(-energy) + else: + # loop over points + for i in range(m): + diff = self.dataset - points[:, i, np.newaxis] + tdiff = np.dot(self.inv_cov, diff) + energy = np.sum(diff * tdiff, axis=0) / 2.0 + result[i] = np.sum(np.exp(-energy), axis=0) + + result = result / self._norm_factor + + return result + + __call__ = evaluate ################################################## # Code related to things in and around polygons