Skip to content

Commit

Permalink
Fixed bug for regression test matplotlib#1181 in scipy unit tests; ks…
Browse files Browse the repository at this point in the history
…density is now referred to as gaussian_kde and exists as a class in mlab.

Fixed list comp position bug and updated examples
  • Loading branch information
khchan authored and solvents committed May 24, 2014
1 parent 3c4c619 commit e6f1b38
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 122 deletions.
20 changes: 13 additions & 7 deletions examples/statistics/violinplot_demo.py
Expand Up @@ -8,7 +8,7 @@

# fake data
fs = 10 # fontsize
pos = range(5)
pos = [1,2,4,5,7,8]
data = [np.random.normal(size=100) for i in pos]

# TODO: future customizability dicts go here
Expand All @@ -25,22 +25,28 @@

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6,6))

axes[0, 0].violinplot(data, pos, width=0.1)
axes[0, 0].violinplot(data, pos, points=20, widths=0.1, showmeans=True,
showextrema=True, showmedians=True)
axes[0, 0].set_title('Custom violinplot 1', fontsize=fs)

axes[0, 1].violinplot(data, pos, width=0.3)
axes[0, 1].violinplot(data, pos, points=40, widths=0.3, showmeans=True,
showextrema=True, showmedians=True)
axes[0, 1].set_title('Custom violinplot 2', fontsize=fs)

axes[0, 2].violinplot(data, pos, width=0.5)
axes[0, 2].violinplot(data, pos, points=60, widths=0.5, showmeans=True,
showextrema=True, showmedians=True)
axes[0, 2].set_title('Custom violinplot 3', fontsize=fs)

axes[1, 0].violinplot(data, pos, width=0.7)
axes[1, 0].violinplot(data, pos, points=80, vert=False, widths=0.7,
showmeans=True, showextrema=True, showmedians=True)
axes[1, 0].set_title('Custom violinplot 4', fontsize=fs)

axes[1, 1].violinplot(data, pos, width=0.9)
axes[1, 1].violinplot(data, pos, points=100, vert=False, widths=0.9,
showmeans=True, showextrema=True, showmedians=True)
axes[1, 1].set_title('Custom violinplot 5', fontsize=fs)

axes[1, 2].violinplot(data, pos, width=1.1)
axes[1, 2].violinplot(data, pos, points=200, vert=False, widths=1.1,
showmeans=True, showextrema=True, showmedians=True)
axes[1, 2].set_title('Custom violinplot 6', fontsize=fs)

for ax in axes.flatten():
Expand Down
35 changes: 19 additions & 16 deletions lib/matplotlib/axes/_axes.py
Expand Up @@ -6725,7 +6725,7 @@ def matshow(self, Z, **kwargs):
integer=True))
return im

def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=False,
def violinplot(self, dataset, positions=None, points=100, vert=True, widths=0.5, showmeans=False,
showextrema=True, showmedians=False):
"""
Make a violin plot.
Expand All @@ -6748,6 +6748,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
positions : array-like, default = [1, 2, ..., n]
Sets the positions of the violins. The ticks and limits are
automatically set to match the positions.
points: array-like, default = 100
Number of points to evaluate pdf estimation for Gaussian kernel
vert : bool, default = True.
If true, creates vertical violin plot
Expand Down Expand Up @@ -6806,6 +6809,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
cbars = None
cmedians = None

datashape_message = ("List of violinplot statistics and `{0}` "
"values must have same the length")

# Validate positions
if positions == None:
positions = range(1, len(dataset) + 1)
Expand All @@ -6830,13 +6836,14 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
# Render violins
for d,p,w in zip(dataset,positions,widths):
# Calculate the kernel density
kde = mlab.ksdensity(d)
m = kde['xmin']
M = kde['xmax']
mean = kde['mean']
median = kde['median']
v = kde['result']
coords = np.arange(m,M,(M-m)/100.)
kde = mlab.gaussian_kde(d)
m = kde.dataset.min()
M = kde.dataset.max()
mean = np.mean(kde.dataset)
median = np.median(kde.dataset)
coords = np.arange(m,M,(M-m)/float(points))

v = kde.evaluate(coords)

# Since each data point p is plotted from v-p to v+p,
# we need to scale it by an additional 0.5 factor so that we get
Expand All @@ -6846,10 +6853,10 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
# create vertical violin plot
if vert:
bodies += [self.fill_betweenx(coords,
-v+p,
v+p,
facecolor='y',
alpha=0.3)]
-v+p,
v+p,
facecolor='y',
alpha=0.3)]
# create horizontal violin plot
else:
bodies += [self.fill_between(coords,
Expand Down Expand Up @@ -6895,10 +6902,6 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
if showmedians:
cmedians = self.vlines(medians, pmins, pmaxes, colors='r')





# Reset hold
self.hold(holdStatus)

Expand Down
216 changes: 117 additions & 99 deletions lib/matplotlib/mlab.py
Expand Up @@ -3656,12 +3656,12 @@ def stineman_interp(xi,x,y,yp=None):
1/(dy1+dy2),))
return yi

def ksdensity(dataset, bw_method=None):
class gaussian_kde(object):
"""
Representation of a kernel-density estimate using Gaussian kernels.
Call signature::
kde_dict = ksdensity(dataset, 'silverman')
kde = gaussian_kde(dataset, 'silverman')
Parameters
----------
Expand All @@ -3676,10 +3676,10 @@ def ksdensity(dataset, bw_method=None):
Attributes
----------
dataset : ndarray
The dataset with which `ksdensity` was initialized.
d : int
The dataset with which `gaussian_kde` was initialized.
dim : int
Number of dimensions.
n : int
num_dp : int
Number of datapoints.
factor : float
The bandwidth factor, obtained from `kde.covariance_factor`, with which
Expand All @@ -3690,117 +3690,135 @@ def ksdensity(dataset, bw_method=None):
inv_cov : ndarray
The inverse of `covariance`.
Returns
Methods
-------
A dictionary mapping each various aspects of the computed KDE.
The dictionary has the following keys:
xmin : number
The min of the input dataset
xmax : number
The max of the input dataset
mean : number
The mean of the result
median: number
The median of the result
result: (# of points,)-array
The array of the evaluated PDF estimation
Raises
------
ValueError : if the dimensionality of the input points is different than
the dimensionality of the KDE.
kde.evaluate(points) : ndarray
Evaluate the estimated pdf on a provided set of points.
kde(points) : ndarray
Same as kde.evaluate(points)
kde.set_bandwidth(bw_method='scott') : None
Computes the bandwidth, i.e. the coefficient that multiplies the data
covariance matrix to obtain the kernel covariance matrix.
.. versionadded:: 0.11.0
kde.covariance_factor : float
Computes the coefficient (`kde.factor`) that multiplies the data
covariance matrix to obtain the kernel covariance matrix.
The default is `scotts_factor`. A subclass can overwrite this method
to provide a different method, or set it through a call to
`kde.set_bandwidth`.
"""

# This implementation with minor modification was too good to pass up.
# from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py

dataset = np.array(np.atleast_2d(dataset))
xmin = dataset.min()
xmax = dataset.max()
def __init__(self, dataset, bw_method=None):
self.dataset = np.atleast_2d(dataset)
if not self.dataset.size > 1:
raise ValueError("`dataset` input should have multiple elements.")

if not dataset.size > 1:
raise ValueError("`dataset` input should have multiple elements.")
self.dim, self.num_dp = self.dataset.shape
self.set_bandwidth(bw_method=bw_method)

dim, num_dp = dataset.shape
def scotts_factor(self):
return np.power(self.num_dp, -1./(self.dim+4))

# ----------------------------------------------
# Set Bandwidth, defaulted to Scott's Factor
# ----------------------------------------------
scotts_factor = lambda: np.power(num_dp, -1./(dim+4))
silverman_factor = lambda: np.power(num_dp*(dim+2.0)/4.0, -1./(dim+4))
def silverman_factor(self):
return np.power(self.num_dp*(self.dim+2.0)/4.0, -1./(self.dim+4))

# Default method to calculate bandwidth, can be overwritten by subclass
# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor

if bw_method is None:
pass
elif bw_method == 'scott':
covariance_factor = scotts_factor
elif bw_method == 'silverman':
covariance_factor = silverman_factor
elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
covariance_factor = lambda: bw_method
else:
msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
raise ValueError(msg)

# ---------------------------------------------------------------
# Computes covariance matrix for each Gaussian kernel with factor
# ---------------------------------------------------------------
factor = covariance_factor()

# Cache covariance and inverse covariance of the data
data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
data_inv_cov = np.linalg.inv(data_covariance)

covariance = data_covariance * factor**2
inv_cov = data_inv_cov / factor**2
norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * num_dp

# ----------------------------------------------
# Evaluate the estimated pdf on a set of points.
# ----------------------------------------------
points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.))

dim_pts, num_dp_pts = np.array(points).shape
if dim_pts != dim:
if dim_pts == 1 and num_dp_pts == num_dp:
# points was passed in as a row vector
points = np.reshape(points, (dim, 1))
num_dp_pts = 1
def set_bandwidth(self, bw_method=None):
if bw_method is None:
pass
elif bw_method == 'scott':
self.covariance_factor = self.scotts_factor
elif bw_method == 'silverman':
self.covariance_factor = self.silverman_factor
elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
self._bw_method = 'use constant'
self.covariance_factor = lambda: bw_method
elif callable(bw_method):
self._bw_method = bw_method
self.covariance_factor = lambda: self._bw_method(self)
else:
msg = "points have dimension %s,\
dataset has dimension %s" % (dim_pts, dim)
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
"or a callable."
raise ValueError(msg)

result = np.zeros((num_dp_pts,), dtype=np.float)
self._compute_covariance()

if num_dp_pts >= num_dp:
# there are more points than data, so loop over data
for i in range(num_dp):
diff = dataset[:, i, np.newaxis] - points
tdiff = np.dot(inv_cov, diff)
energy = np.sum(diff*tdiff, axis=0) / 2.0
result = result + np.exp(-energy)
else:
# loop over points
for i in range(num_dp_pts):
diff = dataset - points[:, i, np.newaxis]
tdiff = np.dot(inv_cov, diff)
energy = np.sum(diff * tdiff, axis=0) / 2.0
result[i] = np.sum(np.exp(-energy), axis=0)

result = result / norm_factor

return {
'xmin': xmin,
'xmax': xmax,
'mean': np.mean(dataset),
'median': np.median(dataset),
'result': result
}
def _compute_covariance(self):
"""Computes the covariance matrix for each Gaussian kernel using
covariance_factor().
"""
self.factor = self.covariance_factor()
# Cache covariance and inverse covariance of the data
if not hasattr(self, '_data_inv_cov'):
self._data_covariance = np.atleast_2d(np.cov(self.dataset, rowvar=1,
bias=False))
self._data_inv_cov = np.linalg.inv(self._data_covariance)

self.covariance = self._data_covariance * self.factor**2
self.inv_cov = self._data_inv_cov / self.factor**2
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.num_dp

def evaluate(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
----------
points : (# of dimensions, # of points)-array
Alternatively, a (# of dimensions,) vector can be passed in and
treated as a single point.
Returns
-------
values : (# of points,)-array
The values at each point.
Raises
------
ValueError : if the dimensionality of the input points is different than
the dimensionality of the KDE.
"""
points = np.atleast_2d(points)

d, m = points.shape
if d != self.dim:
if d == 1 and m == self.dim:
# points was passed in as a row vector
points = np.reshape(points, (self.dim, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.dim)
raise ValueError(msg)

result = np.zeros((m,), dtype=np.float)

if m >= self.num_dp:
# there are more points than data, so loop over data
for i in range(self.num_dp):
diff = self.dataset[:, i, np.newaxis] - points
tdiff = np.dot(self.inv_cov, diff)
energy = np.sum(diff*tdiff,axis=0) / 2.0
result = result + np.exp(-energy)
else:
# loop over points
for i in range(m):
diff = self.dataset - points[:, i, np.newaxis]
tdiff = np.dot(self.inv_cov, diff)
energy = np.sum(diff * tdiff, axis=0) / 2.0
result[i] = np.sum(np.exp(-energy), axis=0)

result = result / self._norm_factor

return result

__call__ = evaluate

##################################################
# Code related to things in and around polygons
Expand Down

0 comments on commit e6f1b38

Please sign in to comment.