-
Notifications
You must be signed in to change notification settings - Fork 10
/
viz.py
525 lines (418 loc) · 17 KB
/
viz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
import copy
import numpy
from matplotlib import pyplot
from .probscale import _minimal_norm
from . import validate
from . import algo
def probplot(data, ax=None, plottype='prob', dist=None, probax='x',
problabel=None, datascale='linear', datalabel=None,
bestfit=False, return_best_fit_results=False,
estimate_ci=False, ci_kws=None, pp_kws=None,
scatter_kws=None, line_kws=None, **fgkwargs):
"""
Probability, percentile, and quantile plots.
Parameters
----------
data : array-like
1-dimensional data to be plotted
ax : matplotlib axes, optional
The Axes on which to plot. If one is not provided, a new Axes
will be created.
plottype : string (default = 'prob')
Type of plot to be created. Options are:
- 'prob': probabilty plot
- 'pp': percentile plot
- 'qq': quantile plot
dist : scipy distribution, optional
A distribtion to compute the scale's tick positions. If not
specified, a standard normal distribution will be used.
probax : string, optional (default = 'x')
The axis ('x' or 'y') that will serve as the probability (or
quantile) axis.
problabel, datalabel : string, optional
Axis labels for the probability/quantile and data axes
respectively.
datascale : string, optional (default = 'log')
Scale for the other axis that is not
bestfit : bool, optional (default is False)
Specifies whether a best-fit line should be added to the plot.
return_best_fit_results : bool (default is False)
If True a dictionary of results of is returned along with the
figure.
estimate_ci : bool, optional (False)
Estimate and draw a confidence band around the best-fit line
using a percentile bootstrap.
ci_kws : dict, optional
Dictionary of keyword arguments passed directly to
``viz.fit_line`` when computing the best-fit line.
pp_kws : dict, optional
Dictionary of keyword arguments passed directly to
``viz.plot_pos`` when computing the plotting positions.
scatter_kws, line_kws : dict, optional
Dictionary of keyword arguments passed directly to ``ax.plot``
when drawing the scatter points and best-fit line, respectively.
Other Parameters
----------------
color : string, optional
A directly-specified matplotlib color argument for both the
data series and the best-fit line if drawn. This argument is
made available for compatibility for the seaborn package and
is not recommended for general use. Instead colors should be
specified within ``scatter_kws`` and ``line_kws``.
.. note::
Users should not specify this parameter. It is inteded to
only be used by seaborn when operating within a
``FacetGrid``.
label : string, optional
A directly-specified legend label for the data series. This
argument is made available for compatibility for the seaborn
package and is not recommended for general use. Instead the
data series label should be specified within ``scatter_kws``.
.. note::
Users should not specify this parameter. It is inteded to
only be used by seaborn when operating within a
``FacetGrid``.
Returns
-------
fig : matplotlib.Figure
The figure on which the plot was drawn.
result : dict of linear fit results, optional
Keys are:
- q : array of quantiles
- x, y : arrays of data passed to function
- xhat, yhat : arrays of modeled data plotted in best-fit line
- res : array of coeffcients of the best-fit line.
See also
--------
viz.plot_pos
viz.fit_line
numpy.polyfit
scipy.stats.probplot
scipy.stats.mstats.plotting_positions
Examples
--------
Probability plot with the probabilities on the y-axis
.. plot::
:context: close-figs
>>> import numpy; numpy.random.seed(0)
>>> from matplotlib import pyplot
>>> from scipy import stats
>>> from probscale.viz import probplot
>>> data = numpy.random.normal(loc=5, scale=1.25, size=37)
>>> fig = probplot(data, plottype='prob', probax='y',
... problabel='Non-exceedance probability',
... datalabel='Observed values', bestfit=True,
... line_kws=dict(linestyle='--', linewidth=2),
... scatter_kws=dict(marker='o', alpha=0.5))
Quantile plot with the quantiles on the x-axis
.. plot::
:context: close-figs
>>> fig = probplot(data, plottype='qq', probax='x',
... problabel='Theoretical Quantiles',
... datalabel='Observed values', bestfit=True,
... line_kws=dict(linestyle='-', linewidth=2),
... scatter_kws=dict(marker='s', alpha=0.5))
"""
if dist is None:
dist = _minimal_norm
# check input values
fig, ax = validate.axes_object(ax)
probax = validate.axis_name(probax, 'probability axis')
problabel = validate.axis_label(problabel)
datalabel = validate.axis_label(datalabel)
# default values for symbology options
scatter_kws = validate.other_options(scatter_kws)
line_kws = validate.other_options(line_kws)
pp_kws = validate.other_options(pp_kws)
# check plottype
plottype = validate.axis_type(plottype)
# !-- kwarg that only seaborn should use --!
_color = fgkwargs.get('color', None)
if _color is not None:
scatter_kws['color'] = _color
line_kws['color'] = _color
# !-- kwarg that only seaborn should use --!
_label = fgkwargs.get('label', None)
if _label is not None:
scatter_kws['label'] = _label
# !-- kwarg that only seaborn should use --!
_marker = fgkwargs.get('marker', None)
if _marker is not None:
scatter_kws['marker'] = _marker
# compute the plotting positions and sort the data
probs, datavals = plot_pos(data, **pp_kws)
qntls = dist.ppf(probs)
# determine how the probability values should be expressed
if plottype == 'qq':
probvals = qntls
else:
probvals = probs * 100
# set up x, y, Axes for probabilities on the x
if probax == 'x':
x, y = probvals, datavals
ax.set_xlabel(problabel)
ax.set_ylabel(datalabel)
if plottype == 'prob':
ax.set_xscale('prob', dist=dist)
fitprobs = 'x'
else:
fitprobs = None
if plottype == 'pp':
ax.set_xlim(left=0, right=100)
ax.set_yscale(datascale)
fitlogs = 'y' if datascale == 'log' else None
# setup x, y, Axes for probabilities on the y
elif probax == 'y':
y, x = probvals, datavals
ax.set_xlabel(datalabel)
ax.set_ylabel(problabel)
if plottype == 'prob':
ax.set_yscale('prob', dist=dist)
fitprobs = 'y'
else:
fitprobs = None
if plottype == 'pp':
ax.set_ylim(bottom=0, top=100)
ax.set_xscale(datascale)
fitlogs = 'x' if datascale == 'log' else None
# finally plot the data
linestyle = scatter_kws.pop('linestyle', 'none')
marker = scatter_kws.pop('marker', 'o')
ax.plot(x, y, linestyle=linestyle, marker=marker, **scatter_kws)
# maybe do a best-fit and plot
if bestfit:
xhat, yhat, model = fit_line(x, y, xhat=sorted(x), dist=dist,
fitprobs=fitprobs, fitlogs=fitlogs,
estimate_ci=estimate_ci)
ax.plot(xhat, yhat, **line_kws)
if estimate_ci:
# for alpha, use half of existing or 0.5 * 0.5 = 0.25
# for zorder, use 1 less than existing or 1 - 1 = 0
opts = {
'facecolor': line_kws.get('color', 'k'),
'edgecolor': 'None',
'alpha': line_kws.get('alpha', 0.5) * 0.5,
'zorder': line_kws.get('zorder', 1) - 1,
'label': '95% conf. interval'
}
ax.fill_between(xhat, y1=model['yhat_hi'], y2=model['yhat_lo'],
**opts)
else:
xhat, yhat, model = (None, None, None)
# set the probability axes limits
if plottype == 'prob':
_set_prob_limits(ax, probax, len(probs))
# return the figure and maybe results of the best-fit
if return_best_fit_results:
results = dict(q=qntls, x=x, y=y, xhat=xhat, yhat=yhat, res=model)
return fig, results
else:
return fig
def plot_pos(data, postype=None, alpha=None, beta=None, exceedance=False):
"""
Compute the plotting positions for a dataset. Heavily borrows from
``scipy.stats.mstats.plotting_positions``.
A plottiting position is defined as: ``(i-alpha)/(n+1-alpha-beta)``
where:
- ``i`` is the rank order
- ``n`` is the size of the dataset
- ``alpha`` and ``beta`` are parameters used to adjust the
positions.
The values of ``alpha`` and ``beta`` can be explicitly set. Typical
values can also be access via the ``postype`` parameter. Available
``postype`` values (alpha, beta) are:
"type 4" (alpha=0, beta=1)
Linear interpolation of the empirical CDF.
"type 5" or "hazen" (alpha=0.5, beta=0.5)
Piecewise linear interpolation.
"type 6" or "weibull" (alpha=0, beta=0)
Weibull plotting positions. Unbiased exceedance probability
for all distributions. Recommended for hydrologic
applications.
"type 7" (alpha=1, beta=1)
The default values in R. Not recommended with probability
scales as the min and max data points get plotting positions
of 0 and 1, respectively, and therefore cannot be shown.
"type 8" (alpha=1/3, beta=1/3)
Approximately median-unbiased.
"type 9" or "blom" (alpha=0.375, beta=0.375)
Approximately unbiased positions if the data are normally
distributed.
"median" (alpha=0.3175, beta=0.3175)
Median exceedance probabilities for all distributions
(used in ``scipy.stats.probplot``).
"apl" or "pwm" (alpha=0.35, beta=0.35)
Used with probability-weighted moments.
"cunnane" (alpha=0.4, beta=0.4)
Nearly unbiased quantiles for normally distributed data.
This is the default value.
"gringorten" (alpha=0.44, beta=0.44)
Used for Gumble distributions.
Parameters
----------
data : array-like
The values whose plotting positions need to be computed.
postype : string, optional (default: "cunnane")
alpha, beta : float, optional
Custom plotting position parameters is the options available
through the `postype` parameter are insufficient.
exceedance : bool, optional (default: False)
Toggles "exceedance" vs "non-exceedance" probabilily plots.
By default, non-exceedance plots are drawn where the plot
generally slopes from the lower left to the upper right,
and show the probability that a new observation will be
less than a given point. By contrast, exceedance plots show
the probabilty that a new observation will be greater than
a given point.
Returns
-------
plot_pos : numpy.array
The computed plotting positions, sorted.
data_sorted : numpy.array
The original data values, sorted.
References
----------
http://artax.karlin.mff.cuni.cz/r-help/library/lmomco/html/pp.html
http://astrostatistics.psu.edu/su07/R/html/stats/html/quantile.html
http://docs.scipy.org/doc/scipy-0.17.0/reference/generated/scipy.stats.probplot.html
http://docs.scipy.org/doc/scipy-0.17.0/reference/generated/scipy.stats.mstats.plotting_positions.html
"""
pos_params = {
'type 4': (0, 1),
'type 5': (0.5, 0.5),
'type 6': (0, 0),
'type 7': (1, 1),
'type 8': (1.0 / 3.0, 1.0 / 3.0),
'type 9': (0.375, 0.375),
'weibull': (0, 0),
'median': (0.3175, 0.3175),
'apl': (0.35, 0.35),
'pwm': (0.35, 0.35),
'blom': (0.375, 0.375),
'hazen': (0.5, 0.5),
'cunnane': (0.4, 0.4),
'gringorten': (0.44, 0.44), # Gumble
}
postype = 'cunnane' if postype is None else postype
if alpha is None and beta is None:
alpha, beta = pos_params[postype.lower()]
data = numpy.asarray(data, dtype=float).flatten()
n = data.shape[0]
pos = numpy.empty_like(data)
pos[n:] = 0
sorted_index = data.argsort()
pos[sorted_index[:n]] = (
(numpy.arange(1.0, n + 1.0) - alpha) / (n + 1.0 - alpha - beta)
)
if exceedance:
return pos[sorted_index[::-1]], data[sorted_index]
return pos[sorted_index], data[sorted_index]
def _set_prob_limits(ax, probax, N):
""" Sets the limits of a probabilty axis based the number of point.
Parameters
----------
ax : matplotlib Axes
The Axes object that will be modified.
N : int
Maximum number of points for the series plotted on the Axes.
which : string
The axis whose ticklabels will be rotated. Valid values are 'x',
'y', or 'both'.
Returns
-------
None
"""
fig, ax = validate.axes_object(ax)
which = validate.axis_name(probax, 'probability axis')
if N <= 5:
minval = 10
elif N <= 10:
minval = 5
else:
minval = 10 ** (-1 * numpy.ceil(numpy.log10(N) - 2))
if which in ['x', 'both']:
ax.set_xlim(left=minval, right=100 - minval)
elif which in ['y', 'both']:
ax.set_ylim(bottom=minval, top=100 - minval)
def fit_line(x, y, xhat=None, fitprobs=None, fitlogs=None, dist=None,
estimate_ci=False, niter=10000, alpha=0.05):
"""
Fits a line to x-y data in various forms (linear, log, prob scales).
Parameters
----------
x, y : array-like
Independent and dependent data, respectively.
xhat : array-like, optional
The values at which ``yhat`` should should be estimated. If
not provided, falls back to the sorted values of ``x``.
fitprobs, fitlogs : str, optional.
Defines how data should be transformed. Valid values are
'x', 'y', or 'both'. If using ``fitprobs``, variables should
be expressed as a percentage, i.e.,
for a probablility transform, data will be transformed with
``lambda x: dist.ppf(x / 100.)``.
For a log transform, ``lambda x: numpy.log(x)``.
Take care to not pass the same value to both ``fitlogs`` and
``figprobs`` as both transforms will be applied.
dist : distribution, optional
A fully-spec'd scipy.stats distribution-like object
such that ``dist.ppf`` and ``dist.cdf`` can be called. If not
provided, defaults to a minimal implementation of
``scipt.stats.norm``.
estimate_ci : bool, optional (False)
Estimate and draw a confidence band around the best-fit line
using a percentile bootstrap.
niter : int, optional (default = 10000)
Number of bootstrap iterations if ``estimate_ci`` is provided.
alpha : float, optional (default = 0.05)
The confidence level of the bootstrap estimate.
Returns
-------
xhat, yhat : numpy arrays
Linear model estimates of ``x`` and ``y``.
results : dict
Dictionary of linear fit results. Keys include:
- slope
- intersept
- yhat_lo (lower confidence interval of the estimated y-vals)
- yhat_hi (upper confidence interval of the estimated y-vals)
"""
fitprobs = validate.fit_argument(fitprobs, "fitprobs")
fitlogs = validate.fit_argument(fitlogs, "fitlogs")
# maybe set xhat to default values
if xhat is None:
xhat = copy.copy(x)
# maybe set dist to default value
if dist is None:
dist = _minimal_norm
# maybe compute ppf of x
if fitprobs in ['x', 'both']:
x = dist.ppf(x / 100.)
xhat = dist.ppf(numpy.array(xhat) / 100.)
# maybe compute ppf of y
if fitprobs in ['y', 'both']:
y = dist.ppf(y / 100.)
# maybe compute log of x
if fitlogs in ['x', 'both']:
x = numpy.log(x)
# maybe compute log of y
if fitlogs in ['y', 'both']:
y = numpy.log(y)
yhat, results = algo._fit_simple(x, y, xhat, fitlogs=fitlogs)
if estimate_ci:
yhat_lo, yhat_hi = algo._bs_fit(x, y, xhat, fitlogs=fitlogs,
niter=niter, alpha=alpha)
else:
yhat_lo, yhat_hi = None, None
# maybe undo the ppf transform
if fitprobs in ['y', 'both']:
yhat = 100. * dist.cdf(yhat)
if yhat_lo is not None:
yhat_lo = 100. * dist.cdf(yhat_lo)
yhat_hi = 100. * dist.cdf(yhat_hi)
# maybe undo ppf transform
if fitprobs in ['x', 'both']:
xhat = 100. * dist.cdf(xhat)
results['yhat_lo'] = yhat_lo
results['yhat_hi'] = yhat_hi
return xhat, yhat, results