-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
_core.py
1477 lines (1169 loc) · 51.3 KB
/
_core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import warnings
import itertools
from copy import copy
from functools import partial
from collections.abc import Iterable, Sequence, Mapping
from numbers import Number
from datetime import datetime
from distutils.version import LooseVersion
import numpy as np
import pandas as pd
import matplotlib as mpl
from ._decorators import (
share_init_params_with_map,
)
from .palettes import (
QUAL_PALETTES,
color_palette,
cubehelix_palette,
_parse_cubehelix_args,
)
from .utils import (
get_color_cycle,
remove_na,
)
class SemanticMapping:
"""Base class for mapping data values to plot attributes."""
# -- Default attributes that all SemanticMapping subclasses must set
# Whether the mapping is numeric, categorical, or datetime
map_type = None
# Ordered list of unique values in the input data
levels = None
# A mapping from the data values to corresponding plot attributes
lookup_table = None
def __init__(self, plotter):
# TODO Putting this here so we can continue to use a lot of the
# logic that's built into the library, but the idea of this class
# is to move towards semantic mappings that are agnositic about the
# kind of plot they're going to be used to draw.
# Fully achieving that is going to take some thinking.
self.plotter = plotter
def map(cls, plotter, *args, **kwargs):
# This method is assigned the __init__ docstring
method_name = "_{}_map".format(cls.__name__[:-7].lower())
setattr(plotter, method_name, cls(plotter, *args, **kwargs))
return plotter
def _lookup_single(self, key):
"""Apply the mapping to a single data value."""
return self.lookup_table[key]
def __call__(self, key, *args, **kwargs):
"""Get the attribute(s) values for the data key."""
if isinstance(key, (list, np.ndarray, pd.Series)):
return [self._lookup_single(k, *args, **kwargs) for k in key]
else:
return self._lookup_single(key, *args, **kwargs)
@share_init_params_with_map
class HueMapping(SemanticMapping):
"""Mapping that sets artist colors according to data values."""
# A specification of the colors that should appear in the plot
palette = None
# An object that normalizes data values to [0, 1] range for color mapping
norm = None
# A continuous colormap object for interpolating in a numeric context
cmap = None
def __init__(
self, plotter, palette=None, order=None, norm=None,
):
"""Map the levels of the `hue` variable to distinct colors.
Parameters
----------
# TODO add generic parameters
"""
super().__init__(plotter)
data = plotter.plot_data.get("hue", pd.Series(dtype=float))
if data.notna().any():
map_type = self.infer_map_type(
palette, norm, plotter.input_format, plotter.var_types["hue"]
)
# Our goal is to end up with a dictionary mapping every unique
# value in `data` to a color. We will also keep track of the
# metadata about this mapping we will need for, e.g., a legend
# --- Option 1: numeric mapping with a matplotlib colormap
if map_type == "numeric":
data = pd.to_numeric(data)
levels, lookup_table, norm, cmap = self.numeric_mapping(
data, palette, norm,
)
# --- Option 2: categorical mapping using seaborn palette
elif map_type == "categorical":
cmap = norm = None
levels, lookup_table = self.categorical_mapping(
data, palette, order,
)
# --- Option 3: datetime mapping
else:
# TODO this needs actual implementation
cmap = norm = None
levels, lookup_table = self.categorical_mapping(
# Casting data to list to handle differences in the way
# pandas and numpy represent datetime64 data
list(data), palette, order,
)
self.map_type = map_type
self.lookup_table = lookup_table
self.palette = palette
self.levels = levels
self.norm = norm
self.cmap = cmap
def _lookup_single(self, key):
"""Get the color for a single value, using colormap to interpolate."""
try:
# Use a value that's in the original data vector
value = self.lookup_table[key]
except KeyError:
# Use the colormap to interpolate between existing datapoints
# (e.g. in the context of making a continuous legend)
try:
normed = self.norm(key)
except TypeError as err:
if np.isnan(key):
value = (0, 0, 0, 0)
else:
raise err
else:
if np.ma.is_masked(normed):
normed = np.nan
value = self.cmap(normed)
return value
def infer_map_type(self, palette, norm, input_format, var_type):
"""Determine how to implement the mapping."""
if palette in QUAL_PALETTES:
map_type = "categorical"
elif norm is not None:
map_type = "numeric"
elif isinstance(palette, (dict, list)):
map_type = "categorical"
elif input_format == "wide":
map_type = "categorical"
else:
map_type = var_type
return map_type
def categorical_mapping(self, data, palette, order):
"""Determine colors when the hue mapping is categorical."""
# -- Identify the order and name of the levels
levels = categorical_order(data, order)
n_colors = len(levels)
# -- Identify the set of colors to use
if isinstance(palette, dict):
missing = set(levels) - set(palette)
if any(missing):
err = "The palette dictionary is missing keys: {}"
raise ValueError(err.format(missing))
lookup_table = palette
else:
if palette is None:
if n_colors <= len(get_color_cycle()):
colors = color_palette(None, n_colors)
else:
colors = color_palette("husl", n_colors)
elif isinstance(palette, list):
if len(palette) != n_colors:
err = "The palette list has the wrong number of colors."
raise ValueError(err)
colors = palette
else:
colors = color_palette(palette, n_colors)
lookup_table = dict(zip(levels, colors))
return levels, lookup_table
def numeric_mapping(self, data, palette, norm):
"""Determine colors when the hue variable is quantitative."""
if isinstance(palette, dict):
# The presence of a norm object overrides a dictionary of hues
# in specifying a numeric mapping, so we need to process it here.
levels = list(sorted(palette))
colors = [palette[k] for k in sorted(palette)]
cmap = mpl.colors.ListedColormap(colors)
lookup_table = palette.copy()
else:
# The levels are the sorted unique values in the data
levels = list(np.sort(remove_na(data.unique())))
# --- Sort out the colormap to use from the palette argument
# Default numeric palette is our default cubehelix palette
# TODO do we want to do something complicated to ensure contrast?
palette = "ch:" if palette is None else palette
if isinstance(palette, mpl.colors.Colormap):
cmap = palette
elif str(palette).startswith("ch:"):
args, kwargs = _parse_cubehelix_args(palette)
cmap = cubehelix_palette(0, *args, as_cmap=True, **kwargs)
else:
try:
cmap = mpl.cm.get_cmap(palette)
except (ValueError, TypeError):
err = f"Palette {palette} not understood"
raise ValueError(err)
# Now sort out the data normalization
if norm is None:
norm = mpl.colors.Normalize()
elif isinstance(norm, tuple):
norm = mpl.colors.Normalize(*norm)
elif not isinstance(norm, mpl.colors.Normalize):
err = "``hue_norm`` must be None, tuple, or Normalize object."
raise ValueError(err)
if not norm.scaled():
norm(np.asarray(data.dropna()))
lookup_table = dict(zip(levels, cmap(norm(levels))))
return levels, lookup_table, norm, cmap
@share_init_params_with_map
class SizeMapping(SemanticMapping):
"""Mapping that sets artist sizes according to data values."""
# An object that normalizes data values to [0, 1] range
norm = None
def __init__(
self, plotter, sizes=None, order=None, norm=None,
):
"""Map the levels of the `size` variable to distinct values.
Parameters
----------
# TODO add generic parameters
"""
super().__init__(plotter)
data = plotter.plot_data.get("size", pd.Series(dtype=float))
if data.notna().any():
map_type = self.infer_map_type(
norm, sizes, plotter.var_types["size"]
)
# --- Option 1: numeric mapping
if map_type == "numeric":
levels, lookup_table, norm = self.numeric_mapping(
data, sizes, norm,
)
# --- Option 2: categorical mapping
elif map_type == "categorical":
levels, lookup_table = self.categorical_mapping(
data, sizes, order,
)
# --- Option 3: datetime mapping
# TODO this needs an actual implementation
else:
levels, lookup_table = self.categorical_mapping(
# Casting data to list to handle differences in the way
# pandas and numpy represent datetime64 data
list(data), sizes, order,
)
self.map_type = map_type
self.levels = levels
self.norm = norm
self.sizes = sizes
self.lookup_table = lookup_table
def infer_map_type(self, norm, sizes, var_type):
if norm is not None:
map_type = "numeric"
elif isinstance(sizes, (dict, list)):
map_type = "categorical"
else:
map_type = var_type
return map_type
def _lookup_single(self, key):
try:
value = self.lookup_table[key]
except KeyError:
normed = self.norm(key)
if np.ma.is_masked(normed):
normed = np.nan
size_values = self.lookup_table.values()
size_range = min(size_values), max(size_values)
value = size_range[0] + normed * np.ptp(size_range)
return value
def categorical_mapping(self, data, sizes, order):
levels = categorical_order(data, order)
if isinstance(sizes, dict):
# Dict inputs map existing data values to the size attribute
missing = set(levels) - set(sizes)
if any(missing):
err = f"Missing sizes for the following levels: {missing}"
raise ValueError(err)
lookup_table = sizes.copy()
elif isinstance(sizes, list):
# List inputs give size values in the same order as the levels
if len(sizes) != len(levels):
err = "The `sizes` list has the wrong number of values."
raise ValueError(err)
lookup_table = dict(zip(levels, sizes))
else:
if isinstance(sizes, tuple):
# Tuple input sets the min, max size values
if len(sizes) != 2:
err = "A `sizes` tuple must have only 2 values"
raise ValueError(err)
elif sizes is not None:
err = f"Value for `sizes` not understood: {sizes}"
raise ValueError(err)
else:
# Otherwise, we need to get the min, max size values from
# the plotter object we are attached to.
# TODO this is going to cause us trouble later, because we
# want to restructure things so that the plotter is generic
# across the visual representation of the data. But at this
# point, we don't know the visual representation. Likely we
# want to change the logic of this Mapping so that it gives
# points on a nornalized range that then gets unnormalized
# when we know what we're drawing. But given the way the
# package works now, this way is cleanest.
sizes = self.plotter._default_size_range
# For categorical sizes, use regularly-spaced linear steps
# between the minimum and maximum sizes. Then reverse the
# ramp so that the largest value is used for the first entry
# in size_order, etc. This is because "ordered" categoricals
# are often though to go in decreasing priority.
sizes = np.linspace(*sizes, len(levels))[::-1]
lookup_table = dict(zip(levels, sizes))
return levels, lookup_table
def numeric_mapping(self, data, sizes, norm):
if isinstance(sizes, dict):
# The presence of a norm object overrides a dictionary of sizes
# in specifying a numeric mapping, so we need to process it
# dictionary here
levels = list(np.sort(list(sizes)))
size_values = sizes.values()
size_range = min(size_values), max(size_values)
else:
# The levels here will be the unique values in the data
levels = list(np.sort(remove_na(data.unique())))
if isinstance(sizes, tuple):
# For numeric inputs, the size can be parametrized by
# the minimum and maximum artist values to map to. The
# norm object that gets set up next specifies how to
# do the mapping.
if len(sizes) != 2:
err = "A `sizes` tuple must have only 2 values"
raise ValueError(err)
size_range = sizes
elif sizes is not None:
err = f"Value for `sizes` not understood: {sizes}"
raise ValueError(err)
else:
# When not provided, we get the size range from the plotter
# object we are attached to. See the note in the categorical
# method about how this is suboptimal for future development.:
size_range = self.plotter._default_size_range
# Now that we know the minimum and maximum sizes that will get drawn,
# we need to map the data values that we have into that range. We will
# use a matplotlib Normalize class, which is typically used for numeric
# color mapping but works fine here too. It takes data values and maps
# them into a [0, 1] interval, potentially nonlinear-ly.
if norm is None:
# Default is a linear function between the min and max data values
norm = mpl.colors.Normalize()
elif isinstance(norm, tuple):
# It is also possible to give different limits in data space
norm = mpl.colors.Normalize(*norm)
elif not isinstance(norm, mpl.colors.Normalize):
err = f"Value for size `norm` parameter not understood: {norm}"
raise ValueError(err)
else:
# If provided with Normalize object, copy it so we can modify
norm = copy(norm)
# Set the mapping so all output values are in [0, 1]
norm.clip = True
# If the input range is not set, use the full range of the data
if not norm.scaled():
norm(levels)
# Map from data values to [0, 1] range
sizes_scaled = norm(levels)
# Now map from the scaled range into the artist units
if isinstance(sizes, dict):
lookup_table = sizes
else:
lo, hi = size_range
sizes = lo + sizes_scaled * (hi - lo)
lookup_table = dict(zip(levels, sizes))
return levels, lookup_table, norm
@share_init_params_with_map
class StyleMapping(SemanticMapping):
"""Mapping that sets artist style according to data values."""
# Style mapping is always treated as categorical
map_type = "categorical"
def __init__(
self, plotter, markers=None, dashes=None, order=None,
):
"""Map the levels of the `style` variable to distinct values.
Parameters
----------
# TODO add generic parameters
"""
super().__init__(plotter)
data = plotter.plot_data.get("style", pd.Series(dtype=float))
if data.notna().any():
# Cast to list to handle numpy/pandas datetime quirks
if variable_type(data) == "datetime":
data = list(data)
# Find ordered unique values
levels = categorical_order(data, order)
markers = self._map_attributes(
markers, levels, unique_markers(len(levels)), "markers",
)
dashes = self._map_attributes(
dashes, levels, unique_dashes(len(levels)), "dashes",
)
# Build the paths matplotlib will use to draw the markers
paths = {}
filled_markers = []
for k, m in markers.items():
if not isinstance(m, mpl.markers.MarkerStyle):
m = mpl.markers.MarkerStyle(m)
paths[k] = m.get_path().transformed(m.get_transform())
filled_markers.append(m.is_filled())
# Mixture of filled and unfilled markers will show line art markers
# in the edge color, which defaults to white. This can be handled,
# but there would be additional complexity with specifying the
# weight of the line art markers without overwhelming the filled
# ones with the edges. So for now, we will disallow mixtures.
if any(filled_markers) and not all(filled_markers):
err = "Filled and line art markers cannot be mixed"
raise ValueError(err)
lookup_table = {}
for key in levels:
lookup_table[key] = {}
if markers:
lookup_table[key]["marker"] = markers[key]
lookup_table[key]["path"] = paths[key]
if dashes:
lookup_table[key]["dashes"] = dashes[key]
self.levels = levels
self.lookup_table = lookup_table
def _lookup_single(self, key, attr=None):
"""Get attribute(s) for a given data point."""
if attr is None:
value = self.lookup_table[key]
else:
value = self.lookup_table[key][attr]
return value
def _map_attributes(self, arg, levels, defaults, attr):
"""Handle the specification for a given style attribute."""
if arg is True:
lookup_table = dict(zip(levels, defaults))
elif isinstance(arg, dict):
missing = set(levels) - set(arg)
if missing:
err = f"These `{attr}` levels are missing values: {missing}"
raise ValueError(err)
lookup_table = arg
elif isinstance(arg, Sequence):
if len(levels) != len(arg):
err = f"The `{attr}` argument has the wrong number of values"
raise ValueError(err)
lookup_table = dict(zip(levels, arg))
elif arg:
err = f"This `{attr}` argument was not understood: {arg}"
raise ValueError(err)
else:
lookup_table = {}
return lookup_table
# =========================================================================== #
class VectorPlotter:
"""Base class for objects underlying *plot functions."""
_semantic_mappings = {
"hue": HueMapping,
"size": SizeMapping,
"style": StyleMapping,
}
# TODO units is another example of a non-mapping "semantic"
# we need a general name for this and separate handling
semantics = "x", "y", "hue", "size", "style", "units"
wide_structure = {
"x": "@index", "y": "@values", "hue": "@columns", "style": "@columns",
}
flat_structure = {"x": "@index", "y": "@values"}
_default_size_range = 1, 2 # Unused but needed in tests, ugh
def __init__(self, data=None, variables={}):
self.assign_variables(data, variables)
for var, cls in self._semantic_mappings.items():
# Create the mapping function
map_func = partial(cls.map, plotter=self)
setattr(self, f"map_{var}", map_func)
# Call the mapping function to initialize with default values
getattr(self, f"map_{var}")()
self._var_levels = {}
@classmethod
def get_semantics(cls, kwargs, semantics=None):
"""Subset a dictionary` arguments with known semantic variables."""
# TODO this should be get_variables since we have included x and y
if semantics is None:
semantics = cls.semantics
variables = {}
for key, val in kwargs.items():
if key in semantics and val is not None:
variables[key] = val
return variables
@property
def has_xy_data(self):
"""Return True at least one of x or y is defined."""
return bool({"x", "y"} & set(self.variables))
@property
def var_levels(self):
"""Property interface to ordered list of variables levels.
Each time it's accessed, it updates the var_levels dictionary with the
list of levels in the current semantic mappers. But it also allows the
dictionary to persist, so it can be used to set levels by a key. This is
used to track the list of col/row levels using an attached FacetGrid
object, but it's kind of messy and ideally fixed by improving the
faceting logic so it interfaces better with the modern approach to
tracking plot variables.
"""
for var in self.variables:
try:
map_obj = getattr(self, f"_{var}_map")
self._var_levels[var] = map_obj.levels
except AttributeError:
pass
return self._var_levels
def assign_variables(self, data=None, variables={}):
"""Define plot variables, optionally using lookup from `data`."""
x = variables.get("x", None)
y = variables.get("y", None)
if x is None and y is None:
self.input_format = "wide"
plot_data, variables = self._assign_variables_wideform(
data, **variables,
)
else:
self.input_format = "long"
plot_data, variables = self._assign_variables_longform(
data, **variables,
)
self.plot_data = plot_data
self.variables = variables
self.var_types = {
v: variable_type(
plot_data[v],
boolean_type="numeric" if v in "xy" else "categorical"
)
for v in variables
}
return self
def _assign_variables_wideform(self, data=None, **kwargs):
"""Define plot variables given wide-form data.
Parameters
----------
data : flat vector or collection of vectors
Data can be a vector or mapping that is coerceable to a Series
or a sequence- or mapping-based collection of such vectors, or a
rectangular numpy array, or a Pandas DataFrame.
kwargs : variable -> data mappings
Behavior with keyword arguments is currently undefined.
Returns
-------
plot_data : :class:`pandas.DataFrame`
Long-form data object mapping seaborn variables (x, y, hue, ...)
to data vectors.
variables : dict
Keys are defined seaborn variables; values are names inferred from
the inputs (or None when no name can be determined).
"""
# Raise if semantic or other variables are assigned in wide-form mode
assigned = [k for k, v in kwargs.items() if v is not None]
if any(assigned):
s = "s" if len(assigned) > 1 else ""
err = f"The following variable{s} cannot be assigned with wide-form data: "
err += ", ".join(f"`{v}`" for v in assigned)
raise ValueError(err)
# Determine if the data object actually has any data in it
empty = data is None or not len(data)
# Then, determine if we have "flat" data (a single vector)
if isinstance(data, dict):
values = data.values()
else:
values = np.atleast_1d(np.asarray(data, dtype=object))
flat = not any(
isinstance(v, Iterable) and not isinstance(v, (str, bytes))
for v in values
)
if empty:
# Make an object with the structure of plot_data, but empty
plot_data = pd.DataFrame()
variables = {}
elif flat:
# Handle flat data by converting to pandas Series and using the
# index and/or values to define x and/or y
# (Could be accomplished with a more general to_series() interface)
flat_data = pd.Series(data).copy()
names = {
"@values": flat_data.name,
"@index": flat_data.index.name
}
plot_data = {}
variables = {}
for var in ["x", "y"]:
if var in self.flat_structure:
attr = self.flat_structure[var]
plot_data[var] = getattr(flat_data, attr[1:])
variables[var] = names[self.flat_structure[var]]
plot_data = pd.DataFrame(plot_data)
else:
# Otherwise assume we have some collection of vectors.
# Handle Python sequences such that entries end up in the columns,
# not in the rows, of the intermediate wide DataFrame.
# One way to accomplish this is to convert to a dict of Series.
if isinstance(data, Sequence):
data_dict = {}
for i, var in enumerate(data):
key = getattr(var, "name", i)
# TODO is there a safer/more generic way to ensure Series?
# sort of like np.asarray, but for pandas?
data_dict[key] = pd.Series(var)
data = data_dict
# Pandas requires that dict values either be Series objects
# or all have the same length, but we want to allow "ragged" inputs
if isinstance(data, Mapping):
data = {key: pd.Series(val) for key, val in data.items()}
# Otherwise, delegate to the pandas DataFrame constructor
# This is where we'd prefer to use a general interface that says
# "give me this data as a pandas DataFrame", so we can accept
# DataFrame objects from other libraries
wide_data = pd.DataFrame(data, copy=True)
# At this point we should reduce the dataframe to numeric cols
numeric_cols = wide_data.apply(variable_type) == "numeric"
wide_data = wide_data.loc[:, numeric_cols]
# Now melt the data to long form
melt_kws = {"var_name": "@columns", "value_name": "@values"}
use_index = "@index" in self.wide_structure.values()
if use_index:
melt_kws["id_vars"] = "@index"
try:
orig_categories = wide_data.columns.categories
orig_ordered = wide_data.columns.ordered
wide_data.columns = wide_data.columns.add_categories("@index")
except AttributeError:
category_columns = False
else:
category_columns = True
wide_data["@index"] = wide_data.index.to_series()
plot_data = wide_data.melt(**melt_kws)
if use_index and category_columns:
plot_data["@columns"] = pd.Categorical(plot_data["@columns"],
orig_categories,
orig_ordered)
# Assign names corresponding to plot semantics
for var, attr in self.wide_structure.items():
plot_data[var] = plot_data[attr]
# Define the variable names
variables = {}
for var, attr in self.wide_structure.items():
obj = getattr(wide_data, attr[1:])
variables[var] = getattr(obj, "name", None)
# Remove redundant columns from plot_data
plot_data = plot_data[list(variables)]
return plot_data, variables
def _assign_variables_longform(self, data=None, **kwargs):
"""Define plot variables given long-form data and/or vector inputs.
Parameters
----------
data : dict-like collection of vectors
Input data where variable names map to vector values.
kwargs : variable -> data mappings
Keys are seaborn variables (x, y, hue, ...) and values are vectors
in any format that can construct a :class:`pandas.DataFrame` or
names of columns or index levels in ``data``.
Returns
-------
plot_data : :class:`pandas.DataFrame`
Long-form data object mapping seaborn variables (x, y, hue, ...)
to data vectors.
variables : dict
Keys are defined seaborn variables; values are names inferred from
the inputs (or None when no name can be determined).
Raises
------
ValueError
When variables are strings that don't appear in ``data``.
"""
plot_data = {}
variables = {}
# Data is optional; all variables can be defined as vectors
if data is None:
data = {}
# TODO should we try a data.to_dict() or similar here to more
# generally accept objects with that interface?
# Note that dict(df) also works for pandas, and gives us what we
# want, whereas DataFrame.to_dict() gives a nested dict instead of
# a dict of series.
# Variables can also be extraced from the index attribute
# TODO is this the most general way to enable it?
# There is no index.to_dict on multiindex, unfortunately
try:
index = data.index.to_frame()
except AttributeError:
index = {}
# The caller will determine the order of variables in plot_data
for key, val in kwargs.items():
if isinstance(val, (str, bytes)):
# String inputs trigger __getitem__
if val in data:
# First try to get an entry in the data object
plot_data[key] = data[val]
variables[key] = val
elif val in index:
# Failing that, try to get an entry in the index object
plot_data[key] = index[val]
variables[key] = val
else:
# We don't know what this name means
err = f"Could not interpret value `{val}` for parameter `{key}`"
raise ValueError(err)
else:
# Otherwise, assume the value is itself a vector of data
# Raise when data is present and a vector can't be combined with it
if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series):
if val is not None and len(data) != len(val):
val_cls = val.__class__.__name__
err = (
f"Length of {val_cls} vectors must match length of `data`"
f" when both are used, but `data` has length {len(data)}"
f" and the vector passed to `{key}` has length {len(val)}."
)
raise ValueError(err)
plot_data[key] = val
# Try to infer the name of the variable
variables[key] = getattr(val, "name", None)
# Construct a tidy plot DataFrame. This will convert a number of
# types automatically, aligning on index in case of pandas objects
plot_data = pd.DataFrame(plot_data)
# Reduce the variables dictionary to fields with valid data
variables = {
var: name
for var, name in variables.items()
if plot_data[var].notnull().any()
}
return plot_data, variables
def iter_data(
self, grouping_vars=None, reverse=False, from_comp_data=False,
):
"""Generator for getting subsets of data defined by semantic variables.
Also injects "col" and "row" into grouping semantics.
Parameters
----------
grouping_vars : string or list of strings
Semantic variables that define the subsets of data.
reverse : bool, optional
If True, reverse the order of iteration.
from_comp_data : bool, optional
If True, use self.comp_data rather than self.plot_data
Yields
------
sub_vars : dict
Keys are semantic names, values are the level of that semantic.
sub_data : :class:`pandas.DataFrame`
Subset of ``plot_data`` for this combination of semantic values.
"""
# TODO should this default to using all (non x/y?) semantics?
# or define groupping vars somewhere?
if grouping_vars is None:
grouping_vars = []
elif isinstance(grouping_vars, str):
grouping_vars = [grouping_vars]
elif isinstance(grouping_vars, tuple):
grouping_vars = list(grouping_vars)
# Always insert faceting variables
facet_vars = {"col", "row"}
grouping_vars.extend(
facet_vars & set(self.variables) - set(grouping_vars)
)
# Reduce to the semantics used in this plot
grouping_vars = [
var for var in grouping_vars if var in self.variables
]
if from_comp_data:
data = self.comp_data
else:
data = self.plot_data
if grouping_vars:
grouped_data = data.groupby(
grouping_vars, sort=False, as_index=False
)
grouping_keys = []
for var in grouping_vars:
grouping_keys.append(self.var_levels.get(var, []))
iter_keys = itertools.product(*grouping_keys)
if reverse:
iter_keys = reversed(list(iter_keys))
for key in iter_keys:
# Pandas fails with singleton tuple inputs
pd_key = key[0] if len(key) == 1 else key
try:
data_subset = grouped_data.get_group(pd_key)