/
dataframe.py
3144 lines (2871 loc) · 105 KB
/
dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Module houses ``DataFrame`` class, that is distributed version of ``pandas.DataFrame``."""
from __future__ import annotations
import pandas
from pandas.core.common import apply_if_callable
from pandas.core.dtypes.common import (
infer_dtype_from_object,
is_dict_like,
is_list_like,
is_numeric_dtype,
)
from pandas.core.indexes.frozen import FrozenList
from pandas.util._validators import validate_bool_kwarg
from pandas.io.formats.info import DataFrameInfo
from pandas._libs.lib import no_default, NoDefault
from pandas._typing import (
CompressionOptions,
WriteBuffer,
FilePath,
StorageOptions,
)
import datetime
import re
import itertools
import functools
import numpy as np
import sys
from typing import IO, Optional, Union, Iterator, Hashable, Sequence
import warnings
from modin.pandas import Categorical
from modin.error_message import ErrorMessage
from modin.utils import (
_inherit_docstrings,
to_pandas,
hashable,
MODIN_UNNAMED_SERIES_LABEL,
try_cast_to_pandas,
)
from modin.config import IsExperimental, PersistentPickle
from .utils import (
from_pandas,
from_non_pandas,
broadcast_item,
cast_function_modin2pandas,
SET_DATAFRAME_ATTRIBUTE_WARNING,
)
from .iterator import PartitionIterator
from .series import Series
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
from .groupby import DataFrameGroupBy
from .accessor import CachedAccessor, SparseFrameAccessor
@_inherit_docstrings(
pandas.DataFrame, excluded=[pandas.DataFrame.__init__], apilink="pandas.DataFrame"
)
class DataFrame(BasePandasDataset):
"""
Modin distributed representation of ``pandas.DataFrame``.
Internally, the data can be divided into partitions along both columns and rows
in order to parallelize computations and utilize the user's hardware as much as possible.
Inherit common for ``DataFrame``-s and ``Series`` functionality from the
`BasePandasDataset` class.
Parameters
----------
data : DataFrame, Series, pandas.DataFrame, ndarray, Iterable or dict, optional
Dict can contain ``Series``, arrays, constants, dataclass or list-like objects.
If data is a dict, column order follows insertion-order.
index : Index or array-like, optional
Index to use for resulting frame. Will default to ``RangeIndex`` if no
indexing information part of input data and no index provided.
columns : Index or array-like, optional
Column labels to use for resulting frame. Will default to
``RangeIndex`` if no column labels are provided.
dtype : str, np.dtype, or pandas.ExtensionDtype, optional
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool, default: False
Copy data from inputs. Only affects ``pandas.DataFrame`` / 2d ndarray input.
query_compiler : BaseQueryCompiler, optional
A query compiler object to create the ``DataFrame`` from.
Notes
-----
``DataFrame`` can be created either from passed `data` or `query_compiler`. If both
parameters are provided, data source will be prioritized in the next order:
1) Modin ``DataFrame`` or ``Series`` passed with `data` parameter.
2) Query compiler from the `query_compiler` parameter.
3) Various pandas/NumPy/Python data structures passed with `data` parameter.
The last option is less desirable since import of such data structures is very
inefficient, please use previously created Modin structures from the fist two
options or import data using highly efficient Modin IO tools (for example
``pd.read_csv``).
"""
_pandas_class = pandas.DataFrame
def __init__(
    self,
    data=None,
    index=None,
    columns=None,
    dtype=None,
    copy=None,
    query_compiler=None,
):
    """
    Build a ``DataFrame`` from `data` or from an existing `query_compiler`.

    See the class docstring for the full parameter descriptions and for the
    priority order applied when both `data` and `query_compiler` are given.
    """
    # Imported lazily to avoid a circular import at module load time.
    from modin.numpy import array

    # Siblings are other dataframes that share the same query compiler. We
    # use this list to update inplace when there is a shallow copy.
    self._siblings = []
    if isinstance(data, (DataFrame, Series)):
        # Fast path: reuse the existing query compiler rather than
        # re-importing the data.
        self._query_compiler = data._query_compiler.copy()
        if index is not None and any(i not in data.index for i in index):
            raise NotImplementedError(
                "Passing non-existant columns or index values to constructor not"
                + " yet implemented."
            )
        if isinstance(data, Series):
            # We set the column name if it is not in the provided Series
            if data.name is None:
                self.columns = [0] if columns is None else columns
            # If the columns provided are not in the named Series, pandas clears
            # the DataFrame and sets columns to the columns provided.
            elif columns is not None and data.name not in columns:
                self._query_compiler = from_pandas(
                    pandas.DataFrame(columns=columns)
                )._query_compiler
            if index is not None:
                self._query_compiler = data.loc[index]._query_compiler
        elif columns is None and index is None:
            # Shallow copy: register as a sibling so inplace updates on
            # either frame propagate to the other.
            data._add_sibling(self)
        else:
            if columns is not None and any(i not in data.columns for i in columns):
                raise NotImplementedError(
                    "Passing non-existant columns or index values to constructor not"
                    + " yet implemented."
                )
            # Select the requested subset via `.loc`; a null slice leaves
            # the corresponding axis untouched.
            if index is None:
                index = slice(None)
            if columns is None:
                columns = slice(None)
            self._query_compiler = data.loc[index, columns]._query_compiler
    elif isinstance(data, array):
        # Modin's numpy-compatible array: share its query compiler.
        self._query_compiler = data._query_compiler.copy()
        if copy is not None and not copy:
            data._add_sibling(self)
        if columns is not None and not isinstance(columns, pandas.Index):
            columns = pandas.Index(columns)
        if columns is not None:
            self.set_axis(columns, axis=1, inplace=True)
        if index is not None:
            self.set_axis(index, axis=0, inplace=True)
        if dtype is not None:
            casted_obj = self.astype(dtype, copy=False)
            self._query_compiler = casted_obj._query_compiler
    # Check type of data and use appropriate constructor
    elif query_compiler is None:
        distributed_frame = from_non_pandas(data, index, columns, dtype)
        if distributed_frame is not None:
            self._query_compiler = distributed_frame._query_compiler
            return
        # Importing plain Python/NumPy structures is expensive; warn the user.
        warnings.warn(
            "Distributing {} object. This may take some time.".format(type(data))
        )
        if isinstance(data, pandas.Index):
            pass
        elif (
            is_list_like(data)
            and not is_dict_like(data)
            and not isinstance(data, np.ndarray)
        ):
            # Materialize any Modin Series elements to pandas before handing
            # the list-like to the pandas constructor below.
            old_dtype = getattr(data, "dtype", None)
            values = [
                obj._to_pandas() if isinstance(obj, Series) else obj for obj in data
            ]
            try:
                data = type(data)(values, dtype=old_dtype)
            except TypeError:
                data = values
        elif is_dict_like(data) and not isinstance(
            data, (pandas.Series, Series, pandas.DataFrame, DataFrame)
        ):
            if columns is not None:
                data = {key: value for key, value in data.items() if key in columns}
            if len(data) and all(isinstance(v, Series) for v in data.values()):
                # Every value is a Modin Series: concatenate them directly
                # and skip the round-trip through pandas entirely.
                from .general import concat

                new_qc = concat(
                    data.values(), axis=1, keys=data.keys()
                )._query_compiler
                if dtype is not None:
                    new_qc = new_qc.astype({col: dtype for col in new_qc.columns})
                if index is not None:
                    new_qc = new_qc.reindex(axis=0, labels=index)
                if columns is not None:
                    new_qc = new_qc.reindex(axis=1, labels=columns)
                self._query_compiler = new_qc
                return
            data = {
                k: v._to_pandas() if isinstance(v, Series) else v
                for k, v in data.items()
            }
        # Fall back to the pandas constructor, then distribute the result.
        pandas_df = pandas.DataFrame(
            data=data, index=index, columns=columns, dtype=dtype, copy=copy
        )
        self._query_compiler = from_pandas(pandas_df)._query_compiler
    else:
        self._query_compiler = query_compiler
def __repr__(self):
    """
    Return a string representation for a particular ``DataFrame``.

    Returns
    -------
    str
    """
    row_limit = pandas.get_option("display.max_rows") or len(self.index)
    col_limit = pandas.get_option("display.max_columns") or len(self.columns)
    rendered = repr(self._build_repr_df(row_limit, col_limit))
    if len(self.index) <= row_limit and len(self.columns) <= col_limit:
        return rendered
    # The frame was truncated for display: drop the dimensions pandas
    # printed for the sample and substitute the full frame's dimensions.
    body = rendered.rsplit("\n\n", 1)[0]
    return body + "\n\n[{0} rows x {1} columns]".format(
        len(self.index), len(self.columns)
    )
def _repr_html_(self):  # pragma: no cover
    """
    Return a html representation for a particular ``DataFrame``.

    Returns
    -------
    str
    """
    row_limit = pandas.get_option("display.max_rows") or 60
    col_limit = pandas.get_option("display.max_columns") or 20
    # Render a bounded sample through pandas' own HTML formatter.
    html = self._build_repr_df(row_limit, col_limit)._repr_html_()
    if len(self.index) <= row_limit and len(self.columns) <= col_limit:
        return html
    # Replace the truncated sample's footer with the full dimensions.
    prefix = html.split("<p>")[0]
    return prefix + "<p>{0} rows x {1} columns</p>\n</div>".format(
        len(self.index), len(self.columns)
    )
def _get_columns(self):
    """
    Get the columns for this ``DataFrame``.

    Returns
    -------
    pandas.Index
        The union of all indexes across the partitions.
    """
    # Column labels live on the query compiler, not on this wrapper.
    return self._query_compiler.columns
def _set_columns(self, new_columns):
    """
    Set the columns for this ``DataFrame``.

    Parameters
    ----------
    new_columns : list-like, Index
        Labels to install as the new columns.
    """
    # Delegate to the query compiler, which owns the axis labels.
    self._query_compiler.columns = new_columns

columns = property(_get_columns, _set_columns)
@property
def ndim(self):  # noqa: RT01, D200
    """
    Return the number of dimensions of the underlying data, by definition 2.
    """
    # A DataFrame is two-dimensional by construction.
    return 2
def drop_duplicates(
    self, subset=None, keep="first", inplace=False, ignore_index=False
):  # noqa: PR01, RT01, D200
    """
    Return ``DataFrame`` with duplicate rows removed.
    """
    # The shared implementation lives on the base dataset class.
    return super().drop_duplicates(
        subset=subset, keep=keep, inplace=inplace, ignore_index=ignore_index
    )
@property
def dtypes(self):  # noqa: RT01, D200
    """
    Return the dtypes in the ``DataFrame``.
    """
    # Per-column dtypes are tracked by the query compiler.
    return self._query_compiler.dtypes
def duplicated(self, subset=None, keep="first"):  # noqa: PR01, RT01, D200
    """
    Return boolean ``Series`` denoting duplicate rows.
    """
    # Restrict the comparison to `subset` columns when one is given.
    target = self if subset is None else self[subset]
    dup_qc = target._query_compiler.duplicated(keep=keep)
    # The result is one value per row, i.e. a Series.
    return self._reduce_dimension(dup_qc)
@property
def empty(self):  # noqa: RT01, D200
    """
    Indicate whether ``DataFrame`` is empty.
    """
    # Empty means either axis has zero labels.
    return len(self.index) == 0 or len(self.columns) == 0
@property
def axes(self):  # noqa: RT01, D200
    """
    Return a list representing the axes of the ``DataFrame``.
    """
    # Axis 0 is the row index, axis 1 is the column labels.
    return [self.index, self.columns]
@property
def shape(self):  # noqa: RT01, D200
    """
    Return a tuple representing the dimensionality of the ``DataFrame``.
    """
    # (number of rows, number of columns)
    return (len(self.index), len(self.columns))
def add_prefix(self, prefix):  # noqa: PR01, RT01, D200
    """
    Prefix labels with string `prefix`.
    """
    prefixed_qc = self._query_compiler.add_prefix(prefix)
    return self.__constructor__(query_compiler=prefixed_qc)
def add_suffix(self, suffix):  # noqa: PR01, RT01, D200
    """
    Suffix labels with string `suffix`.
    """
    suffixed_qc = self._query_compiler.add_suffix(suffix)
    return self.__constructor__(query_compiler=suffixed_qc)
def applymap(self, func, na_action: Optional[str] = None, **kwargs):
    """
    Apply `func` to the ``DataFrame`` elementwise.
    """
    # Reject non-callables up front with the same error pandas-style code expects.
    if not callable(func):
        raise ValueError("'{0}' object is not callable".format(type(func)))
    mapped_qc = self._query_compiler.applymap(func, na_action=na_action, **kwargs)
    return self.__constructor__(query_compiler=mapped_qc)
def apply(
    self, func, axis=0, raw=False, result_type=None, args=(), **kwargs
):  # noqa: PR01, RT01, D200
    """
    Apply a function along an axis of the ``DataFrame``.
    """
    func = cast_function_modin2pandas(func)
    axis = self._get_axis_number(axis)
    query_compiler = super(DataFrame, self).apply(
        func,
        axis=axis,
        broadcast=None,
        raw=raw,
        reduce=None,
        result_type=result_type,
        convert_dtype=None,
        args=args,
        **kwargs,
    )
    if not isinstance(query_compiler, type(self._query_compiler)):
        # A scalar was returned
        return query_compiler
    # `result_type` pins the output container when the caller asked for one.
    if result_type == "reduce":
        output_type = Series
    elif result_type == "broadcast":
        output_type = DataFrame
    # the 'else' branch also handles 'result_type == "expand"' since it makes the output type
    # depend on the `func` result (Series for a scalar, DataFrame for list-like)
    else:
        # An axis consisting solely of the unnamed-series marker label means
        # the result collapsed along that axis, so it should be a Series.
        reduced_index = pandas.Index([MODIN_UNNAMED_SERIES_LABEL])
        if query_compiler.get_axis(axis).equals(
            reduced_index
        ) or query_compiler.get_axis(axis ^ 1).equals(reduced_index):
            output_type = Series
        else:
            output_type = DataFrame
    return output_type(query_compiler=query_compiler)
def groupby(
    self,
    by=None,
    axis=0,
    level=None,
    as_index=True,
    sort=True,
    group_keys=no_default,
    squeeze: bool = no_default,
    observed=False,
    dropna: bool = True,
):  # noqa: PR01, RT01, D200
    """
    Group ``DataFrame`` using a mapper or by a ``Series`` of columns.
    """
    if squeeze is not no_default:
        warnings.warn(
            (
                "The `squeeze` parameter is deprecated and "
                + "will be removed in a future version."
            ),
            FutureWarning,
            stacklevel=2,
        )
    else:
        squeeze = False
    axis = self._get_axis_number(axis)
    idx_name = None
    # Drop here indicates whether or not to drop the data column before doing the
    # groupby. The typical pandas behavior is to drop when the data came from this
    # dataframe. When a string, Series directly from this dataframe, or list of
    # strings is passed in, the data used for the groupby is dropped before the
    # groupby takes place.
    drop = False
    if (
        not isinstance(by, (pandas.Series, Series))
        and is_list_like(by)
        and len(by) == 1
    ):
        # A single-element list behaves like its one element.
        by = by[0]
    if callable(by):
        # Map the callable over the index to obtain the grouping labels.
        by = self.index.map(by)
    elif hashable(by) and not isinstance(by, (pandas.Grouper, FrozenList)):
        drop = by in self.columns
        idx_name = by
        if by is not None and by in self._query_compiler.get_index_names(axis):
            # In this case we pass the string value of the name through to the
            # partitions. This is more efficient than broadcasting the values.
            level, by = by, None
        elif level is None:
            by = self.__getitem__(by)._query_compiler
    elif isinstance(by, Series):
        drop = by._parent is self
        idx_name = by.name
        by = by._query_compiler
    elif isinstance(by, pandas.Grouper):
        drop = by.key in self
    elif is_list_like(by):
        # fastpath for multi column groupby
        if axis == 0 and all(
            (
                (hashable(o) and (o in self))
                or isinstance(o, Series)
                or (isinstance(o, pandas.Grouper) and o.key in self)
                or (is_list_like(o) and len(o) == len(self.axes[axis]))
            )
            for o in by
        ):
            # `has_external` becomes True when at least one key does not come
            # from this dataframe's own columns.
            has_external = False
            processed_by = []
            for current_by in by:
                if isinstance(current_by, pandas.Grouper):
                    processed_by.append(current_by)
                    has_external = True
                elif hashable(current_by):
                    processed_by.append(current_by)
                elif isinstance(current_by, Series):
                    if current_by._parent is self:
                        processed_by.append(current_by.name)
                    else:
                        processed_by.append(current_by._query_compiler)
                        has_external = True
                else:
                    has_external = True
                    processed_by.append(current_by)
            by = processed_by
            if not has_external:
                # Every key is one of this frame's columns: group by a
                # projection of the frame itself.
                by = self[processed_by]._query_compiler
                drop = True
        else:
            mismatch = len(by) != len(self.axes[axis])
            if mismatch and all(
                hashable(obj)
                and (
                    obj in self or obj in self._query_compiler.get_index_names(axis)
                )
                for obj in by
            ):
                # In the future, we will need to add logic to handle this, but for now
                # we default to pandas in this case.
                pass
            elif mismatch and any(
                hashable(obj) and obj not in self.columns for obj in by
            ):
                names = [o.name if isinstance(o, Series) else o for o in by]
                raise KeyError(next(x for x in names if x not in self))
    return DataFrameGroupBy(
        self,
        by,
        axis,
        level,
        as_index,
        sort,
        group_keys,
        squeeze,
        idx_name,
        observed=observed,
        drop=drop,
        dropna=dropna,
    )
def keys(self):  # noqa: RT01, D200
    """
    Get columns of the ``DataFrame``.
    """
    # For a DataFrame the "keys" are the column labels.
    return self.columns
def transpose(self, copy=False, *args):  # noqa: PR01, RT01, D200
    """
    Transpose index and columns.
    """
    # FIXME: per the pandas docs `*args` exists only for compatibility and
    # does not affect the result; it should not be forwarded to the query
    # compiler.
    transposed_qc = self._query_compiler.transpose(*args)
    return self.__constructor__(query_compiler=transposed_qc)

T = property(transpose)
def add(
    self, other, axis="columns", level=None, fill_value=None
):  # noqa: PR01, RT01, D200
    """
    Get addition of ``DataFrame`` and `other`, element-wise (binary operator `add`).
    """
    # A Series operand must be broadcast along the requested axis.
    needs_broadcast = isinstance(other, Series)
    return self._binary_op(
        "add",
        other,
        axis=axis,
        level=level,
        fill_value=fill_value,
        broadcast=needs_broadcast,
    )
def append(
    self, other, ignore_index=False, verify_integrity=False, sort=False
):  # noqa: PR01, RT01, D200
    """
    Append rows of `other` to the end of caller, returning a new object.

    `other` may be a ``Series``/dict (appended as a single row), a list of
    datasets, or a ``DataFrame``. Raises ``TypeError`` for an unnamed
    ``Series`` without ``ignore_index=True``, and ``ValueError`` when
    ``verify_integrity=True`` and the combined index has duplicates.
    """
    if sort is False:
        warnings.warn(
            "Due to https://github.com/pandas-dev/pandas/issues/35092, "
            + "Pandas ignores sort=False; Modin correctly does not sort."
        )
    if isinstance(other, (Series, dict)):
        if isinstance(other, dict):
            other = Series(other)
        if other.name is None and not ignore_index:
            raise TypeError(
                "Can only append a Series if ignore_index=True"
                + " or if the Series has a name"
            )
        if other.name is not None:
            # other must have the same index name as self, otherwise
            # index name will be reset
            name = other.name
            # We must transpose here because a Series becomes a new row, and the
            # structure of the query compiler is currently columnar
            other = other._query_compiler.transpose()
            other.index = pandas.Index([name], name=self.index.name)
        else:
            # See note above about transpose
            other = other._query_compiler.transpose()
    elif isinstance(other, list):
        if not all(isinstance(o, BasePandasDataset) for o in other):
            other = self.__constructor__(pandas.DataFrame(other))._query_compiler
        else:
            other = [obj._query_compiler for obj in other]
    else:
        other = other._query_compiler
    # If ignore_index is False, by definition the Index will be correct.
    # We also do this first to ensure that we don't waste compute/memory.
    if verify_integrity and not ignore_index:
        appended_index = (
            self.index.append(other.index)
            if not isinstance(other, list)
            else self.index.append([o.index for o in other])
        )
        # Compute the duplicate mask once; the original scanned it lazily and
        # then recomputed `duplicated()` a second time for the error message.
        duplicate_mask = appended_index.duplicated()
        if duplicate_mask.any():
            raise ValueError(
                "Indexes have overlapping values: {}".format(
                    appended_index[duplicate_mask]
                )
            )
    query_compiler = self._query_compiler.concat(
        0, other, ignore_index=ignore_index, sort=sort
    )
    return self.__constructor__(query_compiler=query_compiler)
def assign(self, **kwargs):  # noqa: PR01, RT01, D200
    """
    Assign new columns to a ``DataFrame``.
    """
    result = self.copy()
    # Callables are evaluated against the frame built so far, so later
    # assignments can refer to columns created by earlier ones.
    for name, value in kwargs.items():
        result[name] = value(result) if callable(value) else value
    return result
def boxplot(
    self,
    column=None,
    by=None,
    ax=None,
    fontsize=None,
    rot=0,
    grid=True,
    figsize=None,
    layout=None,
    return_type=None,
    backend=None,
    **kwargs,
):  # noqa: PR01, RT01, D200
    """
    Make a box plot from ``DataFrame`` columns.
    """
    # Plotting is not distributed: materialize to pandas and delegate.
    pandas_self = to_pandas(self)
    return pandas_self.boxplot(
        column=column,
        by=by,
        ax=ax,
        fontsize=fontsize,
        rot=rot,
        grid=grid,
        figsize=figsize,
        layout=layout,
        return_type=return_type,
        backend=backend,
        **kwargs,
    )
def combine(
    self, other, func, fill_value=None, overwrite=True
):  # noqa: PR01, RT01, D200
    """
    Perform column-wise combine with another ``DataFrame``.
    """
    # Shared implementation lives on the base dataset class.
    return super().combine(
        other, func, fill_value=fill_value, overwrite=overwrite
    )
def compare(
    self,
    other,
    align_axis=1,
    keep_shape: bool = False,
    keep_equal: bool = False,
    result_names=("self", "other"),
) -> "DataFrame":  # noqa: PR01, RT01, D200
    """
    Compare to another ``DataFrame`` and show the differences.
    """
    if not isinstance(other, DataFrame):
        raise TypeError(f"Cannot compare DataFrame to {type(other)}")
    # Align the operand against our axes before comparing.
    aligned_other = self._validate_other(other, 0, compare_index=True)
    result_qc = self._query_compiler.compare(
        aligned_other,
        align_axis=align_axis,
        keep_shape=keep_shape,
        keep_equal=keep_equal,
        result_names=result_names,
    )
    return self.__constructor__(query_compiler=result_qc)
def corr(
    self, method="pearson", min_periods=1, numeric_only=no_default
):  # noqa: PR01, RT01, D200
    """
    Compute pairwise correlation of columns, excluding NA/null values.
    """
    # Only the numeric-only path is distributed; anything else defers
    # to pandas.
    if numeric_only:
        return self.__constructor__(
            query_compiler=self._query_compiler.corr(
                method=method,
                min_periods=min_periods,
            )
        )
    return self._default_to_pandas(
        pandas.DataFrame.corr,
        method=method,
        min_periods=min_periods,
        numeric_only=numeric_only,
    )
def corrwith(
    self, other, axis=0, drop=False, method="pearson", numeric_only=no_default
):  # noqa: PR01, RT01, D200
    """
    Compute pairwise correlation.
    """
    if not isinstance(other, (Series, DataFrame)):
        raise TypeError(f"unsupported type: {type(other)}")
    result_qc = self._query_compiler.corrwith(
        other=other._query_compiler,
        axis=axis,
        drop=drop,
        method=method,
        numeric_only=numeric_only,
    )
    return self.__constructor__(query_compiler=result_qc)
def cov(
    self, min_periods=None, ddof: int = 1, numeric_only=no_default
):  # noqa: PR01, RT01, D200
    """
    Compute pairwise covariance of columns, excluding NA/null values.
    """
    if not numeric_only:
        # Non-default `numeric_only` handling is deferred to pandas.
        return self._default_to_pandas(
            pandas.DataFrame.cov,
            min_periods=min_periods,
            ddof=ddof,
            numeric_only=numeric_only,
        )
    # Restrict the computation to numeric columns.
    non_numeric = [
        col for col in self.dtypes.index if not is_numeric_dtype(self.dtypes[col])
    ]
    cov_df = self.drop(columns=non_numeric)
    if min_periods is not None and min_periods > len(cov_df):
        # Too few observations for every pair: the whole matrix is NaN.
        n = cov_df.shape[1]
        return cov_df.__constructor__(np.full((n, n), np.nan))
    return cov_df.__constructor__(
        query_compiler=cov_df._query_compiler.cov(
            min_periods=min_periods, ddof=ddof
        )
    )
def dot(self, other):  # noqa: PR01, RT01, D200
    """
    Compute the matrix multiplication between the ``DataFrame`` and `other`.
    """
    if isinstance(other, BasePandasDataset):
        common = self.columns.union(other.index)
        # The union growing means some labels were missing on one side.
        if len(common) > len(self.columns) or len(common) > len(other.index):
            raise ValueError("Matrices are not aligned")
        aligned_qc = other.reindex(index=common)._query_compiler
        # A Series operand collapses the result down one dimension.
        squeeze_other = not isinstance(other, DataFrame)
        result_qc = self._query_compiler.dot(
            aligned_qc, squeeze_self=False, squeeze_other=squeeze_other
        )
        if squeeze_other:
            return self._reduce_dimension(query_compiler=result_qc)
        return self.__constructor__(query_compiler=result_qc)
    other = np.asarray(other)
    if self.shape[1] != other.shape[0]:
        raise ValueError(
            "Dot product shape mismatch, {} vs {}".format(self.shape, other.shape)
        )
    result_qc = self._query_compiler.dot(other, squeeze_self=False)
    # A 1-D array operand yields a Series; 2-D yields a DataFrame.
    if other.ndim > 1:
        return self.__constructor__(query_compiler=result_qc)
    return self._reduce_dimension(query_compiler=result_qc)
def eq(self, other, axis="columns", level=None):  # noqa: PR01, RT01, D200
    """
    Perform equality comparison of ``DataFrame`` and `other` (binary operator `eq`).
    """
    needs_broadcast = isinstance(other, Series)
    return self._binary_op(
        "eq", other, axis=axis, level=level, broadcast=needs_broadcast
    )
def equals(self, other):  # noqa: PR01, RT01, D200
    """
    Test whether two objects contain the same elements.
    """
    if isinstance(other, pandas.DataFrame):
        # Copy into a Modin DataFrame to simplify logic below
        other = self.__constructor__(other)
    # Cheap axis-label checks first; only compare values if both match.
    if not self.index.equals(other.index):
        return False
    if not self.columns.equals(other.columns):
        return False
    return self.eq(other).all().all()
def _update_var_dicts_in_kwargs(self, expr, kwargs):
    """
    Copy variables with "@" prefix in `local_dict` and `global_dict` keys of kwargs.

    Parameters
    ----------
    expr : str
        The expression string to search variables with "@" prefix.
    kwargs : dict
        See the documentation for eval() for complete details on the keyword arguments accepted by query().
    """
    if "@" not in expr:
        return
    frame = sys._getframe()
    try:
        # Walk four frames up to reach the user's calling frame.
        # NOTE(review): this depth is tied to the current call chain
        # (user -> query/eval -> ... -> here); changing any intermediate
        # call layer would break the variable resolution.
        f_locals = frame.f_back.f_back.f_back.f_back.f_locals
        f_globals = frame.f_back.f_back.f_back.f_back.f_globals
    finally:
        # Break the reference cycle created by holding a frame object.
        del frame
    local_names = set(re.findall(r"@([\w]+)", expr))
    local_dict = {}
    global_dict = {}
    for name in local_names:
        # A name may exist in locals, globals, both, or neither; missing
        # names are silently skipped and left for eval() to report.
        for dct_out, dct_in in ((local_dict, f_locals), (global_dict, f_globals)):
            try:
                dct_out[name] = dct_in[name]
            except KeyError:
                pass
    if local_dict:
        # Caller-supplied dicts take precedence over captured variables.
        local_dict.update(kwargs.get("local_dict") or {})
        kwargs["local_dict"] = local_dict
    if global_dict:
        global_dict.update(kwargs.get("global_dict") or {})
        kwargs["global_dict"] = global_dict
def eval(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
    """
    Evaluate a string describing operations on ``DataFrame`` columns.
    """
    self._validate_eval_query(expr, **kwargs)
    inplace = validate_bool_kwarg(inplace, "inplace")
    self._update_var_dicts_in_kwargs(expr, kwargs)
    new_query_compiler = self._query_compiler.eval(expr, **kwargs)
    # Evaluate the same expression on an empty pandas frame with matching
    # dtypes to learn what container type pandas would return
    # ("DataFrame" or "Series").
    return_type = type(
        pandas.DataFrame(columns=self.columns)
        .astype(self.dtypes)
        .eval(expr, **kwargs)
    ).__name__
    if return_type == type(self).__name__:
        return self._create_or_update_from_compiler(new_query_compiler, inplace)
    else:
        if inplace:
            raise ValueError("Cannot operate inplace if there is no assignment")
        # Look up the Modin counterpart class (e.g. Series) in this module
        # by name and wrap the result in it.
        return getattr(sys.modules[self.__module__], return_type)(
            query_compiler=new_query_compiler
        )
def fillna(
    self,
    value=None,
    method=None,
    axis=None,
    inplace=False,
    limit=None,
    downcast=None,
):  # noqa: PR01, RT01, D200
    """
    Fill NA/NaN values using the specified method.
    """
    # The base implementation must squeeze a Series `value` to the right
    # shape; `self` is a frame and is never squeezed.
    squeeze_value = isinstance(value, Series)
    return super().fillna(
        squeeze_self=False,
        squeeze_value=squeeze_value,
        value=value,
        method=method,
        axis=axis,
        inplace=inplace,
        limit=limit,
        downcast=downcast,
    )
def floordiv(
    self, other, axis="columns", level=None, fill_value=None
):  # noqa: PR01, RT01, D200
    """
    Get integer division of ``DataFrame`` and `other`, element-wise (binary operator `floordiv`).
    """
    needs_broadcast = isinstance(other, Series)
    return self._binary_op(
        "floordiv",
        other,
        axis=axis,
        level=level,
        fill_value=fill_value,
        broadcast=needs_broadcast,
    )
@classmethod
def from_dict(
    cls, data, orient="columns", dtype=None, columns=None
):  # pragma: no cover # noqa: PR01, RT01, D200
    """
    Construct ``DataFrame`` from dict of array-like or dicts.
    """
    # Not distributed: build through pandas and then import the result.
    ErrorMessage.default_to_pandas("`from_dict`")
    pandas_frame = pandas.DataFrame.from_dict(
        data, orient=orient, dtype=dtype, columns=columns
    )
    return from_pandas(pandas_frame)
@classmethod
def from_records(
    cls,
    data,
    index=None,
    exclude=None,
    columns=None,
    coerce_float=False,
    nrows=None,
):  # pragma: no cover # noqa: PR01, RT01, D200
    """
    Convert structured or record ndarray to ``DataFrame``.
    """
    # Not distributed: build through pandas and then import the result.
    ErrorMessage.default_to_pandas("`from_records`")
    pandas_frame = pandas.DataFrame.from_records(
        data,
        index=index,
        exclude=exclude,
        columns=columns,
        coerce_float=coerce_float,
        nrows=nrows,
    )
    return from_pandas(pandas_frame)
def ge(self, other, axis="columns", level=None):  # noqa: PR01, RT01, D200
    """
    Get greater than or equal comparison of ``DataFrame`` and `other`, element-wise (binary operator `ge`).
    """
    needs_broadcast = isinstance(other, Series)
    return self._binary_op(
        "ge", other, axis=axis, level=level, broadcast=needs_broadcast
    )
def gt(self, other, axis="columns", level=None):  # noqa: PR01, RT01, D200
    """
    Get greater than comparison of ``DataFrame`` and `other`, element-wise (binary operator `gt`).
    """
    return self._binary_op(
        "gt", other, axis=axis, level=level, broadcast=isinstance(other, Series)
    )
def hist(
self,
column=None,
by=None,