-
-
Notifications
You must be signed in to change notification settings - Fork 284
/
components.py
251 lines (212 loc) · 8.52 KB
/
components.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""Schema components for polars."""
from __future__ import annotations
import logging
from typing import Any, Optional
import polars as pl
from pandera.api.base.types import CheckList
from pandera.api.dataframe.components import ComponentSchema
from pandera.api.polars.types import PolarsCheckObjects, PolarsDtypeInputTypes
from pandera.backends.polars.register import register_polars_backends
from pandera.config import config_context, get_config_context
from pandera.engines import polars_engine
from pandera.utils import is_regex
logger = logging.getLogger(__name__)
class Column(ComponentSchema[PolarsCheckObjects]):
    """Polars column schema component.

    Validates a single column (or a set of regex-matched columns) within a
    polars ``DataFrame`` / ``LazyFrame`` when used inside a
    ``DataFrameSchema``.
    """

    def __init__(
        self,
        dtype: PolarsDtypeInputTypes = None,
        checks: Optional[CheckList] = None,
        nullable: bool = False,
        unique: bool = False,
        coerce: bool = False,
        required: bool = True,
        name: Optional[str] = None,
        regex: bool = False,
        title: Optional[str] = None,
        description: Optional[str] = None,
        default: Optional[Any] = None,
        metadata: Optional[dict] = None,
        drop_invalid_rows: bool = False,
        **column_kwargs,
    ) -> None:
        """Create column validator object.

        :param dtype: datatype of the column. The datatype for type-checking
            a dataframe. All `polars datatypes <https://docs.pola.rs/py-polars/html/reference/datatypes.html>`__,
            supported built-in python types that are supported by polars,
            and the pandera polars engine :ref:`datatypes <polars-dtypes>`.
        :param checks: checks to verify validity of the column
        :param nullable: Whether or not column can contain null values.
        :param unique: whether column values should be unique
        :param coerce: If True, when schema.validate is called the column will
            be coerced into the specified dtype. This has no effect on columns
            where ``dtype=None``.
        :param required: Whether or not column is allowed to be missing
        :param name: column name in dataframe to validate. Names in the format
            '^{regex_pattern}$' are treated as regular expressions. During
            validation, this schema will be applied to any columns matching this
            pattern.
        :param regex: whether the ``name`` attribute should be treated as a
            regex pattern to apply to multiple columns in a dataframe. If the
            name is a regular expression, this attribute will automatically be
            set to True.
        :param title: A human-readable label for the column.
        :param description: An arbitrary textual description of the column.
        :param default: The default value for missing values in the column.
        :param metadata: An optional key value data.
        :param drop_invalid_rows: if True, drop invalid rows on validation.

        :raises SchemaInitError: if impossible to build schema from parameters

        :example:

        >>> import polars as pl
        >>> import pandera.polars as pa
        >>>
        >>>
        >>> schema = pa.DataFrameSchema({
        ...     "column": pa.Column(str)
        ... })
        >>>
        >>> schema.validate(pl.DataFrame({"column": ["foo", "bar"]}))
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ str    │
        ╞════════╡
        │ foo    │
        │ bar    │
        └────────┘

        See :ref:`here<column>` for more usage details.
        """
        super().__init__(
            dtype=dtype,
            checks=checks,
            nullable=nullable,
            unique=unique,
            coerce=coerce,
            name=name,
            title=title,
            description=description,
            default=default,
            metadata=metadata,
            drop_invalid_rows=drop_invalid_rows,
            **column_kwargs,
        )
        self.required = required
        self.regex = regex
        self.name = name

        # If `name` is itself a regex pattern, flip `regex` to True so the
        # schema is applied to all matching columns.
        self.set_regex()

    def _register_default_backends(self):
        # Lazily register the polars validation backends the first time a
        # backend is requested for this component.
        register_polars_backends()

    def validate(
        self,
        check_obj: PolarsCheckObjects,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> PolarsCheckObjects:
        """Validate a Column in a DataFrame object.

        :param check_obj: polars LazyFrame to validate.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of validation,
            otherwise creates a copy of the data.

        :returns: validated DataFrame.
        """
        # Eager DataFrames are validated through the lazy path; the backend
        # operates on LazyFrames.
        is_dataframe = isinstance(check_obj, pl.DataFrame)

        if is_dataframe:
            check_obj = check_obj.lazy()

        # Resolve the effective validation depth from the ambient config and
        # pin it for the duration of this validation call.
        config_ctx = get_config_context(validation_depth_default=None)
        validation_depth = config_ctx.validation_depth
        with config_context(validation_depth=validation_depth):
            output = self.get_backend(check_obj).validate(
                check_obj,
                self,
                head=head,
                tail=tail,
                sample=sample,
                random_state=random_state,
                lazy=lazy,
                inplace=inplace,
            )
        return output

    @property
    def properties(self) -> dict[str, Any]:
        """Get column properties."""
        return {
            "dtype": self.dtype,
            "parsers": self.parsers,
            "checks": self.checks,
            "nullable": self.nullable,
            "unique": self.unique,
            "report_duplicates": self.report_duplicates,
            "coerce": self.coerce,
            "required": self.required,
            "name": self.name,
            "regex": self.regex,
            "title": self.title,
            "description": self.description,
            "default": self.default,
            "metadata": self.metadata,
        }

    @property
    def dtype(self):
        """The pandera-engine-resolved datatype of the column (or None)."""
        return self._dtype

    @dtype.setter
    def dtype(self, value) -> None:
        # Coerce any user-supplied dtype spec through the polars engine so
        # downstream code always sees a normalized pandera dtype.
        self._dtype = polars_engine.Engine.dtype(value) if value else None

    @property
    def selector(self):
        """Column selector expression used to match dataframe columns.

        If ``regex=True`` but the name is not already in ``^...$`` form,
        wrap it so it is treated as a full-match regular expression.
        """
        if self.name is not None and not is_regex(self.name) and self.regex:
            return f"^{self.name}$"
        return self.name

    def set_regex(self):
        """Enable regex matching when ``name`` is a ``^...$`` pattern."""
        if self.name is None:
            return

        if is_regex(self.name) and not self.regex:
            logger.info(
                f"Column schema '{self.name}' is a regex expression. "
                "Setting regex=True."
            )
            self.regex = True

    def set_name(self, name: str):
        """Set the name of the schema.

        If the name is a regex starting with '^' and ending with '$'
        set the regex attribute to True.
        """
        self.name = name
        self.set_regex()
        return self

    def strategy(self, *, size=None):
        """Create a ``hypothesis`` strategy for generating a Column.

        :param size: number of elements to generate
        :returns: a dataframe strategy for a single column.

        .. warning::

           This method is not implemented in the polars backend.
        """
        raise NotImplementedError(
            "Data synthesis is not supported with polars schemas."
        )

    def strategy_component(self):
        """Generate column data object for use by DataFrame strategy.

        .. warning::

           This method is not implemented in the polars backend.
        """
        raise NotImplementedError(
            "Data synthesis is not supported with polars schemas."
        )

    def example(self, size=None):
        """Generate an example of a particular size.

        :param size: number of elements in the generated data.

        .. warning::

           This method is not implemented in the polars backend.
        """
        # pylint: disable=import-outside-toplevel,cyclic-import,import-error
        raise NotImplementedError(
            "Data synthesis is not supported with polars schemas."
        )