-
-
Notifications
You must be signed in to change notification settings - Fork 284
/
pandas.py
105 lines (86 loc) · 2.91 KB
/
pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Module for inferring dataframe/series schema."""
from typing import overload
import pandas as pd
from pandera.api.pandas.array import SeriesSchema
from pandera.api.pandas.components import Column, Index, MultiIndex
from pandera.api.pandas.container import DataFrameSchema
from pandera.schema_statistics.pandas import (
infer_dataframe_statistics,
infer_series_statistics,
parse_check_statistics,
)
@overload
def infer_schema(
    pandas_obj: pd.Series,
) -> SeriesSchema:  # pragma: no cover
    ...


@overload
def infer_schema(  # type: ignore[misc]
    pandas_obj: pd.DataFrame,
) -> DataFrameSchema:  # pragma: no cover
    ...


def infer_schema(pandas_obj):
    """Build a schema describing a pandas DataFrame or Series.

    Dispatches on the runtime type of ``pandas_obj``: a DataFrame is passed
    to ``infer_dataframe_schema`` and a Series to ``infer_series_schema``.

    :param pandas_obj: DataFrame or Series object to infer a schema from.
    :returns: DataFrameSchema for a DataFrame, SeriesSchema for a Series.
    :raises: TypeError if pandas_obj is neither a DataFrame nor a Series.
    """
    # Guard clauses: the DataFrame test comes first, then the Series test.
    if isinstance(pandas_obj, pd.DataFrame):
        return infer_dataframe_schema(pandas_obj)
    if isinstance(pandas_obj, pd.Series):
        return infer_series_schema(pandas_obj)

    # Anything else is unsupported; report the offending type to the caller.
    found_type = type(pandas_obj)
    raise TypeError(
        "pandas_obj type not recognized. Expected a pandas DataFrame or "
        f"Series, found {found_type}"
    )
def _create_index(index_statistics):
    """Build the index component of a schema from index statistics.

    Every entry of ``index_statistics`` is converted into an ``Index`` using
    its ``"dtype"``, ``"checks"``, ``"nullable"`` and ``"name"`` values.

    :param index_statistics: iterable of mappings, one per Index to create.
    :returns: the lone ``Index`` when exactly one was built, otherwise a
        ``MultiIndex`` wrapping all of the built ``Index`` objects.
    """
    components = []
    for entry in index_statistics:
        components.append(
            Index(
                entry["dtype"],
                checks=parse_check_statistics(entry["checks"]),
                nullable=entry["nullable"],
                name=entry["name"],
            )
        )

    # Only a single component is returned bare; any other count is wrapped.
    if len(components) != 1:
        return MultiIndex(components)
    return components[0]
def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema:
    """Derive a DataFrameSchema from the inferred statistics of a DataFrame.

    :param df: DataFrame object to infer.
    :returns: DataFrameSchema with one Column per entry of the inferred
        column statistics, an index built from the inferred index
        statistics, ``coerce=True`` and ``_is_inferred`` set to True.
    """
    stats = infer_dataframe_statistics(df)

    # One Column per entry of the column statistics, keyed by column name.
    inferred_columns = {}
    for column_name, column_stats in stats["columns"].items():
        inferred_columns[column_name] = Column(
            column_stats["dtype"],
            checks=parse_check_statistics(column_stats["checks"]),
            nullable=column_stats["nullable"],
        )

    inferred_index = _create_index(stats["index"])
    inferred_schema = DataFrameSchema(
        columns=inferred_columns,
        index=inferred_index,
        coerce=True,
    )
    # Tag the schema as the product of inference.
    inferred_schema._is_inferred = True
    return inferred_schema
def infer_series_schema(series: pd.Series) -> SeriesSchema:
    """Infer a SeriesSchema from a pandas Series.

    :param series: Series object to infer.
    :returns: SeriesSchema built from the inferred dtype, checks,
        nullability and name of ``series``, with ``coerce=True``.
    """
    series_statistics = infer_series_statistics(series)
    schema = SeriesSchema(
        dtype=series_statistics["dtype"],
        checks=parse_check_statistics(series_statistics["checks"]),
        nullable=series_statistics["nullable"],
        name=series_statistics["name"],
        coerce=True,
    )
    # Mark the schema as produced by inference rather than written by hand.
    schema._is_inferred = True
    return schema