-
Notifications
You must be signed in to change notification settings - Fork 10
/
split.py
22 lines (16 loc) · 841 Bytes
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
This module defines the following routines used by the 'split' step:
- ``create_dataset_filter``: Defines customizable logic for filtering the training
datasets produced by the data splitting procedure. Note that arbitrary transformations
should go into the transform step.
"""
from pandas import DataFrame, Series
def create_dataset_filter(dataset: DataFrame) -> Series:
    """
    Mark rows of the split datasets to be additionally filtered. This function will be called on
    the training datasets.

    The default implementation returns ``True`` for every row, aligned to the index of
    ``dataset``.

    NOTE(review): the caller is not visible from this module. The all-``True`` default is
    presumably a no-op, i.e. ``True`` means "keep this row" -- confirm against the split step
    before relying on it.

    :param dataset: The {train,validation,test} dataset produced by the data splitting procedure.
    :return: A boolean Series with one entry per row of ``dataset``, sharing its index.
    """
    # The return annotation is the ``Series`` class itself; a call such as ``Series(bool)``
    # would be evaluated when the function is defined and yield an instance, not a type.
    # FIXME::OPTIONAL: implement post-split filtering on the dataframes, such as data cleaning.
    return Series(True, index=dataset.index)