Skip to content

Commit

Permalink
dropna and freqdrop added and tested
Browse files Browse the repository at this point in the history
  • Loading branch information
ShayPalachy committed Jan 14, 2018
1 parent 8e96df4 commit c989520
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .coveragerc
Expand Up @@ -4,6 +4,7 @@ omit =
tests/*
pdpipe/_version.py
pdpipe/__init__.py
pdpipe/polyglot_stages.py
[report]
show_missing = True
# Regexes for lines to exclude from consideration
Expand All @@ -12,4 +13,4 @@ exclude_lines =
pragma: no cover

# Don't complain if tests don't hit defensive assertion code:
raise NotImplementedError
raise NotImplementedError
1 change: 1 addition & 0 deletions pdpipe/__init__.py
Expand Up @@ -19,6 +19,7 @@
ValKeep,
ColRename,
DropNa,
FreqDrop,
)
core.__load_stage_attributes_from_module__('pdpipe.basic_stages')

Expand Down
69 changes: 67 additions & 2 deletions pdpipe/basic_stages.py
Expand Up @@ -255,8 +255,19 @@ def _op(self, df, verbose):


class DropNa(PipelineStage):
"""A pipeline stage that drops null values. Supports all parameters
supported by pandas.dropna function."""
"""A pipeline stage that drops null values.
Supports all parameters supported by the pandas.dropna function.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,None],[1,11]], [1,2,3], ['a','b'])
>>> pdp.DropNa().apply(df)
a b
1 1 4
3 1 11
"""

_DEF_DROPNA_EXC_MSG = "DropNa stage failed."
_DEF_DROPNA_APP_MSG = "Dropping null values..."
Expand All @@ -282,3 +293,57 @@ def _op(self, df, verbose):
if verbose:
print("{} rows dropeed".format(before_count - len(inter_df)))
return inter_df


class FreqDrop(PipelineStage):
    """A pipeline stage that drops rows whose value in a column is too rare.

    Every row whose value in the inspected column occurs fewer than
    ``threshold`` times in that column is removed from the dataframe.

    Parameters
    ----------
    threshold : int
        The minimum frequency required for a value to be kept.
    column : str
        The name of the column to check for the given value frequency.
    **kwargs
        Forwarded to the parent stage constructor; any of 'exmsg', 'appmsg'
        or 'desc' given here overrides the default message built below.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1,4],[4,5],[1,11]], [1,2,3], ['a','b'])
    >>> pdp.FreqDrop(2, 'a').apply(df)
       a   b
    1  1   4
    3  1  11
    """

    _DEF_FREQDROP_EXC_MSG = ("FreqDrop stage failed because column {} was not"
                             " found in input dataframe.")
    _DEF_FREQDROP_APPLY_MSG = ("Dropping values with frequency < {} in column"
                               " {}...")
    _DEF_FREQDROP_DESC = "Drop values with frequency < {} in column {}."

    def __init__(self, threshold, column, **kwargs):
        self._threshold = threshold
        self._column = column
        # Default messages come first so caller-supplied kwargs win on clash.
        defaults = {
            'exmsg': FreqDrop._DEF_FREQDROP_EXC_MSG.format(column),
            'appmsg': FreqDrop._DEF_FREQDROP_APPLY_MSG.format(
                threshold, column),
            'desc': FreqDrop._DEF_FREQDROP_DESC.format(threshold, column),
        }
        super().__init__(**{**defaults, **kwargs})

    def _prec(self, df):
        """Return True if the configured column is present in df."""
        return self._column in df.columns

    def _op(self, df, verbose):
        """Return df without the rows holding under-represented values.

        When verbose is truthy, the number of removed rows is printed.
        """
        rows_before = len(df)
        counts = df[self._column].value_counts()
        # NOTE(review): value_counts() skips nulls by default, so rows with a
        # null in this column appear to always be kept - confirm this is
        # the intended behaviour.
        is_rare = counts < self._threshold
        rare_values = counts.loc[is_rare].index
        has_rare_value = df[self._column].isin(rare_values)
        kept = df.loc[~has_rare_value]
        if verbose:
            print("{} rows dropped.".format(rows_before - len(kept)))
        return kept
24 changes: 24 additions & 0 deletions tests/basic_stages/test_dropna.py
@@ -0,0 +1,24 @@
"""Testing basic pipeline stages."""

import pandas as pd

from pdpipe.basic_stages import DropNa


DF1 = pd.DataFrame([[1, 4], [4, None], [1, 11]], [1, 2, 3], ['a', 'b'])


def test_dropna_basic():
    """DropNa drops the row containing a null and keeps the other rows."""
    surviving = DropNa().apply(DF1).index
    assert 1 in surviving
    assert 2 not in surviving
    assert 3 in surviving


def test_dropna_verbose():
    """DropNa gives the same result when applied with verbose=True."""
    stage = DropNa()
    surviving = stage.apply(DF1, verbose=True).index
    assert 1 in surviving
    assert 2 not in surviving
    assert 3 in surviving
24 changes: 24 additions & 0 deletions tests/basic_stages/test_freq_drop.py
@@ -0,0 +1,24 @@
"""Testing basic pipeline stages."""

import pandas as pd

from pdpipe.basic_stages import FreqDrop


DF1 = pd.DataFrame([[1, 4], [4, 5], [1, 11]], [1, 2, 3], ['a', 'b'])


def test_freqdrop_basic():
    """FreqDrop(2, 'a') drops the row whose 'a' value appears only once."""
    surviving = FreqDrop(2, 'a').apply(DF1).index
    assert 1 in surviving
    assert 2 not in surviving
    assert 3 in surviving


def test_freqdrop_verbose():
    """FreqDrop gives the same result when applied with verbose=True."""
    stage = FreqDrop(2, 'a')
    surviving = stage.apply(DF1, verbose=True).index
    assert 1 in surviving
    assert 2 not in surviving
    assert 3 in surviving
9 changes: 9 additions & 0 deletions tests/basic_stages/test_val_drop.py
Expand Up @@ -14,6 +14,15 @@ def test_valdrop_with_columns():
assert 3 in res_df.index


def test_valdrop_with_columns_verbose():
    """ValDrop on column 'a' drops the row holding 4, with verbose=True."""
    frame = pd.DataFrame(
        [[1, 4], [4, 5], [18, 11]],
        [1, 2, 3],
        ['a', 'b'],
    )
    stage = ValDrop([4], 'a')
    surviving = stage.apply(frame, verbose=True).index
    assert 1 in surviving
    assert 2 not in surviving
    assert 3 in surviving


def test_valdrop_without_columns():
"""Testing the ColDrop pipeline stage."""
df = pd.DataFrame([[1, 4], [4, 5], [18, 11]], [1, 2, 3], ['a', 'b'])
Expand Down

0 comments on commit c989520

Please sign in to comment.