In [1]:
import pandas as pd
import pdpipe as pdp

In [2]:
# Toy dataset used throughout the notebook: one record per person, with the
# fields of each record given in the same order as `column_names`.
column_names = ['Age', 'Name', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote']

people = [
    [23, 'Jo', 'M', True, 0.07, 'USA', 'Living life to its fullest'],
    [23, 'Dana', 'F', True, 0.3, 'USA', 'the pen is mightier then the sword'],
    [25, 'Bo', 'M', False, 2.3, 'Greece', 'all for one and one for all'],
    [44, 'Derek', 'M', True, 1.1, 'Denmark', 'every life is precious'],
    [72, 'Regina', 'F', True, 7.1, 'Greece', 'all of you get off my porch'],
    [50, 'Jim', 'M', False, 0.2, 'Germany', 'boy do I love dogs and cats'],
    [80, 'Richy', 'M', False, 100.2, 'Finland', 'I gots the dollarz'],
    [80, 'Wealthus', 'F', False, 123.2, 'Finland', 'me likey them moniez'],
]

# Default RangeIndex (0..7); column dtypes are left to pandas' inference.
df = pd.DataFrame(data=people, columns=column_names)

In [3]:
# Show the raw frame (8 rows, 7 columns) as constructed, before any pipeline
# stage has been applied to it.
df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
2,25,Bo,M,False,2.3,Greece,all for one and one for all
3,44,Derek,M,True,1.1,Denmark,every life is precious
4,72,Regina,F,True,7.1,Greece,all of you get off my porch
5,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
6,80,Richy,M,False,100.2,Finland,I gots the dollarz
7,80,Wealthus,F,False,123.2,Finland,me likey them moniez


In [4]:
# Build the preprocessing pipeline as a single fluent chain, one stage per
# line. Stages run in the order written; displaying `pipeline` lists them
# with indices 0-8 in this same order.
pipeline = (
    pdp.ColDrop('Name')
    .RowDrop({'Savings': lambda x: x > 100})  # condition is evaluated on the Savings column
    .Bin({'Savings': [1]}, drop=False)  # drop=False: Savings is kept next to Savings_bin
    .Scale('StandardScaler')
    .TokenizeText('Quote')
    .SnowballStem('EnglishStemmer', columns=['Quote'])
    .RemoveStopwords('English', 'Quote')
    .Encode('Gender')
    .OneHotEncode('Country')
)

In [5]:
# Read the docstring of an `OfDtypes` object built for `int`. The same
# "Columns of dtypes ..." wording shows up in the Scale stage's description
# when the pipeline is displayed.
pdp.cq.OfDtypes(int).__doc__

"Columns of dtypes <class 'int'>"

In [6]:
# Display the pipeline: its repr lists every stage with its index and a
# one-line description.
pipeline

A pdpipe pipeline:
[ 0]  Drop columns Name
[ 1]  Drop rows in columns Savings by conditions
[ 2]  Bin Savings by [1].
[ 3]  Scale columns Columns of dtypes <class 'numpy.number'>
[ 4]  Tokenize Quote
[ 5]  Stemming tokens in Quote...
[ 6]  Remove stopwords from Quote
[ 7]  Encode Gender
[ 8]  One-hot encode Country

In [7]:
# Per-attribute memory breakdown of the first stage only (index 0, the
# "Drop columns Name" stage). `_mem_str` is a private helper, so its output
# format may change between pdpipe versions.
first_stage = pipeline[0]
print(first_stage._mem_str())

  - _DEF_DESCRIPTION, 72b (04.50%)
  - _DEF_EXC_MSG, 88b (05.50%)
  - _INIT_KWARGS, 264b (16.50%)
  - _abc_impl, 48b (03.00%)
  - _appmsg, 72b (04.50%)
  - _col_arg, 136b (08.50%)
  - _col_str, 56b (03.50%)
  - _desc, 72b (04.50%)
  - _errors, 16b (01.00%)
  - _exclude_columns, 16b (01.00%)
  - _exmsg, 136b (08.50%)
  - _exraise, 32b (02.00%)
  - _none_cols, 16b (01.00%)
  - _none_error, 32b (02.00%)
  - _prec_arg, 16b (01.00%)
  - _skip, 16b (01.00%)
  - is_fitted, 24b (01.50%)



In [8]:
# Report the pipeline's total size in memory, followed by each stage's size,
# its share of the total, and a per-attribute breakdown.
pipeline.memory_report()

=== Pipeline memory report ===
Total pipeline size in memory: 27.62Kb
Per-stage memory structure:
[ 0] 1.60Kb (05.79%), Drop columns Name
  - _DEF_DESCRIPTION, 72b (04.50%)
  - _DEF_EXC_MSG, 88b (05.50%)
  - _INIT_KWARGS, 264b (16.50%)
  - _abc_impl, 48b (03.00%)
  - _appmsg, 72b (04.50%)
  - _col_arg, 136b (08.50%)
  - _col_str, 56b (03.50%)
  - _desc, 72b (04.50%)
  - _errors, 16b (01.00%)
  - _exclude_columns, 16b (01.00%)
  - _exmsg, 136b (08.50%)
  - _exraise, 32b (02.00%)
  - _none_cols, 16b (01.00%)
  - _none_error, 32b (02.00%)
  - _prec_arg, 16b (01.00%)
  - _skip, 16b (01.00%)
  - is_fitted, 24b (01.50%)
[ 1] 3.08Kb (11.15%), Drop rows in columns Savings by conditions
  - _DEF_DESCRIPTION, 72b (02.34%)
  - _DEF_EXC_MSG, 88b (02.86%)
  - _INIT_KWARGS, 264b (08.57%)
  - _REDUCERS, 416b (13.51%)
  - _abc_impl, 48b (01.56%)
  - _appmsg, 96b (03.12%)
  - _col_arg, 160b (05.19%)
  - _col_str, 56b (01.82%)
  - _cond_is_dict, 32b (01.04%)
  - _conditions, 304b (09.87%)
  - _desc, 96b

In [9]:
# Run the whole pipeline on the raw frame. With verbose=True each stage's
# description is printed as it is applied (the row-drop stage also reports
# how many rows it removed). The resulting frame is displayed, not stored.
pipeline(df, verbose=True)

- Drop columns Name..
- Drop rows in columns Savings by conditions..
2 rows dropped.
- Bin Savings by [1]...


Savings: 100%|██████████| 1/1 [00:00<00:00, 108.37it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>..
- Tokenize Quote..
- Stemming tokens in Quote.....





- Remove stopwords from Quote..
- Encode Gender..


100%|██████████| 1/1 [00:00<00:00, 20.50it/s]

- One-hot encode Country..



Country: 100%|██████████| 1/1 [00:00<00:00, 33.02it/s]


Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Quote,Country_Germany,Country_Greece,Country_USA
0,-0.917257,1,True,-0.718473,<1,"[live, life, fullest]",0,0,1
1,-0.917257,0,True,-0.625375,<1,"[pen, mightier, sword]",0,0,1
2,-0.806074,1,False,0.184172,1≤,"[one, one]",0,1,0
3,0.250161,1,True,-0.301556,1≤,"[everi, life, precious]",0,0,0
4,1.806718,0,True,2.127084,1≤,"[get, porch]",0,1,0
5,0.583709,1,False,-0.665852,<1,"[boy, love, dog, cat]",1,0,0


In [10]:
# Slicing a pipeline gives a callable made of just the selected stages: here
# stages 2 and 3 (Bin and Scale). The column- and row-drop stages are not
# part of the slice, so the result keeps Name and all 8 rows.
# NOTE(review): the scaled values differ from the full-pipeline run, which
# suggests the Scale stage is fit again on this call. If the slice shares
# stage objects with `pipeline`, that would overwrite its fitted state;
# confirm before reusing `pipeline` on new data.
bin_and_scale = pipeline[2:4]
bin_and_scale(df, verbose=True)

- Bin Savings by [1]...


Savings: 100%|██████████| 1/1 [00:00<00:00, 22.66it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>..





Unnamed: 0,Age,Name,Gender,Smoking,Savings,Savings_bin,Country,Quote
0,-1.135052,Jo,M,True,-0.609615,<1,USA,Living life to its fullest
1,-1.135052,Dana,F,True,-0.60482,<1,USA,the pen is mightier then the sword
2,-1.04979,Bo,M,False,-0.563121,1≤,Greece,all for one and one for all
3,-0.2398,Derek,M,True,-0.58814,1≤,Denmark,every life is precious
4,0.95387,Regina,F,True,-0.463043,1≤,Greece,all of you get off my porch
5,0.015987,Jim,M,False,-0.606905,<1,Germany,boy do I love dogs and cats
6,1.294918,Richy,M,False,1.478052,1≤,Finland,I gots the dollarz
7,1.294918,Wealthus,F,False,1.957592,1≤,Finland,me likey them moniez
