In [1]:
import pandas as pd
import pdpipe as pdp

In [2]:
df = pd.DataFrame(
    data=[
        [23, 'Jo', 'M', True, 0.07, 'USA', 'Living life to its fullest'],
        [23, 'Dana', 'F', True, 0.3, 'USA', 'the pen is mightier then the sword'],
        [25, 'Bo', 'M', False, 2.3, 'Greece', 'all for one and one for all'],
        [44, 'Derek', 'M', True, 1.1, 'Denmark', 'every life is precious'],
        [72, 'Regina', 'F', True, 7.1, 'Greece', 'all of you get off my porch'],
        [50, 'Jim', 'M', False, 0.2, 'Germany', 'boy do I love dogs and cats'],
        [80, 'Richy', 'M', False, 100.2, 'Finland', 'I gots the dollarz'],
        [80, 'Wealthus', 'F', False, 123.2, 'Finland', 'me likey them moniez'],
    ],
    columns=['Age', 'Name', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote'],
)

In [3]:
df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
2,25,Bo,M,False,2.3,Greece,all for one and one for all
3,44,Derek,M,True,1.1,Denmark,every life is precious
4,72,Regina,F,True,7.1,Greece,all of you get off my porch
5,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
6,80,Richy,M,False,100.2,Finland,I gots the dollarz
7,80,Wealthus,F,False,123.2,Finland,me likey them moniez


In [4]:
pipeline = pdp.ColDrop('Name').RowDrop({'Savings': lambda x: x > 100}).Bin({'Savings': [1]}, drop=False).Scale(
    'StandardScaler').TokenizeText('Quote').SnowballStem('EnglishStemmer', columns=['Quote']).RemoveStopwords(
    'English', 'Quote').Encode('Gender').OneHotEncode('Country')

In [6]:
pipeline

A pdpipe pipeline:
[ 0]  Drop columns Name
[ 1]  Drop rows in columns Savings by conditions
[ 2]  Bin Savings by [1].
[ 3]  Scale columns Columns of dtypes <class 'numpy.number'>
[ 4]  Tokenize Quote
[ 5]  Stemming tokens in Quote...
[ 6]  Remove stopwords from Quote
[ 7]  Encode Gender
[ 8]  One-hot encode Country

In [13]:
pipeline(df, verbose=True).head(4)

- Drop columns Name
- Drop rows in columns Savings by conditions
2 rows dropped.
- Bin Savings by [1].


Savings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 89.25it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>
- Tokenize Quote
- Stemming tokens in Quote...
- Remove stopwords from Quote
- Encode Gender
- One-hot encode Country





Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Quote,Country_Germany,Country_Greece,Country_USA
0,-1.135052,1,True,-0.609615,<1,"[live, life, fullest]",0,0,1
1,-1.135052,0,True,-0.60482,<1,"[pen, mightier, sword]",0,0,1
2,-1.04979,1,False,-0.563121,1≤,"[one, one]",0,1,0
3,-0.2398,1,True,-0.58814,1≤,"[everi, life, precious]",0,0,0


In [11]:
pipeline[2:4](df, verbose=True).head(4)

- Bin Savings by [1].


Savings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 274.59it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>





Unnamed: 0,Age,Name,Gender,Smoking,Savings,Savings_bin,Country,Quote
0,-1.135052,Jo,M,True,-0.609615,<1,USA,Living life to its fullest
1,-1.135052,Dana,F,True,-0.60482,<1,USA,the pen is mightier then the sword
2,-1.04979,Bo,M,False,-0.563121,1≤,Greece,all for one and one for all
3,-0.2398,Derek,M,True,-0.58814,1≤,Denmark,every life is precious


In [9]:
pl = pdp.ColDrop('Name').RowDrop({'Savings': lambda x: x > 100}).Bin({'Savings': [1]}, drop=False).Scale(
    'StandardScaler').TokenizeText('Quote').SnowballStem('EnglishStemmer', columns=['Quote']).RemoveStopwords(
    'English', 'Quote').Encode(['Gender', 'Savings_bin']).OneHotEncode('Country')

In [12]:
pl(df).head(4)

Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Quote,Country_Germany,Country_Greece,Country_USA
0,-0.917257,1,True,-0.718473,1,"[live, life, fullest]",0,0,1
1,-0.917257,0,True,-0.625375,1,"[pen, mightier, sword]",0,0,1
2,-0.806074,1,False,0.184172,0,"[one, one]",0,1,0
3,0.250161,1,True,-0.301556,0,"[everi, life, precious]",0,0,0
