In [1]:
import pandas as pd
import pdpipe as pdp

In [2]:
# Toy dataset of eight people, declared column by column: position i in every
# list below belongs to the same person (row i of the resulting frame).
df = pd.DataFrame(
    {
        "Age": [23, 23, 25, 44, 72, 50, 80, 80],
        "Name": [
            "Jo",
            "Dana",
            "Bo",
            "Derek",
            "Regina",
            "Jim",
            "Richy",
            "Wealthus",
        ],
        "Gender": ["M", "F", "M", "M", "F", "M", "M", "F"],
        "Smoking": [True, True, False, True, True, False, False, False],
        "Savings": [0.07, 0.3, 2.3, 1.1, 7.1, 0.2, 100.2, 123.2],
        "Country": [
            "USA",
            "USA",
            "Greece",
            "Denmark",
            "Greece",
            "Germany",
            "Finland",
            "Finland",
        ],
        # Free-text column; kept verbatim since later text stages consume it.
        "Quote": [
            "Living life to its fullest",
            "the pen is mightier then the sword",
            "all for one and one for all",
            "every life is precious",
            "all of you get off my porch",
            "boy do I love dogs and cats",
            "I gots the dollarz",
            "me likey them moniez",
        ],
    }
)

In [3]:
# Show the raw, untransformed frame as a baseline for the pipeline results.
df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
2,25,Bo,M,False,2.3,Greece,all for one and one for all
3,44,Derek,M,True,1.1,Denmark,every life is precious
4,72,Regina,F,True,7.1,Greece,all of you get off my porch
5,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
6,80,Richy,M,False,100.2,Finland,I gots the dollarz
7,80,Wealthus,F,False,123.2,Finland,me likey them moniez


In [5]:
# Preprocessing pipeline, assembled from an explicit ordered list of stages
# rather than a fluent call chain; stages run top to bottom.
pipeline = pdp.PdPipeline(
    [
        # Remove the identifying Name column.
        pdp.ColDrop("Name"),
        # Discard every row whose Savings value is above 100.
        pdp.RowDrop({"Savings": lambda savings: savings > 100}),
        # Bin Savings around the single edge 1; drop=False keeps the
        # original Savings column next to the new binned one.
        pdp.Bin({"Savings": [1]}, drop=False),
        pdp.Scale("StandardScaler"),
        # Text handling for Quote: tokenize, stem, then strip stopwords.
        pdp.TokenizeText("Quote"),
        pdp.SnowballStem("EnglishStemmer", columns=["Quote"]),
        pdp.RemoveStopwords("English", "Quote"),
        # Categorical handling: label-encode Gender, one-hot encode Country.
        pdp.Encode("Gender"),
        pdp.OneHotEncode("Country"),
    ]
)

In [6]:
# Display the pipeline's repr: an indexed list of its stages with descriptions.
pipeline

A pdpipe pipeline:
[ 0]  Drop columns Name
[ 1]  Drop rows in columns Savings by conditions
[ 2]  Bin Savings by [1].
[ 3]  Scale columns Selectes all columns that are of a given dtypes.
      Use `dtypes=np.number` to qualify all numeric columns.      Parameters
      ----------     dtypes : object or list of objects         The dtype or
      dtypes which qualify columns. Support all valid arguments         to
      the `include` parameter of pandas.DataFrame.select_dtypes().
      **kwargs         Additionaly accepts all keyword arguments of the
      constructor of         ColumnQualifier. See the documentation of
      ColumnQualifier for details.      Example     -------         >>>
      import pandas as pd; import pdpipe as pdp; import numpy as np;
      >>> df = pd.DataFrame(         ...    [[8.2,'a',5],[5.1,'b',7]],
      [1,2], ['ph', 'grade', 'age'])         >>> cq =
      pdp.cq.OfDtypes(np.number)         >>> cq(df)         ['ph', 'age']
      >>> cq = pdp.cq.OfDtypes([np

In [7]:
# Run the whole pipeline on the raw frame; verbose=True logs each stage
# (and progress bars) as it is applied. The result is displayed, not stored.
pipeline(df, verbose=True)

- Drop columns Name..
- Drop rows in columns Savings by conditions..
2 rows dropped.
- Bin Savings by [1]...


Savings: 100%|██████████| 1/1 [00:00<00:00, 75.15it/s]

- Scale columns Selectes all columns that are of a given dtypes.
  Use `dtypes=np.number` to qualify all numeric columns.      Parameters
  ----------     dtypes : object or list of objects         The dtype or
  dtypes which qualify columns. Support all valid arguments         to
  the `include` parameter of pandas.DataFrame.select_dtypes().
  **kwargs         Additionaly accepts all keyword arguments of the
  constructor of         ColumnQualifier. See the documentation of
  ColumnQualifier for details.      Example     -------         >>>
  import pandas as pd; import pdpipe as pdp; import numpy as np;
  >>> df = pd.DataFrame(         ...    [[8.2,'a',5],[5.1,'b',7]],
  [1,2], ['ph', 'grade', 'age'])         >>> cq =
  pdp.cq.OfDtypes(np.number)         >>> cq(df)         ['ph', 'age']
  >>> cq = pdp.cq.OfDtypes([np.number, object])         >>> cq(df)
  ['ph', 'grade', 'age']         >>> cq = pdp.cq.OfDtypes(np.int64)
  >>> cq         <ColumnQualifier: With dtypes in <class 'numpy.i


100%|██████████| 1/1 [00:00<00:00, 39.69it/s]

- One-hot encode Country..



Country: 100%|██████████| 1/1 [00:00<00:00, 24.70it/s]


Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Quote,Country_Germany,Country_Greece,Country_USA
0,-0.917257,1,True,-0.718473,<1,"[live, life, fullest]",0,0,1
1,-0.917257,0,True,-0.625375,<1,"[pen, mightier, sword]",0,0,1
2,-0.806074,1,False,0.184172,1≤,"[one, one]",0,1,0
3,0.250161,1,True,-0.301556,1≤,"[everi, life, precious]",0,0,0
4,1.806718,0,True,2.127084,1≤,"[get, porch]",0,1,0
5,0.583709,1,False,-0.665852,<1,"[boy, love, dog, cat]",1,0,0


In [73]:
# Slice out stages 2 and 3 only (the Bin and Scale stages) and apply that
# sub-pipeline to the raw frame, so no columns or rows are dropped first.
# NOTE(review): this cell's execution count and log wording differ from the
# cells above, so its saved output may come from another session or library
# version — re-run after a kernel restart to confirm.
pipeline[2:4](df, verbose=True)

- Binning column Savings...


Savings: 100%|██████████| 1/1 [00:00<00:00, 153.37it/s]

- Scaling data...





Unnamed: 0,Age,Name,Gender,Smoking,Savings,Savings_bin,Country,Quote
0,-0.917257,Jo,M,0.707107,-0.718473,<1,USA,Living life to its fullest
1,-0.917257,Dana,F,0.707107,-0.625375,<1,USA,the pen is mightier then the sword
2,-0.806074,Bo,M,-1.414214,0.184172,1≤,Greece,all for one and one for all
3,0.250161,Derek,M,0.707107,-0.301556,1≤,Denmark,every life is precious
4,1.806718,Regina,F,0.707107,2.127084,1≤,Greece,all of you get off my porch
5,0.583709,Jim,M,-1.414214,-0.665852,<1,Germany,boy do I love dogs and cats
6,2.251448,Richy,M,-1.414214,39.811492,1≤,Finland,I gots the dollarz
7,2.251448,Wealthus,F,-1.414214,49.121281,1≤,Finland,me likey them moniez
