# pdpipe - Advanced usage example

## Pipeline + Model definition

In [1]:
from typing import Optional
import pdpipe as pdp
from pdpipe import df
from sklearn.linear_model import LogisticRegression
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A pdpipe preprocessing pipeline bundled with a LogisticRegression.

    All constructor arguments are stored as same-named attributes; the
    pipeline stages and the estimator are then built from them and handed
    to ``PdPipelineAndSklearnEstimator.__init__``.

    Parameters
    ----------
    savings_max_val : int, default 100
        Rows whose ``Savings`` value is greater than this are dropped.
    drop_gender : bool, default False
        If True, ``Gender`` is dropped at the end of the pipeline;
        otherwise it is added to the columns handled by ``pdp.Encode``.
    standardize : bool, default False
        If True, a ``pdp.Scale('StandardScaler')`` stage is added.
    ohencode_country : bool, default True
        If True, ``Country`` is one-hot encoded; otherwise it is dropped
        at the end of the pipeline.
    savings_bin_val : int, optional
        Single bin edge used to bin ``Savings`` into a ``Savings_bin``
        column (which is then encoded). ``None`` disables binning.
    pca_threshold : int, default 20
        Minimal number of ``Quote``-prefixed columns used as the
        precondition of the PCA decomposition stage.
    fit_intercept : bool, default True
        Forwarded to ``LogisticRegression(fit_intercept=...)``.
    """

    def __init__(
        self,
        savings_max_val: Optional[int] = 100,
        drop_gender: Optional[bool] = False,
        standardize: Optional[bool] = False,
        ohencode_country: Optional[bool] = True,
        savings_bin_val: Optional[int] = None,
        pca_threshold: Optional[int] = 20,
        fit_intercept: Optional[bool] = True,
    ):
        # save pipeline parameters
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.standardize = standardize
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.pca_threshold = pca_threshold
        self.fit_intercept = fit_intercept
        # helper lists, extended by the parameter-dependent branches below;
        # 'Bearded' is only dropped at the very end because the 'Viking'
        # column is derived from it first
        cols_to_drop = ['Bearded']
        cols_to_encode = []
        # start with a prefix of non-optional stages
        stages = [
            # standard pipeline stages
            pdp.ColDrop(columns=pdp.cq.WithAtLeastMissingValueRate(0.2)),
            pdp.DropLabelsByValues(not_in_set=['Smoking', 'Non-Smoking']),
            pdp.EncodeLabel(),
            pdp.ColDrop(['Name'], errors='ignore'),
            # using pdpipe fly-handles 🚀
            df.set_index(keys='id'),
            pdp.drop_rows_where['Savings'] > savings_max_val,
            df['Viking'] << (df['Country'].isin(['Denmark', 'Finland']) & ~df['Bearded']),
            df['YearlyGrands'] << (df['Savings'] * 1000) / df['Age']
        ]
        # a few parameter-dependent pipeline stages
        # NOTE: compare against None explicitly, so that a bin edge of 0 is
        # treated as a real value and not as "binning disabled"
        if savings_bin_val is not None:
            stages.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            cols_to_encode.append('Savings_bin')
        if drop_gender:
            cols_to_drop.append('Gender')
        else:
            cols_to_encode.append('Gender')
        if ohencode_country:
            stages.append(pdp.OneHotEncode('Country'))
        else:
            cols_to_drop.append('Country')
        # processing the text column:
        # 1. we do this before standardization so tf-idf
        # representation is also standardized
        # 2. we do this after everything else, so all tf-idf
        # columns are last in column order (for ease of presentation)
        stages.extend([
            pdp.TokenizeText('Quote'),
            pdp.SnowballStem('EnglishStemmer', columns=['Quote']),
            pdp.RemoveStopwords('English', 'Quote'),
            pdp.TfidfVectorizeTokenLists('Quote', hierarchical_labels=True),
        ])
        # PCA all tf-idf columns if there are too many of them; the
        # precondition requires at least `pca_threshold` Quote-prefixed
        # columns, and exraise=False means an unmet precondition does not
        # raise (per the pdpipe stage API the stage is skipped instead)
        stages.append(
            pdp.Decompose(
                transformer='PCA',
                columns=pdp.cq.StartsWith('Quote'),
                prec=pdp.cond.HasAtLeastNQualifyingColumns(
                    n=pca_threshold,
                    qualifier=pdp.cq.StartsWith('Quote'),
                ),
                exraise=False,
            )
        )
        # more parameter-dependent pipeline stages
        if cols_to_encode:
            stages.append(pdp.Encode(cols_to_encode))
        if standardize:
            stages.append(pdp.Scale('StandardScaler'))
        # the suffix of non-optional pipeline stages
        stages.extend([
            pdp.ColDrop(cols_to_drop, errors='ignore'),
            pdp.Schematize(),
            pdp.ConditionValidator([
                pdp.cond.HasAtMostNQualifyingColumns(
                    n=150,
                    qualifier=pdp.cq.AllColumns(fittable=False),
                ),
                pdp.cond.HasNoMissingValues(),
            ]),
        ])
        pipeline = pdp.PdPipeline(stages)
        model = LogisticRegression(fit_intercept=fit_intercept)
        super().__init__(pipeline=pipeline, estimator=model)

  from tqdm.autonotebook import tqdm


In [2]:
# Build the combined pipeline + estimator. Compared with the constructor
# defaults this raises the savings cutoff to 101, enables standardization,
# bins Savings at 1 and raises the PCA column threshold to 25.
mp = MyPipelineAndModel(
    savings_max_val=101,
    drop_gender=False,
    standardize=True,
    ohencode_country=True,
    savings_bin_val=1,
    pca_threshold=25,
    fit_intercept=True,
)

In [3]:
mp

<PdPipeline -> LogisticRegression>

In [4]:
mp.pipeline

A pdpipe pipeline:
[ 0]  Drop columns Columns with at least 0.2 missing value rate
[ 1]  Drop labels by values
[ 2]  Encode label values
[ 3]  Drop columns 'Name'
[ 4]  Apply dataframe method set_index with kwargs {'keys': 'id'}
[ 5]  Drop rows by qualifier <RowQualifier: Qualify rows with X[Savings] >
      101>
[ 6]  Assign column Viking with df[Country].isin(['Denmark', 'Finland']) &
      ~df[Bearded]
[ 7]  Assign column YearlyGrands with df[Savings] * 1000 / df[Age]
[ 8]  Bin Savings by [1].
[ 9]  One-hot encode 'Country'
[10]  Tokenize Quote
[11]  Stemming tokens in Quote...
[12]  Remove stopwords from Quote
[13]  Count-vectorizing column Quote.
[14]  Decompose columns Columns that start with Quote with PCA
[15]  Encode 'Savings_bin', 'Gender'
[16]  Scale columns Columns of dtypes <class 'numpy.number'>
[17]  Drop columns 'Bearded'
[18]  Transform input dataframes to the following schema: <Learnable Schema>
[19]  Validates conditions

In [5]:
mp.estimator

LogisticRegression()

## Train-test for the pipeline

In [6]:
# Name of the label column; it is split off from the feature columns
# whenever an X / y pair is built in the cells below.
LBL_COL = 'Smoking'

In [7]:
import pandas as pd

In [8]:
# January training data, one row per person, label in the 'Smoking' column.
# Two of the rows get removed by the row-dropping stages of `mp`:
#   - 'Erroneous' has the label 'Breathing', which is not one of the
#     'Smoking' / 'Non-Smoking' values kept by DropLabelsByValues;
#   - 'Wealthus' has Savings of 123.2, above the savings_max_val of 101.
jan_train = pd.DataFrame(
    data=[
        [23, 'Jo', 'Android', True, 'M', 'Smoking', 0.07, 'USA', 'Living life to its fullest', 7],
        [52, 'Regina', None, True, 'F', 'Non-Smoking', 0.26, 'Germany', 'I hate cats', 2],
        [23, 'Dana', None, True, 'F', 'Smoking', 0.3, 'USA', 'the apen is mightier then the sword', 12],
        [25, 'Bo', None, False, 'M', 'Non-Smoking', 2.3, 'Greece', 'all for one and one for all', 8],
        [80, 'Richy', 'iOS', True, 'M', 'Non-Smoking', 100.2, 'Finland', 'I gots the dollarz', 5],
        [60, 'Paul', None, True, 'M', 'Smoking', 1.87, 'Denmark', 'blah', 9],
        [44, 'Derek', None, False, 'M', 'Smoking', 1.1, 'Denmark', 'every life is precious', 10],
        [72, 'Regina', None, False, 'F', 'Smoking', 7.1, 'Greece', 'all of you get off my porch', 3],
        [50, 'Jim', None, False, 'M', 'Non-Smoking', 0.2, 'Germany', 'boy do I love dogs and cats', 6],
        [80, 'Wealthus', 'iOS', False, 'F', 'Non-Smoking', 123.2, 'Finland', 'me likey them moniez', 19],
        [12, 'Erroneous', None, False, 'M', 'Breathing', 1.2, 'Poland', 'I love bad labels', 13],
    ],
    columns=['Age', 'Name', 'OS', 'Bearded', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote', 'id'],
)

In [9]:
jan_train

Unnamed: 0,Age,Name,OS,Bearded,Gender,Smoking,Savings,Country,Quote,id
0,23,Jo,Android,True,M,Smoking,0.07,USA,Living life to its fullest,7
1,52,Regina,,True,F,Non-Smoking,0.26,Germany,I hate cats,2
2,23,Dana,,True,F,Smoking,0.3,USA,the apen is mightier then the sword,12
3,25,Bo,,False,M,Non-Smoking,2.3,Greece,all for one and one for all,8
4,80,Richy,iOS,True,M,Non-Smoking,100.2,Finland,I gots the dollarz,5
5,60,Paul,,True,M,Smoking,1.87,Denmark,blah,9
6,44,Derek,,False,M,Smoking,1.1,Denmark,every life is precious,10
7,72,Regina,,False,F,Smoking,7.1,Greece,all of you get off my porch,3
8,50,Jim,,False,M,Non-Smoking,0.2,Germany,boy do I love dogs and cats,6
9,80,Wealthus,iOS,False,F,Non-Smoking,123.2,Finland,me likey them moniez,19


In [10]:
# Split the January frame into features (every column but the label) and labels.
jan_train_y = jan_train[LBL_COL]
jan_train_X = jan_train.drop(columns=[LBL_COL], errors='ignore')

In [11]:
# Fit and apply only the preprocessing pipeline (not the estimator);
# verbose=True prints a line for each stage as it is applied.
jan_train_X_post, jan_train_y_post = mp.pipeline.fit_transform(jan_train_X, jan_train_y, verbose=True)

- Drop columns Columns with at least 0.2 missing value rate
Dropping columns OS
- Drop labels by values
- Encode label values
- Drop columns 'Name'
Dropping columns Name
- set_index: Apply dataframe method set_index with kwargs {'keys': 'id'}
- Drop rows by qualifier <RowQualifier: Qualify rows with X[Savings] >
  101>
1 rows dropped.
- Assign column Viking with df[Country].isin(['Denmark', 'Finland']) &
  ~df[Bearded]
- Assign column YearlyGrands with df[Savings] * 1000 / df[Age]
- Bin Savings by [1].


  0%|          | 0/1 [00:00<?, ?it/s]

- One-hot encode 'Country'


  0%|          | 0/1 [00:00<?, ?it/s]

- Tokenize Quote
- Stemming tokens in Quote...
- Remove stopwords from Quote
- Count-vectorizing column Quote.
- Encode 'Savings_bin', 'Gender'


  0%|          | 0/2 [00:00<?, ?it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>
- Drop columns 'Bearded'
Dropping columns Bearded
- Transform input dataframes to the following schema: <Learnable Schema>
- Validates conditions


In [12]:
jan_train_X_post

Unnamed: 0_level_0,Age,Gender,Savings,Savings_bin,Viking,YearlyGrands,Country_Finland,Country_Germany,Country_Greece,Country_USA,...,Quote_got,Quote_hate,Quote_life,Quote_live,Quote_love,Quote_mightier,Quote_one,Quote_porch,Quote_precious,Quote_sword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,-1.240591,0.707107,-0.403662,1.118034,False,-0.432587,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,1.870829,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
2,0.217942,-1.414214,-0.397541,1.118034,False,-0.427498,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,2.828427,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
12,-1.240591,-1.414214,-0.396252,1.118034,False,-0.406579,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,2.828427
8,-1.140003,0.707107,-0.331821,-0.894427,False,-0.201232,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553
5,1.626181,0.707107,2.822088,-0.894427,False,2.816961,2.828427,-0.534522,-0.534522,-0.534522,...,2.828427,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
9,0.620296,0.707107,-0.345674,-0.894427,False,-0.359445,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
10,-0.184412,0.707107,-0.37048,-0.894427,True,-0.375483,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,1.870829,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553
3,1.223827,-1.414214,-0.177186,-0.894427,False,-0.184038,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553
6,0.117353,0.707107,-0.399474,1.118034,False,-0.430099,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553


In [13]:
jan_train_X_post

Unnamed: 0_level_0,Age,Gender,Savings,Savings_bin,Viking,YearlyGrands,Country_Finland,Country_Germany,Country_Greece,Country_USA,...,Quote_got,Quote_hate,Quote_life,Quote_live,Quote_love,Quote_mightier,Quote_one,Quote_porch,Quote_precious,Quote_sword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,-1.240591,0.707107,-0.403662,1.118034,False,-0.432587,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,1.870829,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
2,0.217942,-1.414214,-0.397541,1.118034,False,-0.427498,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,2.828427,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
12,-1.240591,-1.414214,-0.396252,1.118034,False,-0.406579,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,2.828427
8,-1.140003,0.707107,-0.331821,-0.894427,False,-0.201232,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553
5,1.626181,0.707107,2.822088,-0.894427,False,2.816961,2.828427,-0.534522,-0.534522,-0.534522,...,2.828427,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
9,0.620296,0.707107,-0.345674,-0.894427,False,-0.359445,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
10,-0.184412,0.707107,-0.37048,-0.894427,True,-0.375483,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,1.870829,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553
3,1.223827,-1.414214,-0.177186,-0.894427,False,-0.184038,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553
6,0.117353,0.707107,-0.399474,1.118034,False,-0.430099,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553


In [14]:
# Fit the combined pipeline + estimator object on the raw (untransformed)
# training data; the trailing semicolon suppresses the cell's output.
mp.fit(jan_train_X, jan_train_y);

In [15]:
# Two held-out January samples. Unlike the training frame they include an
# extra 'ph' column and a country ('Brazil') absent from the training data.
jan_test = pd.DataFrame(
    columns=['Age', 'ph', 'Name', 'OS', 'Bearded', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote', 'id'],
    data=[
        [60, 2.4, 'Bo', None, True, 'F', 'Smoking', 2.7, 'Brazil', 'Living by the sword', 30],
        [22, 3.1, 'Janet', 'iOS', True, 'F', 'Non-Smoking', 20.36, 'Greece', 'I love cats', 31],
    ],
)
# features = every column but the label; labels = the label column
jan_test_y = jan_test[LBL_COL]
jan_test_X = jan_test.drop(columns=[LBL_COL], errors='ignore')

In [16]:
from pdpipe.util import LabelPlaceholderForPredict
# Apply only stages 0-8 of the fitted pipeline (everything up to and
# including the Savings binning stage) to the test features, to inspect an
# intermediate result. No real labels are passed; a placeholder label
# series built from the test features is supplied as y instead
# (presumably mirroring what happens at predict time - TODO confirm).
mp.pipeline[0:9].transform(jan_test_X, LabelPlaceholderForPredict(jan_test_X))

(    Age   ph  Bearded Gender  Savings Savings_bin Country  \
 id                                                          
 30   60  2.4     True      F     2.70          1≤  Brazil   
 31   22  3.1     True      F    20.36          1≤  Greece   
 
                   Quote  Viking  YearlyGrands  
 id                                             
 30  Living by the sword   False     45.000000  
 31          I love cats   False    925.454545  ,
 id
 30    __pdpipe_lbl_pholder_predict__
 31    __pdpipe_lbl_pholder_predict__
 dtype: object)

In [17]:
mp.predict(jan_test_X)

array([1, 0])

In [18]:
# jan_train_X_post, jan_train_y_post = mp.pipeline[0:7].transform(jan_train_X, jan_train_y, verbose=True)

In [19]:
# jan_train_X_post

## February

In [20]:
# February training data: just two rows, with the same columns as the
# January training frame. The first row carries a very long quote, so the
# text stages produce far more Quote_* columns than for the January data.
feb_train = pd.DataFrame(
    data=[
        [66, 'Go', 'Android', True, 'M', 'Smoking', 0.07, 'USA', "Let not my cold words here accuse my zeal: 'Tis not the trial of a woman's war, The bitter clamour of two eager tongues, Can arbitrate this cause betwixt us twain; The blood is hot that must be cool'd for this: Yet can I not of such tame patience boast As to be hush'd and nought at all to say: First, the fair reverence of your highness curbs me From giving reins and spurs to my free speech; Which else would post until it had return'd These terms of treason doubled down his throat. Setting aside his high blood's royalty, And let him be no kinsman to my liege, I do defy him, and I spit at him; Call him a slanderous coward and a villain: Which to maintain I would allow him odds, And meet him, were I tied to run afoot Even to the frozen ridges of the Alps, Or any other ground inhabitable, Where ever Englishman durst set his foot. Mean time let this defend my loyalty, By all my hopes, most falsely doth he lie.", 40],
        [12, 'Brienne', None, True, 'F', 'Non-Smoking', 0.26, 'Germany', 'I hate cats', 41],

    ],
    columns=['Age', 'Name', 'OS', 'Bearded', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote', 'id'],
)

In [21]:
feb_train

Unnamed: 0,Age,Name,OS,Bearded,Gender,Smoking,Savings,Country,Quote,id
0,66,Go,Android,True,M,Smoking,0.07,USA,Let not my cold words here accuse my zeal: 'Ti...,40
1,12,Brienne,,True,F,Non-Smoking,0.26,Germany,I hate cats,41


In [22]:
# Split the February frame into features (every column but the label) and labels.
feb_train_y = feb_train[LBL_COL]
feb_train_X = feb_train.drop(columns=[LBL_COL], errors='ignore')

In [23]:
# Re-fit the preprocessing pipeline on the February data; fit_transform
# returns an (X, y) pair, so [0] selects the transformed feature frame,
# whose column dtypes are then displayed.
mp.pipeline.fit_transform(feb_train_X, feb_train_y, verbose=True)[0].dtypes

- Drop columns Columns with at least 0.2 missing value rate
Dropping columns OS
- Drop labels by values
- Encode label values
- Drop columns 'Name'
Dropping columns Name
- set_index: Apply dataframe method set_index with kwargs {'keys': 'id'}
- Drop rows by qualifier <RowQualifier: Qualify rows with X[Savings] >
  101>
0 rows dropped.
- Assign column Viking with df[Country].isin(['Denmark', 'Finland']) &
  ~df[Bearded]
- Assign column YearlyGrands with df[Savings] * 1000 / df[Age]
- Bin Savings by [1].


  0%|          | 0/1 [00:00<?, ?it/s]

- One-hot encode 'Country'


  0%|          | 0/1 [00:00<?, ?it/s]

- Tokenize Quote
- Stemming tokens in Quote...
- Remove stopwords from Quote
- Count-vectorizing column Quote.
- Encode 'Savings_bin', 'Gender'


  0%|          | 0/2 [00:00<?, ?it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>
- Drop columns 'Bearded'
Dropping columns Bearded
- Transform input dataframes to the following schema: <Learnable Schema>
- Validates conditions


Age            float64
Gender         float64
Savings        float64
Savings_bin    float64
Viking            bool
                ...   
Quote_woman    float64
Quote_word     float64
Quote_would    float64
Quote_yet      float64
Quote_zeal     float64
Length: 100, dtype: object

In [24]:
mp.pipeline.fit_transform(feb_train_X, feb_train_y, verbose=True)[0]

- Drop columns Columns with at least 0.2 missing value rate
Dropping columns OS
- Drop labels by values
- Encode label values
- Drop columns 'Name'
Dropping columns Name
- set_index: Apply dataframe method set_index with kwargs {'keys': 'id'}
- Drop rows by qualifier <RowQualifier: Qualify rows with X[Savings] >
  101>
0 rows dropped.
- Assign column Viking with df[Country].isin(['Denmark', 'Finland']) &
  ~df[Bearded]
- Assign column YearlyGrands with df[Savings] * 1000 / df[Age]
- Bin Savings by [1].


  0%|          | 0/1 [00:00<?, ?it/s]

- One-hot encode 'Country'


  0%|          | 0/1 [00:00<?, ?it/s]

- Tokenize Quote
- Stemming tokens in Quote...
- Remove stopwords from Quote
- Count-vectorizing column Quote.
- Encode 'Savings_bin', 'Gender'


  0%|          | 0/2 [00:00<?, ?it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>
- Drop columns 'Bearded'
Dropping columns Bearded
- Transform input dataframes to the following schema: <Learnable Schema>
- Validates conditions


Unnamed: 0_level_0,Age,Gender,Savings,Savings_bin,Viking,YearlyGrands,Country_USA,Quote_'d,Quote_'s,Quote_'t,...,Quote_twain,Quote_two,Quote_us,Quote_villain,Quote_war,Quote_woman,Quote_word,Quote_would,Quote_yet,Quote_zeal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40,1.0,1.0,-1.0,0.0,False,-1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
41,-1.0,-1.0,1.0,0.0,False,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [25]:
feb_train_X_post, feb_train_y_post = mp.pipeline.fit_transform(feb_train_X, feb_train_y, verbose=True)

- Drop columns Columns with at least 0.2 missing value rate
Dropping columns OS
- Drop labels by values
- Encode label values
- Drop columns 'Name'
Dropping columns Name
- set_index: Apply dataframe method set_index with kwargs {'keys': 'id'}
- Drop rows by qualifier <RowQualifier: Qualify rows with X[Savings] >
  101>
0 rows dropped.
- Assign column Viking with df[Country].isin(['Denmark', 'Finland']) &
  ~df[Bearded]
- Assign column YearlyGrands with df[Savings] * 1000 / df[Age]
- Bin Savings by [1].


  0%|          | 0/1 [00:00<?, ?it/s]

- One-hot encode 'Country'


  0%|          | 0/1 [00:00<?, ?it/s]

- Tokenize Quote
- Stemming tokens in Quote...
- Remove stopwords from Quote
- Count-vectorizing column Quote.
- Encode 'Savings_bin', 'Gender'


  0%|          | 0/2 [00:00<?, ?it/s]

- Scale columns Columns of dtypes <class 'numpy.number'>
- Drop columns 'Bearded'
Dropping columns Bearded
- Transform input dataframes to the following schema: <Learnable Schema>
- Validates conditions


In [26]:
feb_train_X_post.iloc[0].values

array([1.0, 1.0, -1.0, 0.0, False, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=object)

## GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Exhaustive 3-fold search over four of the pipeline's constructor
# parameters (2 values each, i.e. 16 parameter combinations).
gcv = GridSearchCV(
    estimator=mp,
    cv=3,
    param_grid=dict(
        savings_max_val=[99, 101],
        standardize=[True, False],
        drop_gender=[True, False],
        ohencode_country=[True, False],
    ),
)

In [29]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'standardize': [True, False]})

In [30]:
# The grid search runs on the January training data; these names are
# aliases of the existing objects, not copies.
all_x, all_y = jan_train_X, jan_train_y

In [31]:
all_x

Unnamed: 0,Age,Name,OS,Bearded,Gender,Savings,Country,Quote,id
0,23,Jo,Android,True,M,0.07,USA,Living life to its fullest,7
1,52,Regina,,True,F,0.26,Germany,I hate cats,2
2,23,Dana,,True,F,0.3,USA,the apen is mightier then the sword,12
3,25,Bo,,False,M,2.3,Greece,all for one and one for all,8
4,80,Richy,iOS,True,M,100.2,Finland,I gots the dollarz,5
5,60,Paul,,True,M,1.87,Denmark,blah,9
6,44,Derek,,False,M,1.1,Denmark,every life is precious,10
7,72,Regina,,False,F,7.1,Greece,all of you get off my porch,3
8,50,Jim,,False,M,0.2,Germany,boy do I love dogs and cats,6
9,80,Wealthus,iOS,False,F,123.2,Finland,me likey them moniez,19


In [32]:
all_x.shape

(11, 9)

In [33]:
all_y.shape

(11,)

In [34]:
mp.score

<bound method PdPipelineAndSklearnEstimator.score of <PdPipeline -> LogisticRegression>>

In [35]:
gcv.fit(all_x, all_y)



GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'standardize': [True, False]})

In [36]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'standardize': [True, False]})

In [37]:
gcv.cv_results_

{'mean_fit_time': array([0.036316  , 0.03083754, 0.03078898, 0.03416101, 0.04521569,
        0.03324334, 0.03493333, 0.03789465, 0.04669118, 0.03550259,
        0.03185336, 0.03053729, 0.03288166, 0.03634501, 0.02878777,
        0.03063146]),
 'std_fit_time': array([0.0055919 , 0.00169539, 0.00341967, 0.0038277 , 0.0218201 ,
        0.00401087, 0.00067782, 0.00617299, 0.0045382 , 0.002646  ,
        0.00085212, 0.00154722, 0.00841186, 0.00198166, 0.00192767,
        0.00563191]),
 'mean_score_time': array([0.02412669, 0.02405405, 0.02856461, 0.02712154, 0.02272058,
        0.0282437 , 0.02952139, 0.03249137, 0.03796196, 0.0275046 ,
        0.02630679, 0.02668047, 0.02639675, 0.02774509, 0.02319964,
        0.02581008]),
 'std_score_time': array([0.00063052, 0.00158031, 0.00502455, 0.00163232, 0.00055962,
        0.00200398, 0.0020983 , 0.00493913, 0.00482785, 0.00042132,
        0.00130141, 0.00296123, 0.00346143, 0.00077225, 0.00221812,
        0.00401874]),
 'param_drop_gender': mask

In [38]:
gcv.best_estimator_

<PdPipeline -> LogisticRegression>

In [39]:
gcv.best_score_

0.5277777777777778

In [40]:
gcv.best_params_

{'drop_gender': True,
 'ohencode_country': True,
 'savings_max_val': 99,
 'standardize': True}

In [41]:
# Re-fit the preprocessing pipeline of the original `mp` object (not the
# grid search's best estimator) on the full data and display the resulting
# feature frame.
post_x, post_y = mp.pipeline.fit_transform(all_x, all_y)
post_x

Unnamed: 0_level_0,Age,Gender,Savings,Savings_bin,Viking,YearlyGrands,Country_Finland,Country_Germany,Country_Greece,Country_USA,...,Quote_got,Quote_hate,Quote_life,Quote_live,Quote_love,Quote_mightier,Quote_one,Quote_porch,Quote_precious,Quote_sword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,-1.240591,0.707107,-0.403662,1.118034,False,-0.432587,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,1.870829,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
2,0.217942,-1.414214,-0.397541,1.118034,False,-0.427498,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,2.828427,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
12,-1.240591,-1.414214,-0.396252,1.118034,False,-0.406579,-0.353553,-0.534522,-0.534522,1.870829,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,2.828427
8,-1.140003,0.707107,-0.331821,-0.894427,False,-0.201232,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553,-0.353553
5,1.626181,0.707107,2.822088,-0.894427,False,2.816961,2.828427,-0.534522,-0.534522,-0.534522,...,2.828427,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
9,0.620296,0.707107,-0.345674,-0.894427,False,-0.359445,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553
10,-0.184412,0.707107,-0.37048,-0.894427,True,-0.375483,-0.353553,-0.534522,-0.534522,-0.534522,...,-0.353553,-0.353553,1.870829,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553
3,1.223827,-1.414214,-0.177186,-0.894427,False,-0.184038,-0.353553,-0.534522,1.870829,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,-0.353553,-0.353553,-0.353553,2.828427,-0.353553,-0.353553
6,0.117353,0.707107,-0.399474,1.118034,False,-0.430099,-0.353553,1.870829,-0.534522,-0.534522,...,-0.353553,-0.353553,-0.534522,-0.353553,2.828427,-0.353553,-0.353553,-0.353553,-0.353553,-0.353553


In [42]:
post_x.values

array([[-1.2405913475081491, 0.7071067811865476, -0.4036616246372651,
        1.118033988749895, False, -0.4325869193391223,
        -0.3535533905932738, -0.5345224838248487, -0.5345224838248487,
        1.8708286933869707, -0.3535533905932738, -0.3535533905932738,
        -0.35355339059327373, -0.522411943359549, -0.35355339059327373,
        -0.35355339059327373, -0.35355339059327373, 2.82842712474619,
        -0.35355339059327373, -0.35355339059327373, -0.3535533905932738,
        1.8708286933869707, 2.82842712474619, -0.35355339059327373,
        -0.3535533905932738, -0.3535533905932738, -0.35355339059327373,
        -0.35355339059327373, -0.3535533905932738],
       [0.2179417232108912, -1.414213562373095, -0.3975406582620791,
        1.118033988749895, False, -0.42749845801219016,
        -0.3535533905932738, 1.8708286933869707, -0.5345224838248487,
        -0.5345224838248487, -0.3535533905932738, -0.3535533905932738,
        -0.35355339059327373, 2.277441515128002, -0.353553390

In [43]:
post_x.values.shape

(9, 29)

In [44]:
all_y.values

array(['Smoking', 'Non-Smoking', 'Smoking', 'Non-Smoking', 'Non-Smoking',
       'Smoking', 'Smoking', 'Smoking', 'Non-Smoking', 'Non-Smoking',
       'Breathing'], dtype=object)

In [45]:
len(all_y.values)

11

In [46]:
from sklearn.metrics import fbeta_score, make_scorer
# F-beta scorer with beta=2, i.e. recall is weighted more heavily than
# precision in the combined score.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [47]:
from pdpipe.skintegrate import pdpipe_scorer_from_sklearn_scorer

In [48]:
# Wrap the sklearn scorer in a pdpipe scorer so it can be passed as
# `scoring` for the pipeline + estimator object (presumably so that scoring
# sees the pipeline-transformed X and y - TODO confirm against pdpipe docs).
my_scorer = pdpipe_scorer_from_sklearn_scorer(ftwo_scorer)

In [49]:
my_scorer

<PdPipeScorer: make_scorer(fbeta_score, beta=2)>

In [50]:
# Same 3-fold search over four of the pipeline's constructor parameters,
# this time scored with the custom pdpipe-wrapped F2 scorer.
gcv = GridSearchCV(
    estimator=mp,
    scoring=my_scorer,
    cv=3,
    param_grid=dict(
        savings_max_val=[99, 101],
        standardize=[True, False],
        drop_gender=[True, False],
        ohencode_country=[True, False],
    ),
)

In [51]:
gcv.fit(all_x, all_y)



GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'standardize': [True, False]},
             scoring=<PdPipeScorer: make_scorer(fbeta_score, beta=2)>)

In [52]:
gcv.best_score_

0.4696969696969697

In [53]:
gcv.cv_results_

{'mean_fit_time': array([0.05307118, 0.03644673, 0.03337034, 0.0356098 , 0.04166571,
        0.03931141, 0.02765512, 0.02972627, 0.03123562, 0.0338273 ,
        0.03886   , 0.03189699, 0.02733254, 0.02444871, 0.02577432,
        0.0279762 ]),
 'std_fit_time': array([0.00924213, 0.00277671, 0.00207793, 0.00062841, 0.00383829,
        0.00626934, 0.00082305, 0.00084342, 0.00091409, 0.00686685,
        0.00311812, 0.00268891, 0.00068973, 0.0008484 , 0.00182553,
        0.00483225]),
 'mean_score_time': array([0.04330166, 0.03260907, 0.02891628, 0.03044597, 0.03565907,
        0.0308636 , 0.02348264, 0.02447963, 0.02342367, 0.02678943,
        0.03016472, 0.02501202, 0.0223074 , 0.02032693, 0.02026264,
        0.02425075]),
 'std_score_time': array([0.00340723, 0.0050149 , 0.00104689, 0.00050366, 0.00299653,
        0.00662635, 0.00075628, 0.0011864 , 0.0006002 , 0.0031599 ,
        0.00213231, 0.00278666, 0.0004076 , 0.00061872, 0.00072998,
        0.0057804 ]),
 'param_drop_gender': mask