In [1]:
from typing import Optional

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
import pdpipe as pdp
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

In [4]:
# pdp.Bin?

In [5]:
class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A pdpipe preprocessing pipeline bundled with a LogisticRegression.

    Each constructor argument is stored on the instance under its own name and
    is then used to assemble the pipeline stages and the estimator that are
    handed to ``PdPipelineAndSklearnEstimator.__init__``.
    (Presumably the same-named attributes are what lets scikit-learn tools
    such as ``GridSearchCV`` read and re-create the object; confirm against
    the base class.)

    Parameters
    ----------
    savings_max_val : int, default 100
        Stored on the instance only; no pipeline stage currently reads it
        (see the NOTE in ``__init__``).
    drop_gender : bool, default False
        If true, 'Gender' is removed by the final ``ColDrop`` stage;
        otherwise it is encoded with ``pdp.Encode``.
    scale_numeric : bool, default False
        If true, a ``pdp.Scale('MinMaxScaler')`` stage is added.
    ohencode_country : bool, default True
        If true, 'Country' is one-hot encoded; otherwise it is removed by the
        final ``ColDrop`` stage.
    savings_bin_val : int, optional
        If not None, 'Savings' is binned at this single cut point (the
        original column is kept) and the resulting 'Savings_bin' column is
        encoded.
    fit_intercept : bool, default True
        Forwarded to ``LogisticRegression``.
    """

    def __init__(
        self,
        savings_max_val: int = 100,
        drop_gender: bool = False,
        scale_numeric: bool = False,
        ohencode_country: bool = True,
        savings_bin_val: Optional[int] = None,
        fit_intercept: bool = True,
    ):
        # Keep every argument on the instance, unmodified, under its own name.
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.scale_numeric = scale_numeric
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.fit_intercept = fit_intercept

        # Free-text columns are never used as features; errors='ignore' lets
        # the stage pass when they are already absent from the input.
        stages = [
            pdp.ColDrop(['Name', 'Quote'], errors='ignore'),
        ]
        # NOTE(review): savings_max_val is not consumed by any stage. A row
        # filter, pdp.RowDrop({'Savings': lambda x: x > savings_max_val}),
        # was left disabled at this position; confirm whether it should be
        # restored or the parameter retired.

        # Compare against None rather than relying on truthiness: a cut point
        # of 0 is a legitimate threshold and must not disable binning.
        if savings_bin_val is not None:
            stages.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            stages.append(pdp.Encode('Savings_bin'))
        if scale_numeric:
            stages.append(pdp.Scale('MinMaxScaler'))

        # Categorical columns are either encoded or queued for removal.
        cols_to_drop = []
        if drop_gender:
            cols_to_drop.append('Gender')
        else:
            stages.append(pdp.Encode('Gender'))
        if ohencode_country:
            stages.append(pdp.OneHotEncode('Country'))
        else:
            cols_to_drop.append('Country')
        stages.append(pdp.ColDrop(cols_to_drop, errors='ignore'))

        pline = pdp.PdPipeline(stages)
        model = LogisticRegression(fit_intercept=fit_intercept)
        super().__init__(pipeline=pline, estimator=model)

In [6]:
# Build one pipeline+model instance with every option spelled out.
# NOTE: savings_max_val is stored on the instance, but no pipeline stage
# consumes it, so its value does not change the pipeline that is built.
mp = MyPipelineAndModel(
    savings_max_val=101,
    drop_gender=True,
    scale_numeric=True,
    ohencode_country=True,
    savings_bin_val=1,
    fit_intercept=True,
)

In [7]:
mp

<PdPipeline -> LogisticRegression>

In [8]:
mp.pipeline

A pdpipe pipeline:
[ 0]  Drop columns Name, Quote
[ 1]  Bin Savings by [1].
[ 2]  Encode Savings_bin
[ 3]  Scale columns Columns of dtypes <class 'numpy.number'>
[ 4]  One-hot encode Country
[ 5]  Drop columns Gender

In [9]:
mp.estimator

LogisticRegression()

In [10]:
mp.score?

[0;31mSignature:[0m [0mmp[0m[0;34m.[0m[0mscore[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/clones/pdpipe/pdpipe/skintegrate.py
[0;31mType:[0m      method


## Test that our custom estimator checks out

In [11]:
from sklearn.utils.estimator_checks import check_estimator

In [12]:
# check_estimator(mp)

## Train-test split for the pipeline

In [13]:
import pandas as pd

In [14]:
# Toy dataset: one record per person, fields in the order given by `columns`.
df = pd.DataFrame.from_records(
    [
        (23, 'Jo', 'M', True, 0.07, 'USA', 'Living life to its fullest'),
        (52, 'Regina', 'F', False, 0.26, 'Germany', 'I hate cats'),
        (23, 'Dana', 'F', True, 0.3, 'USA', 'the pen is mightier then the sword'),
        (25, 'Bo', 'M', False, 2.3, 'Greece', 'all for one and one for all'),
        (80, 'Richy', 'M', False, 100.2, 'Finland', 'I gots the dollarz'),
        (60, 'Paul', 'M', True, 1.87, 'Denmark', 'blah'),
        (44, 'Derek', 'M', True, 1.1, 'Denmark', 'every life is precious'),
        (72, 'Regina', 'F', True, 7.1, 'Greece', 'all of you get off my porch'),
        (50, 'Jim', 'M', False, 0.2, 'Germany', 'boy do I love dogs and cats'),
        (80, 'Wealthus', 'F', False, 123.2, 'Finland', 'me likey them moniez'),
    ],
    columns=['Age', 'Name', 'Gender', 'Smoking', 'Savings', 'Country', 'Quote'],
)

In [15]:
df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,52,Regina,F,False,0.26,Germany,I hate cats
2,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
3,25,Bo,M,False,2.3,Greece,all for one and one for all
4,80,Richy,M,False,100.2,Finland,I gots the dollarz
5,60,Paul,M,True,1.87,Denmark,blah
6,44,Derek,M,True,1.1,Denmark,every life is precious
7,72,Regina,F,True,7.1,Greece,all of you get off my porch
8,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
9,80,Wealthus,F,False,123.2,Finland,me likey them moniez


In [16]:
mp.pipeline(df)

Unnamed: 0,Age,Smoking,Savings,Savings_bin,Country_Finland,Country_Germany,Country_Greece,Country_USA
0,0.0,True,0.0,1.0,0,0,0,1
1,0.508772,False,0.001543,1.0,0,1,0,0
2,0.0,True,0.001868,1.0,0,0,0,1
3,0.035088,False,0.018111,0.0,0,0,1,0
4,1.0,False,0.813206,0.0,1,0,0,0
5,0.649123,True,0.014619,0.0,0,0,0,0
6,0.368421,True,0.008365,0.0,0,0,0,0
7,0.859649,True,0.057094,0.0,0,0,1,0
8,0.473684,False,0.001056,1.0,0,1,0,0
9,1.0,False,1.0,0.0,1,0,0,0


In [17]:
mp.pipeline[0:4](df)

Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Country
0,0.0,M,True,0.0,1.0,USA
1,0.508772,F,False,0.001543,1.0,Germany
2,0.0,F,True,0.001868,1.0,USA
3,0.035088,M,False,0.018111,0.0,Greece
4,1.0,M,False,0.813206,0.0,Finland
5,0.649123,M,True,0.014619,0.0,Denmark
6,0.368421,M,True,0.008365,0.0,Denmark
7,0.859649,F,True,0.057094,0.0,Greece
8,0.473684,M,False,0.001056,1.0,Germany
9,1.0,F,False,1.0,0.0,Finland


In [18]:
# Feature columns passed to the pipeline. The target ('Smoking') and the
# free-text columns ('Name', 'Quote') are deliberately left out.
x_lbls = ['Age', 'Gender', 'Savings', 'Country']

In [19]:
# Full-dataset features and target, used later for the grid search.
all_x = df[x_lbls]
all_y = df['Smoking']

In [20]:
# Training split: the first six rows (positions 0-5), taken in order.
train_df = df.iloc[0:6]
train_df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,52,Regina,F,False,0.26,Germany,I hate cats
2,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
3,25,Bo,M,False,2.3,Greece,all for one and one for all
4,80,Richy,M,False,100.2,Finland,I gots the dollarz
5,60,Paul,M,True,1.87,Denmark,blah


In [21]:
train_x = train_df[x_lbls]
train_x

Unnamed: 0,Age,Gender,Savings,Country
0,23,M,0.07,USA
1,52,F,0.26,Germany
2,23,F,0.3,USA
3,25,M,2.3,Greece
4,80,M,100.2,Finland
5,60,M,1.87,Denmark


In [22]:
train_y = train_df['Smoking']
train_y

0     True
1    False
2     True
3    False
4    False
5     True
Name: Smoking, dtype: bool

In [23]:
# Test split: every row from position 6 onward (the rows not in train_df).
test_df = df.iloc[6:]
test_df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
6,44,Derek,M,True,1.1,Denmark,every life is precious
7,72,Regina,F,True,7.1,Greece,all of you get off my porch
8,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
9,80,Wealthus,F,False,123.2,Finland,me likey them moniez


In [24]:
test_x = test_df[x_lbls]
test_x

Unnamed: 0,Age,Gender,Savings,Country
6,44,M,1.1,Denmark
7,72,F,7.1,Greece
8,50,M,0.2,Germany
9,80,F,123.2,Finland


In [25]:
test_y = test_df['Smoking']
test_y

6     True
7     True
8    False
9    False
Name: Smoking, dtype: bool

In [26]:
mp.pipeline.fit_transform(train_x)

Unnamed: 0,Age,Savings,Savings_bin,Country_Finland,Country_Germany,Country_Greece,Country_USA
0,0.0,0.0,1.0,0,0,0,1
1,0.508772,0.001898,1.0,0,1,0,0
2,0.0,0.002297,1.0,0,0,0,1
3,0.035088,0.022271,0.0,0,0,1,0
4,1.0,1.0,0.0,1,0,0,0
5,0.649123,0.017977,0.0,0,0,0,0


In [27]:
mp.pipeline.transform(test_x)

Unnamed: 0,Age,Savings,Savings_bin,Country_Finland,Country_Germany,Country_Greece,Country_USA
6,0.368421,0.010287,0.0,0,0,0,0
7,0.859649,0.070209,0.0,0,0,1,0
8,0.473684,0.001298,1.0,0,1,0,0
9,1.0,1.229701,0.0,1,0,0,0


## GridSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Exhaustive 3-fold search over four constructor arguments of
# MyPipelineAndModel (2 x 2 x 2 x 2 = 16 candidates). No `scoring` is given,
# so the estimator's own `score` method is used.
# NOTE(review): no pipeline stage consumes `savings_max_val`, so its two
# values presumably produce identical candidates. Confirm before keeping it
# in the grid.
gcv = GridSearchCV(
    estimator=mp,
    param_grid={
        'savings_max_val': [99, 101],
        'scale_numeric': [True, False],
        'drop_gender': [True, False],
        'ohencode_country': [True, False],
    },
    cv=3,
)

In [30]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [31]:
all_x

Unnamed: 0,Age,Gender,Savings,Country
0,23,M,0.07,USA
1,52,F,0.26,Germany
2,23,F,0.3,USA
3,25,M,2.3,Greece
4,80,M,100.2,Finland
5,60,M,1.87,Denmark
6,44,M,1.1,Denmark
7,72,F,7.1,Greece
8,50,M,0.2,Germany
9,80,F,123.2,Finland


In [32]:
all_x.shape

(10, 4)

In [33]:
all_y.shape

(10,)

In [34]:
mp.score

<bound method PdPipelineAndSklearnEstimator.score of <PdPipeline -> LogisticRegression>>

In [35]:
gcv.fit(all_x, all_y)

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [36]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [37]:
gcv.cv_results_

{'mean_fit_time': array([0.01815104, 0.01288565, 0.01307678, 0.01633612, 0.01801427,
        0.015625  , 0.0142889 , 0.01253605, 0.01318653, 0.012911  ,
        0.01201495, 0.01258262, 0.01142033, 0.01253661, 0.01196504,
        0.01268951]),
 'std_fit_time': array([0.00575958, 0.00057629, 0.00064679, 0.00171021, 0.00100924,
        0.00083239, 0.0008318 , 0.00026177, 0.00027617, 0.00019952,
        0.00040193, 0.0005715 , 0.00017451, 0.00065186, 0.00025348,
        0.00042232]),
 'mean_score_time': array([0.0115246 , 0.01162108, 0.010921  , 0.01299373, 0.0133721 ,
        0.01191195, 0.01176174, 0.0114522 , 0.00978629, 0.00991448,
        0.00967471, 0.01002359, 0.00972072, 0.01012206, 0.00953722,
        0.01004601]),
 'std_score_time': array([2.36995364e-03, 1.53212151e-03, 4.24461679e-04, 1.71364529e-03,
        1.24526186e-03, 8.97675224e-04, 1.21555967e-04, 2.10730286e-04,
        6.74085189e-04, 4.28284203e-04, 4.24088566e-04, 3.80880576e-04,
        1.01367687e-04, 9.37618764e-

In [38]:
gcv.best_estimator_

<PdPipeline -> LogisticRegression>

In [39]:
gcv.best_score_

0.5833333333333334

In [40]:
gcv.best_params_

{'drop_gender': True,
 'ohencode_country': True,
 'savings_max_val': 99,
 'scale_numeric': True}

In [41]:
# Fit the preprocessing pipeline on the full feature frame and keep the
# transformed features for inspection.
# NOTE(review): this is the pipeline of `mp` as constructed above, not the
# pipeline of `gcv.best_estimator_`.
post_x = mp.pipeline.fit_transform(all_x)
post_x

Unnamed: 0,Age,Savings,Savings_bin,Country_Finland,Country_Germany,Country_Greece,Country_USA
0,0.0,0.0,1.0,0,0,0,1
1,0.508772,0.001543,1.0,0,1,0,0
2,0.0,0.001868,1.0,0,0,0,1
3,0.035088,0.018111,0.0,0,0,1,0
4,1.0,0.813206,0.0,1,0,0,0
5,0.649123,0.014619,0.0,0,0,0,0
6,0.368421,0.008365,0.0,0,0,0,0
7,0.859649,0.057094,0.0,0,0,1,0
8,0.473684,0.001056,1.0,0,1,0,0
9,1.0,1.0,0.0,1,0,0,0


In [42]:
post_x.values

array([[0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 1.        ],
       [0.50877193, 0.00154308, 1.        , 0.        , 1.        ,
        0.        , 0.        ],
       [0.        , 0.00186794, 1.        , 0.        , 0.        ,
        0.        , 1.        ],
       [0.03508772, 0.01811094, 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [1.        , 0.81320556, 0.        , 1.        , 0.        ,
        0.        , 0.        ],
       [0.64912281, 0.0146187 , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.36842105, 0.00836514, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.85964912, 0.05709413, 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [0.47368421, 0.00105579, 1.        , 0.        , 1.        ,
        0.        , 0.        ],
       [1.        , 1.        , 0.        , 1.        , 0.        ,
        0.        , 0. 

In [43]:
post_x.values.shape

(10, 7)

In [44]:
all_y.values

array([ True, False,  True, False, False,  True,  True,  True, False,
       False])

In [45]:
len(all_y.values)

10

In [46]:
from sklearn.metrics import fbeta_score, make_scorer
# scikit-learn scorer for the F-beta metric with beta=2 (beta > 1 weights
# recall more heavily than precision).
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [47]:
from pdpipe.skintegrate import pdpipe_scorer_from_sklearn_scorer

In [48]:
# Wrap the scikit-learn scorer in a pdpipe scorer object.
# NOTE(review): presumably this lets the scorer be applied to a
# pipeline+estimator object on raw dataframes. Confirm in pdpipe.skintegrate.
my_scorer = pdpipe_scorer_from_sklearn_scorer(ftwo_scorer)

In [49]:
my_scorer

<PdPipeScorer: make_scorer(fbeta_score, beta=2)

In [50]:
# Same 16-candidate, 3-fold grid search as before, but scored with the
# wrapped F2 scorer (`my_scorer`). This rebinds `gcv`, replacing the earlier
# search object.
# NOTE(review): no pipeline stage consumes `savings_max_val`, so its two
# values presumably produce identical candidates. Confirm before keeping it
# in the grid.
gcv = GridSearchCV(
    estimator=mp,
    param_grid={
        'savings_max_val': [99, 101],
        'scale_numeric': [True, False],
        'drop_gender': [True, False],
        'ohencode_country': [True, False],
    },
    cv=3,
    scoring=my_scorer,
)

In [51]:
gcv.fit(all_x, all_y)

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]},
             scoring=<PdPipeScorer: make_scorer(fbeta_score, beta=2))

In [52]:
gcv.best_score_

0.30303030303030304

In [53]:
gcv.cv_results_

{'mean_fit_time': array([0.01948587, 0.01312502, 0.01175737, 0.01658527, 0.01359336,
        0.01327173, 0.01260543, 0.01163673, 0.01098204, 0.01114368,
        0.01069268, 0.01025971, 0.01000635, 0.00990534, 0.01050059,
        0.01194223]),
 'std_fit_time': array([0.00224546, 0.00102073, 0.00052922, 0.00385093, 0.00023676,
        0.00030171, 0.00026745, 0.00060974, 0.00047875, 0.00039744,
        0.00062997, 0.00019351, 0.00042311, 0.0002525 , 0.00027802,
        0.00064901]),
 'mean_score_time': array([0.01376939, 0.01107963, 0.00972724, 0.01281365, 0.01224526,
        0.01156624, 0.01001438, 0.00943136, 0.00912031, 0.00944495,
        0.00869004, 0.00843493, 0.0084157 , 0.00873327, 0.00923006,
        0.00969672]),
 'std_score_time': array([1.96256855e-03, 1.06358230e-03, 3.79745840e-04, 2.25962982e-03,
        9.45426769e-04, 3.89868562e-04, 1.11462039e-04, 1.90760591e-04,
        2.13635671e-04, 1.30183676e-04, 8.00893239e-05, 2.45272456e-04,
        2.64555684e-04, 5.08685953e-