# PdPipeline + Sklearn Model

## The Data

In [1]:
import pandas as pd

In [2]:
# Toy dataset of ten people, written column by column; the dict's key order
# fixes the column order of the resulting frame.
df = pd.DataFrame({
    'Age': [23, 52, 23, 25, 80, 60, 44, 72, 50, 80],
    'Name': [
        'Jo', 'Regina', 'Dana', 'Bo', 'Richy',
        'Paul', 'Derek', 'Regina', 'Jim', 'Wealthus',
    ],
    'Gender': ['M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'F'],
    'Smoking': [
        True, False, True, False, False,
        True, True, True, False, False,
    ],
    'Savings': [0.07, 0.26, 0.3, 2.3, 100.2, 1.87, 1.1, 7.1, 0.2, 123.2],
    'Country': [
        'USA', 'Germany', 'USA', 'Greece', 'Finland',
        'Denmark', 'Denmark', 'Greece', 'Germany', 'Finland',
    ],
    'Quote': [
        'Living life to its fullest',
        'I hate cats',
        'the pen is mightier then the sword',
        'all for one and one for all',
        'I gots the dollarz',
        'blah',
        'every life is precious',
        'all of you get off my porch',
        'boy do I love dogs and cats',
        'me likey them moniez',
    ],
})

In [3]:
df

Unnamed: 0,Age,Name,Gender,Smoking,Savings,Country,Quote
0,23,Jo,M,True,0.07,USA,Living life to its fullest
1,52,Regina,F,False,0.26,Germany,I hate cats
2,23,Dana,F,True,0.3,USA,the pen is mightier then the sword
3,25,Bo,M,False,2.3,Greece,all for one and one for all
4,80,Richy,M,False,100.2,Finland,I gots the dollarz
5,60,Paul,M,True,1.87,Denmark,blah
6,44,Derek,M,True,1.1,Denmark,every life is precious
7,72,Regina,F,True,7.1,Greece,all of you get off my porch
8,50,Jim,M,False,0.2,Germany,boy do I love dogs and cats
9,80,Wealthus,F,False,123.2,Finland,me likey them moniez


## Defining a combined object

In [4]:
from typing import Optional

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
import pdpipe as pdp
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

In [7]:
class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A pdpipe preprocessing pipeline chained with a LogisticRegression.

    Every constructor argument is stored on the instance under the same
    name; the pipeline stages and the estimator are then built from those
    arguments and handed to ``PdPipelineAndSklearnEstimator.__init__``.

    Parameters
    ----------
    savings_max_val : int, optional
        Stored on the instance but not used by any pipeline stage: the
        ``RowDrop`` stage that consumed it is disabled (see ``__init__``).
    drop_gender : bool, default False
        If true, the 'Gender' column is dropped by the final ``ColDrop``
        stage; otherwise it is encoded with ``pdp.Encode``.
    scale_numeric : bool, default False
        If true, a ``pdp.Scale('MinMaxScaler')`` stage is added.
    ohencode_country : bool, default True
        If true, 'Country' is one-hot encoded; otherwise it is dropped by
        the final ``ColDrop`` stage.
    savings_bin_val : int, optional
        If not None, 'Savings' is binned at this single edge with
        ``drop=False`` and the resulting 'Savings_bin' column is encoded.
        ``0`` is treated as a real bin edge, not as "disabled".
    fit_intercept : bool, default True
        Forwarded to ``LogisticRegression(fit_intercept=...)``.
    """

    def __init__(
        self,
        savings_max_val: Optional[int] = 100,
        drop_gender: Optional[bool] = False,
        scale_numeric: Optional[bool] = False,
        ohencode_country: Optional[bool] = True,
        savings_bin_val: Optional[int] = None,
        fit_intercept: Optional[bool] = True,
    ):
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.scale_numeric = scale_numeric
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.fit_intercept = fit_intercept

        # Columns collected here are removed by one ColDrop at the very end.
        cols_to_drop = []
        # 'Name' and 'Quote' are always dropped; errors='ignore' lets the
        # pipeline run on frames that do not contain them.
        stages = [
            pdp.ColDrop(['Name', 'Quote'], errors='ignore'),
        ]
        # A RowDrop stage filtering on `Savings > savings_max_val` used to
        # follow here and is disabled, so `savings_max_val` has no effect.
        # NOTE(review): presumably disabled because dropping rows from X
        # would leave it misaligned with y - confirm before re-enabling.

        # Compare against None rather than truthiness so that a bin edge
        # of 0 still enables binning.
        if savings_bin_val is not None:
            stages.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            stages.append(pdp.Encode('Savings_bin'))
        if scale_numeric:
            stages.append(pdp.Scale('MinMaxScaler'))
        if drop_gender:
            cols_to_drop.append('Gender')
        else:
            stages.append(pdp.Encode('Gender'))
        if ohencode_country:
            stages.append(pdp.OneHotEncode('Country'))
        else:
            cols_to_drop.append('Country')
        # Appended unconditionally, even when cols_to_drop is empty.
        stages.append(pdp.ColDrop(cols_to_drop, errors='ignore'))

        pline = pdp.PdPipeline(stages)
        model = LogisticRegression(fit_intercept=fit_intercept)
        super().__init__(pipeline=pline, estimator=model)

In [8]:
# One concrete configuration: bin Savings at 1, min-max scale, one-hot
# encode Country and drop Gender.
mp = MyPipelineAndModel(
    fit_intercept=True,
    savings_bin_val=1,
    scale_numeric=True,
    ohencode_country=True,
    drop_gender=True,
    savings_max_val=101,
)

In [9]:
mp

<PdPipeline -> LogisticRegression>

In [10]:
mp.pipeline

A pdpipe pipeline:
[ 0]  Drop columns Name, Quote
[ 1]  Bin Savings by [1].
[ 2]  Encode Savings_bin
[ 3]  Scale columns Columns of dtypes <class 'numpy.number'>
[ 4]  One-hot encode Country
[ 5]  Drop columns Gender

In [11]:
mp.estimator

LogisticRegression()

In [12]:
mp.score

<bound method PdPipelineAndSklearnEstimator.score of <PdPipeline -> LogisticRegression>>

In [13]:
mp.score?

[0;31mSignature:[0m [0mmp[0m[0;34m.[0m[0mscore[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/clones/pdpipe/pdpipe/skintegrate.py
[0;31mType:[0m      method


In [14]:
mp.pipeline(df)

Unnamed: 0,Age,Smoking,Savings,Savings_bin,Country_Finland,Country_Germany,Country_Greece,Country_USA
0,0.0,True,0.0,1.0,0,0,0,1
1,0.508772,False,0.001543,1.0,0,1,0,0
2,0.0,True,0.001868,1.0,0,0,0,1
3,0.035088,False,0.018111,0.0,0,0,1,0
4,1.0,False,0.813206,0.0,1,0,0,0
5,0.649123,True,0.014619,0.0,0,0,0,0
6,0.368421,True,0.008365,0.0,0,0,0,0
7,0.859649,True,0.057094,0.0,0,0,1,0
8,0.473684,False,0.001056,1.0,0,1,0,0
9,1.0,False,1.0,0.0,1,0,0,0


In [15]:
mp.pipeline[0:4](df)

Unnamed: 0,Age,Gender,Smoking,Savings,Savings_bin,Country
0,0.0,M,True,0.0,1.0,USA
1,0.508772,F,False,0.001543,1.0,Germany
2,0.0,F,True,0.001868,1.0,USA
3,0.035088,M,False,0.018111,0.0,Greece
4,1.0,M,False,0.813206,0.0,Finland
5,0.649123,M,True,0.014619,0.0,Denmark
6,0.368421,M,True,0.008365,0.0,Denmark
7,0.859649,F,True,0.057094,0.0,Greece
8,0.473684,M,False,0.001056,1.0,Germany
9,1.0,F,False,1.0,0.0,Finland


## Test that our custom estimator checks out

In [16]:
from sklearn.utils.estimator_checks import check_estimator

Actually, it does not. :( Our object does not pass `check_estimator`, so the call in the next cell is left commented out.

In [17]:
# check_estimator(mp)

But it is going to work anyway! The sections below use it with `fit`/`predict`, `GridSearchCV` and a custom scorer.

## X-y subsets

In [18]:
x_lbls = ['Age', 'Gender', 'Savings', 'Country']

In [19]:
all_x = df[x_lbls]
all_y = df['Smoking']

In [20]:
all_x.shape

(10, 4)

In [21]:
all_y.shape

(10,)

In [22]:
all_x

Unnamed: 0,Age,Gender,Savings,Country
0,23,M,0.07,USA
1,52,F,0.26,Germany
2,23,F,0.3,USA
3,25,M,2.3,Greece
4,80,M,100.2,Finland
5,60,M,1.87,Denmark
6,44,M,1.1,Denmark
7,72,F,7.1,Greece
8,50,M,0.2,Germany
9,80,F,123.2,Finland


## Check inheritance of predict()

In [47]:
mp.fit(all_x, all_y)

<PdPipeline -> LogisticRegression>

In [48]:
mp.predict(all_x)

array([ True, False,  True,  True, False,  True,  True,  True, False,
       False])

In [54]:
mp.predict(all_x).dtype == bool

True

In [49]:
mp.predict_proba(all_x)

array([[0.29801843, 0.70198157],
       [0.63432307, 0.36567693],
       [0.29822225, 0.70177775],
       [0.43059313, 0.56940687],
       [0.68689079, 0.31310921],
       [0.42798384, 0.57201616],
       [0.4167183 , 0.5832817 ],
       [0.46679026, 0.53320974],
       [0.63301878, 0.36698122],
       [0.70744934, 0.29255066]])

In [55]:
mp.predict_proba(all_x).dtype == float

True

In [50]:
mp.predict_log_proba(all_x)

array([[-1.21059993, -0.35384813],
       [-0.45519688, -1.00600504],
       [-1.20991627, -0.35413852],
       [-0.84259165, -0.56316003],
       [-0.37557997, -1.16120322],
       [-0.84866984, -0.55858804],
       [-0.87534482, -0.53908502],
       [-0.76187525, -0.62884042],
       [-0.45725519, -1.00244461],
       [-0.34608926, -1.22911742]])

In [51]:
mp.decision_function(all_x)

array([ 0.8567518 , -0.55080816,  0.85577775,  0.27943162, -0.78562325,
        0.29008181,  0.3362598 ,  0.13303483, -0.54518943, -0.88302815])

In [52]:
mp.score(all_x, all_y)

0.9

In [56]:
mp.classes_

array([False,  True])

## GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Exhaustive 3-fold search over the constructor switches of `mp`.
# NOTE: `savings_max_val` is stored by MyPipelineAndModel, but the only stage
# that read it is commented out, so its two values build the same pipeline.
gcv = GridSearchCV(
    estimator=mp,
    cv=3,
    param_grid=dict(
        drop_gender=[True, False],
        ohencode_country=[True, False],
        savings_max_val=[99, 101],
        scale_numeric=[True, False],
    ),
)

In [25]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [29]:
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

In [30]:
# Before fit() is called, check_is_fitted must raise NotFittedError.
try:
    check_is_fitted(gcv)
except NotFittedError:
    print("Not fitted - as expected")
else:
    # Without this branch the cell passes silently when gcv is already
    # fitted (e.g. cells run out of order), hiding the broken expectation.
    raise AssertionError(
        "gcv already reports as fitted; expected NotFittedError before fit()"
    )

Not fitted - as expected


In [31]:
gcv.fit(all_x, all_y)

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [32]:
assert check_is_fitted(gcv) is None

In [33]:
gcv

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]})

In [34]:
gcv.cv_results_

{'mean_fit_time': array([0.01791286, 0.01304499, 0.01270191, 0.01155297, 0.01158492,
        0.01058435, 0.01024111, 0.0108943 , 0.01247104, 0.01349203,
        0.01421404, 0.01486699, 0.01409276, 0.01352183, 0.01247597,
        0.01110601]),
 'std_fit_time': array([3.00703067e-03, 7.16491515e-04, 9.25654749e-04, 3.68055811e-04,
        5.83948457e-04, 3.18579386e-04, 7.68315560e-05, 3.45401300e-04,
        1.52708786e-03, 5.40897246e-04, 5.76591497e-04, 6.49206519e-04,
        1.17025237e-03, 2.63524214e-04, 9.32107022e-04, 1.92380699e-04]),
 'mean_score_time': array([0.01484005, 0.01101907, 0.01084042, 0.00916457, 0.00932701,
        0.00841204, 0.0082562 , 0.00841435, 0.01073488, 0.01156131,
        0.01045299, 0.01145879, 0.0112594 , 0.01143893, 0.00979233,
        0.00911276]),
 'std_score_time': array([0.00340803, 0.00101021, 0.00116779, 0.00023592, 0.0005183 ,
        0.00020961, 0.00036267, 0.000228  , 0.00192031, 0.00026268,
        0.00047143, 0.00081478, 0.00056503, 0.000549

In [35]:
gcv.best_estimator_

<PdPipeline -> LogisticRegression>

In [36]:
gcv.best_score_

0.5833333333333334

In [37]:
gcv.best_params_

{'drop_gender': True,
 'ohencode_country': True,
 'savings_max_val': 99,
 'scale_numeric': True}

## Working with custom scoring functions

In [38]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [39]:
from pdpipe.skintegrate import pdpipe_scorer_from_sklearn_scorer

In [40]:
my_scorer = pdpipe_scorer_from_sklearn_scorer(ftwo_scorer)

In [41]:
my_scorer

<PdPipeScorer: make_scorer(fbeta_score, beta=2)>

In [42]:
# Same 3-fold search as before, but candidates are ranked with the wrapped
# F-beta scorer instead of the estimator's default score.
# NOTE: `savings_max_val` is stored by MyPipelineAndModel, but the only stage
# that read it is commented out, so its two values build the same pipeline.
gcv = GridSearchCV(
    estimator=mp,
    scoring=my_scorer,
    cv=3,
    param_grid=dict(
        drop_gender=[True, False],
        ohencode_country=[True, False],
        savings_max_val=[99, 101],
        scale_numeric=[True, False],
    ),
)

In [43]:
gcv.fit(all_x, all_y)

GridSearchCV(cv=3,
             ('estimator', <PdPipeline -> LogisticRegression>),
             param_grid={'drop_gender': [True, False],
                         'ohencode_country': [True, False],
                         'savings_max_val': [99, 101],
                         'scale_numeric': [True, False]},
             scoring=<PdPipeScorer: make_scorer(fbeta_score, beta=2)>)

In [44]:
gcv.best_score_

0.30303030303030304

In [46]:
gcv.best_params_

{'drop_gender': True,
 'ohencode_country': True,
 'savings_max_val': 99,
 'scale_numeric': True}

In [45]:
gcv.cv_results_

{'mean_fit_time': array([0.01721199, 0.01194263, 0.01230669, 0.01273394, 0.01404134,
        0.01371368, 0.01329859, 0.01119653, 0.01308044, 0.01597126,
        0.01527802, 0.01333801, 0.01295455, 0.0133361 , 0.0152994 ,
        0.01477504]),
 'std_fit_time': array([0.00383527, 0.00059531, 0.00092171, 0.00038528, 0.00077348,
        0.00052551, 0.00165672, 0.00051181, 0.00056855, 0.0021009 ,
        0.00034942, 0.00079891, 0.00089399, 0.00016949, 0.00051471,
        0.00028866]),
 'mean_score_time': array([0.011856  , 0.01003202, 0.00967741, 0.01085448, 0.01215593,
        0.01150568, 0.01012762, 0.009173  , 0.0100011 , 0.01422445,
        0.01121553, 0.01066399, 0.01144632, 0.01270469, 0.01220814,
        0.01292666]),
 'std_score_time': array([0.00103129, 0.00091208, 0.00061078, 0.00102057, 0.00046995,
        0.00052332, 0.00088413, 0.00060296, 0.00057559, 0.00236275,
        0.00095284, 0.00075483, 0.0002575 , 0.00114321, 0.00084593,
        0.00201821]),
 'param_drop_gender': mask