# Scikit-learn Pipeline with Pandas

While `sklearn` prefers to work with `numpy` arrays, it is often convenient to keep the data in `pandas` DataFrames for further processing.

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn_pandas import DataFrameMapper, gen_features
from mlutil.transform import ColumnSelector

# Array display settings for the whole notebook: two decimals, very wide
# lines so rows are not wrapped, and up to 30 items kept at each edge
# before NumPy starts summarising with "...".
np.set_printoptions(
    precision=2,
    linewidth=10000,
    edgeitems=30,
)

In [2]:
# Seed NumPy's global RNG so that Restart & Run All rebuilds the same frame
# (column `a`) and the same draws in any later cell that uses np.random.
# NOTE: the saved outputs were produced before seeding; re-run to refresh them.
SEED = 42
np.random.seed(SEED)

# Toy frame mixing numeric and categorical columns. Columns `d` (None) and
# `e` (NaN) each contain one missing value in the first row.
df = pd.DataFrame({
    'a': np.random.normal(size=6),
    'b': np.arange(6),
    'c': ['black', 'black', 'white', 'black', 'white', 'black'],
    'd': [None, 'dog', 'cat', 'cat', 'cat', 'cat'],
    'e': [np.nan, 1, 1, 2, 2, 2],
})
df

Unnamed: 0,a,b,c,d,e
0,-0.159194,0,black,,
1,-1.00208,1,black,dog,1.0
2,0.176486,2,white,cat,1.0
3,0.43731,3,black,cat,2.0
4,-0.693517,4,white,cat,2.0
5,-0.740792,5,black,cat,2.0


## scikit-learn only

In [3]:
# The raw frame still contains missing values (first row of `d` and `e`).
# OneHotEncoder rejects them here, so catch the ValueError and show its
# message instead of letting the cell fail.
nan_intolerant_encoder = OneHotEncoder(sparse=False)
try:
    nan_intolerant_encoder.fit_transform(df)
except ValueError as err:
    print(err)

Input contains NaN, infinity or a value too large for dtype('float64').


In [4]:
# OneHotEncoder treats every column as categorical, whatever its dtype:
# each distinct float in `a` and each int in `b` gets its own indicator
# column, just like the string labels in `c`.
t = OneHotEncoder(sparse=False)
abc_onehot = t.fit_transform(df[['a', 'b', 'c']])
display(abc_onehot)
# One array of learned categories per input column.
print(t.categories_)

array([[0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.]])

[array([-1.  , -0.74, -0.69, -0.16,  0.18,  0.44]), array([0, 1, 2, 3, 4, 5]), array(['black', 'white'], dtype=object)]


In [5]:
# Per-column preprocessing with scikit-learn's ColumnTransformer:
#   c -> one-hot
#   d -> replace None with the string 'none', then one-hot
#   e -> replace NaN with 0, then one-hot
# Columns not listed (a, b) are passed through and end up after the
# encoded columns in the resulting array.
c_steps = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
d_steps = Pipeline([
    ('nans', SimpleImputer(missing_values=None, fill_value='none', strategy='constant')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
e_steps = Pipeline([
    ('nans', SimpleImputer(missing_values=np.nan, fill_value=0, strategy='constant')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

column_transformer = ColumnTransformer(
    [
        ('c', c_steps, ['c']),
        ('d', d_steps, ['d']),
        ('e', e_steps, ['e']),
    ],
    remainder='passthrough',
)
pipe = Pipeline([('t', column_transformer)])

pipe.fit_transform(df)

array([[ 1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  1.  ,  0.  ,  0.  , -0.16,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  , -1.  ,  1.  ],
       [ 0.  ,  1.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.18,  2.  ],
       [ 1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.44,  3.  ],
       [ 0.  ,  1.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  , -0.69,  4.  ],
       [ 1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  , -0.74,  5.  ]])

In [6]:
# with GridSearch
#
# Attach the final estimator only if it is not there yet: an unconditional
# append would add a second 'estimator' step every time this cell is re-run
# and leave the pipeline invalid.
if 'estimator' not in pipe.named_steps:
    pipe.steps.append(('estimator', LinearRegression()))

# The target is random noise, so this only demonstrates that the whole
# pipeline can be tuned by GridSearchCV; the selected value is arbitrary.
m = GridSearchCV(pipe, {'estimator__normalize': [True, False]}, scoring='neg_mean_absolute_error')
_ = m.fit(df, np.random.normal(size=len(df)))
m.best_params_

{'estimator__normalize': True}

## sklearn-pandas

In [7]:
# sklearn-pandas: one-hot encode column `c` only.
# With df_out=True the result is a DataFrame, and with default=None the
# remaining columns (a, b, d, e) are carried over unchanged.
c_onehot_feature = (['c'], OneHotEncoder(sparse=False))
mapper = DataFrameMapper([c_onehot_feature], df_out=True, default=None)
mapper.fit_transform(df)

Unnamed: 0,c_x0_black,c_x0_white,a,b,d,e
0,1.0,0.0,-0.159194,0,,
1,1.0,0.0,-1.00208,1,dog,1.0
2,0.0,1.0,0.176486,2,cat,1.0
3,1.0,0.0,0.43731,3,cat,2.0
4,0.0,1.0,-0.693517,4,cat,2.0
5,1.0,0.0,-0.740792,5,cat,2.0


In [8]:
# Transform several selected columns with sklearn-pandas (v1):
# a single OneHotEncoder is fitted on c, d and e together, so the output
# columns all share the combined 'c_d_e_' prefix.
nan_filler_v1 = DataFrameMapper(
    [
        (['d'], SimpleImputer(missing_values=None, fill_value='none', strategy='constant')),
        (['e'], SimpleImputer(missing_values=np.nan, fill_value=0, strategy='constant')),
    ],
    df_out=True,
    default=None,
)
onehot_v1 = DataFrameMapper(
    [(['c', 'd', 'e'], OneHotEncoder(sparse=False, handle_unknown='ignore'))],
    df_out=True,
    default=None,
)
pipe = Pipeline([('nans', nan_filler_v1), ('onehot', onehot_v1)])

display(pipe.fit_transform(df))
print(pipe.named_steps['onehot'].transformed_names_)

Unnamed: 0,c_d_e_x0_black,c_d_e_x0_white,c_d_e_x1_cat,c_d_e_x1_dog,c_d_e_x1_none,c_d_e_x2_0.0,c_d_e_x2_1.0,c_d_e_x2_2.0,a,b
0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.159194,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.00208,1
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.176486,2
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.43731,3
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.693517,4
5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.740792,5


['c_d_e_x0_black', 'c_d_e_x0_white', 'c_d_e_x1_cat', 'c_d_e_x1_dog', 'c_d_e_x1_none', 'c_d_e_x2_0.0', 'c_d_e_x2_1.0', 'c_d_e_x2_2.0', 'a', 'b']


In [9]:
# Transform several selected columns with sklearn-pandas (v2):
# each of c, d and e gets its own OneHotEncoder, so the output columns
# are prefixed with the individual source column name.
nan_filler_v2 = DataFrameMapper(
    [
        (['d'], SimpleImputer(missing_values=None, fill_value='none', strategy='constant')),
        (['e'], SimpleImputer(missing_values=np.nan, fill_value=0, strategy='constant')),
    ],
    df_out=True,
    default=None,
)
# A fresh encoder instance is created for every column.
onehot_v2 = DataFrameMapper(
    [
        ([column], OneHotEncoder(sparse=False, handle_unknown='ignore'))
        for column in ('c', 'd', 'e')
    ],
    df_out=True,
    default=None,
)
pipe = Pipeline([('nans', nan_filler_v2), ('onehot', onehot_v2)])

display(pipe.fit_transform(df))
print(pipe.named_steps['onehot'].transformed_names_)

Unnamed: 0,c_x0_black,c_x0_white,d_x0_cat,d_x0_dog,d_x0_none,e_x0_0.0,e_x0_1.0,e_x0_2.0,a,b
0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.159194,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.00208,1
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.176486,2
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.43731,3
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.693517,4
5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.740792,5


['c_x0_black', 'c_x0_white', 'd_x0_cat', 'd_x0_dog', 'd_x0_none', 'e_x0_0.0', 'e_x0_1.0', 'e_x0_2.0', 'a', 'b']


In [10]:
# Transform several selected columns with sklearn-pandas (v3):
# same per-column encoding as v2, but the feature definitions are
# generated by gen_features from one encoder specification.
nan_filler_v3 = DataFrameMapper(
    [
        (['d'], SimpleImputer(missing_values=None, fill_value='none', strategy='constant')),
        (['e'], SimpleImputer(missing_values=np.nan, fill_value=0, strategy='constant')),
    ],
    df_out=True,
    default=None,
)
onehot_spec = {'class': OneHotEncoder, 'sparse': False, 'handle_unknown': 'ignore'}
onehot_features = gen_features(
    columns=[['c'], ['d'], ['e']],
    classes=[onehot_spec],
)
onehot_v3 = DataFrameMapper(onehot_features, df_out=True, default=None)
pipe = Pipeline([('nans', nan_filler_v3), ('onehot', onehot_v3)])

display(pipe.fit_transform(df))
print(pipe.named_steps['onehot'].transformed_names_)

Unnamed: 0,c_x0_black,c_x0_white,d_x0_cat,d_x0_dog,d_x0_none,e_x0_0.0,e_x0_1.0,e_x0_2.0,a,b
0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.159194,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.00208,1
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.176486,2
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.43731,3
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.693517,4
5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.740792,5


['c_x0_black', 'c_x0_white', 'd_x0_cat', 'd_x0_dog', 'd_x0_none', 'e_x0_0.0', 'e_x0_1.0', 'e_x0_2.0', 'a', 'b']


In [11]:
# with GridSearch
#
# Attach the final estimator only if it is not there yet: an unconditional
# append would add a second 'estimator' step every time this cell is re-run
# and leave the pipeline invalid.
if 'estimator' not in pipe.named_steps:
    pipe.steps.append(('estimator', LinearRegression()))

# The target is random noise, so this only demonstrates that the
# DataFrameMapper-based pipeline can be tuned by GridSearchCV; the selected
# value is arbitrary.
m = GridSearchCV(pipe, {'estimator__normalize': [True, False]}, scoring='neg_mean_absolute_error')
_ = m.fit(df, np.random.normal(size=len(df)))
m.best_params_

{'estimator__normalize': False}

## mlutil

In [12]:
# mlutil: ColumnSelector wraps a transformer together with the columns it
# is given (presumably applying it to just those columns - confirm against
# mlutil). The displayed result keeps a and b first, followed by the
# one-hot columns of c, d and e.
fill_d = ColumnSelector(
    SimpleImputer(missing_values=None, fill_value='none', strategy='constant'),
    ['d'],
)
fill_e = ColumnSelector(
    SimpleImputer(missing_values=np.nan, fill_value=0, strategy='constant'),
    ['e'],
)
# NOTE(review): the new column names look like they are derived from the
# encoder's `categories_` attribute (e.g. c_black, d_none) - verify.
onehot_cde = ColumnSelector(
    OneHotEncoder(sparse=False, handle_unknown='ignore'),
    ['c', 'd', 'e'],
    infer_new_columns='same_attr',
    new_columns_attr='categories_',
)
pipe = Pipeline([
    ('nans_d', fill_d),
    ('nans_e', fill_e),
    ('onehot', onehot_cde),
])
display(pipe.fit_transform(df))

Unnamed: 0,a,b,c_black,c_white,d_cat,d_dog,d_none,e_0.0,e_1.0,e_2.0
0,-0.159194,0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-1.00208,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.176486,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.43731,3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.693517,4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5,-0.740792,5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# with GridSearch
#
# Attach the final estimator only if it is not there yet: an unconditional
# append would add a second 'estimator' step every time this cell is re-run
# and leave the pipeline invalid.
if 'estimator' not in pipe.named_steps:
    pipe.steps.append(('estimator', LinearRegression()))

# The target is random noise, so this only demonstrates that the
# ColumnSelector-based pipeline can be tuned by GridSearchCV; the selected
# value is arbitrary.
m = GridSearchCV(pipe, {'estimator__normalize': [True, False]}, scoring='neg_mean_absolute_error')
_ = m.fit(df, np.random.normal(size=len(df)))
m.best_params_

{'estimator__normalize': True}