# Scikit-learn Pipeline with Pandas

While `sklearn` prefers working with `numpy` arrays, it is often convenient to keep the data in `pandas` DataFrames for further processing.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper, gen_features

In [2]:
# Toy frame used by every example below. Column 'a' is random, so draw it
# from a seeded generator to make the notebook reproducible on re-run.
SEED = 0
rng = np.random.default_rng(SEED)

df = pd.DataFrame({
    'a': rng.normal(size=5),                   # float column
    'b': np.arange(5),                         # integer column
    'c': ['dog', 'dog', 'cat', 'cat', 'cat'],  # string column, no missing values
    'd': [None, 'dog', 'cat', 'cat', 'cat'],   # string column with one missing value (None)
    'e': [np.nan, 1, 1, 2, 2],                 # numeric column with one missing value (NaN)
})
df

Unnamed: 0,a,b,c,d,e
0,0.023779,0,dog,,
1,0.098603,1,dog,dog,1.0
2,-0.392206,2,cat,cat,1.0
3,-0.835281,3,cat,cat,2.0
4,0.153197,4,cat,cat,2.0


In [10]:
# OneHotEncoder cannot handle missing values: `df` holds a None in 'd' and a
# NaN in 'e', so fitting on the whole frame raises. Catch the error and show
# its message instead of letting the cell fail.
encoder = OneHotEncoder(sparse=False)
try:
    encoder.fit_transform(df)
except ValueError as err:
    print(err)

Input contains NaN, infinity or a value too large for dtype('float64').


In [9]:
# OneHotEncoder accepts columns of any dtype: here a float ('a'), an integer
# ('b') and a string ('c') column are encoded together.
complete_columns = df[['a', 'b', 'c']]
OneHotEncoder(sparse=False).fit_transform(complete_columns)

array([[0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.]])

In [12]:
# Use sklearn-pandas to one-hot encode only column 'c'.
# df_out=True asks the mapper for a DataFrame result; default=None is the
# setting applied to every column that is not listed in the feature spec.
onehot_c = (['c'], OneHotEncoder(sparse=False))
mapper = DataFrameMapper([onehot_c], df_out=True, default=None)
mapper.fit_transform(df)

Unnamed: 0,c_x0_cat,c_x0_dog,a,b,d,e
0,0.0,1.0,0.0237788,0,,
1,0.0,1.0,0.0986032,1,dog,1.0
2,1.0,0.0,-0.392206,2,cat,1.0
3,1.0,0.0,-0.835281,3,cat,2.0
4,1.0,0.0,0.153197,4,cat,2.0


In [44]:
# Transform several selected columns with sklearn-pandas by chaining two
# mappers in a Pipeline: impute missing values first, then one-hot encode.

# Step 'nans': fill the None in 'd' with the string 'none' and the NaN in 'e'
# with 0, so the encoders in the next step see no missing values.
fill_missing = DataFrameMapper(
    [
        (['d'], SimpleImputer(missing_values=None, strategy='constant', fill_value='none')),
        (['e'], SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ],
    df_out=True,
    default=None,
)

# Step 'onehot': a separate OneHotEncoder instance for each of 'c', 'd', 'e'.
encode_onehot = DataFrameMapper(
    [([column], OneHotEncoder(sparse=False)) for column in ('c', 'd', 'e')],
    df_out=True,
    default=None,
)

pipe = Pipeline([('nans', fill_missing), ('onehot', encode_onehot)])

encoded = pipe.fit_transform(df)
display(encoded)
# Column names produced by the fitted one-hot mapper.
print(pipe.named_steps['onehot'].transformed_names_)

Unnamed: 0,c_x0_cat,c_x0_dog,d_x0_cat,d_x0_dog,d_x0_none,e_x0_0.0,e_x0_1.0,e_x0_2.0,a,b
0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0237788,0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0986032,1
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.392206,2
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.835281,3
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.153197,4


['c_x0_cat', 'c_x0_dog', 'd_x0_cat', 'd_x0_dog', 'd_x0_none', 'e_x0_0.0', 'e_x0_1.0', 'e_x0_2.0', 'a', 'b']


In [48]:
# Transform several selected columns with sklearn-pandas (v2): the one-hot
# feature list is built with gen_features instead of being written out per column.

# Imputation specs: 'd' gets the string 'none' for None, 'e' gets 0 for NaN.
imputers = [
    (['d'], SimpleImputer(missing_values=None, strategy='constant', fill_value='none')),
    (['e'], SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
]

# One OneHotEncoder(sparse=False) spec per listed column.
onehot_features = gen_features(
    columns=[['c'], ['d'], ['e']],
    classes=[{'class': OneHotEncoder, 'sparse': False}],
)

pipe = Pipeline([
    ('nans', DataFrameMapper(imputers, df_out=True, default=None)),
    ('onehot', DataFrameMapper(onehot_features, df_out=True, default=None)),
])

display(pipe.fit_transform(df))
# Column names produced by the fitted one-hot mapper.
print(pipe.named_steps['onehot'].transformed_names_)

Unnamed: 0,c_x0_cat,c_x0_dog,d_x0_cat,d_x0_dog,d_x0_none,e_x0_0.0,e_x0_1.0,e_x0_2.0,a,b
0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0237788,0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0986032,1
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.392206,2
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.835281,3
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.153197,4


['c_x0_cat', 'c_x0_dog', 'd_x0_cat', 'd_x0_dog', 'd_x0_none', 'e_x0_0.0', 'e_x0_1.0', 'e_x0_2.0', 'a', 'b']
