### Reference: https://github.com/rachittoshniwal/machineLearning/blob/master/pipeline.ipynb

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')

In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [7]:
df.shape

(32561, 15)

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(30162, 15)

In [10]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [11]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [13]:
df.columns = df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), 
                                                    df.income, test_size=0.2, random_state=0)

In [16]:
num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']

In [17]:
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [18]:
cat_cols = [col for col in X_train.columns if X_train[col].dtypes=='O']

In [19]:
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [20]:
df[['education', 'education-num']].head()

Unnamed: 0,education,education-num
0,Bachelors,13
1,Bachelors,13
2,HS-grad,9
3,11th,7
4,Bachelors,13


In [21]:
ct = ColumnTransformer([
    ('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),
    ('step2', StandardScaler(), ['capital-gain', 'capital-loss', 'education-num']),
    ('step3', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass', 
                                                                     'marital-status', 'occupation',
                                                                     'relationship', 'race', 
                                                                     'sex', 'native-country'])
], remainder='drop')

In [22]:
ct 

ColumnTransformer(transformers=[('step1', RobustScaler(),
                                 ['age', 'fnlwgt', 'hours-per-week']),
                                ('step2', StandardScaler(),
                                 ['capital-gain', 'capital-loss',
                                  'education-num']),
                                ('step3',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['workclass', 'marital-status', 'occupation',
                                  'relationship', 'race', 'sex',
                                  'native-country'])])

#### pipeline use case 1 - with an 'estimator' as final step