In [11]:
import pandas as pd

data = {'Name':['Anaan','Bob','Charlie','Diana','Eric'],
        'Age' : [20,34,23,None,33],
        'Gender': ['f','m','m','f','m'],
        'Job':['Programmer','Writer','Cook','Programmer','Teacher']}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anaan,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


Preprocessing Pipeline : 

- Drop Name Feature
- Impute Ages
- Turn Gender into Binary/Numeric
- One Hot Encode Jobs

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(['Name'], axis = 1)

# Impute Ages
Imputer = SimpleImputer(strategy = 'mean')
df['Age'] = Imputer.fit_transform(df[['Age']])

# Turn Gender into Numeric
gender_dot = {'m':0, 'f':1}
df['Gender'] = [gender_dot[i] for i in df['Gender']]

# One Hot Encode Jobs
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()
column_names = ['Programmer','Writer','Cook','Teacher']

for i in range(len(matrix.T)): # len(행렬)의 결과는 행 수
    df[column_names[i]] = matrix.T[i]

df = df.drop(['Job'], axis=1)

df

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(['Name'], axis=1)

class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Imputer = SimpleImputer(strategy = 'mean')
        X['Age'] = Imputer.fit_transform(X[['Age']])
        return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        gender_dot = {'m':0, 'f':1}
        X['Gender'] = [gender_dot[i] for i in X['Gender']]

        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Job']]).toarray()
        column_names = ['Programmer','Writer','Cook','Teacher']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        return X.drop(['Job'], axis=1)

In [17]:
data = {'Name':['Minjee','Junyoung','Eunjee','Yuhee','Haeun'],
        'Age' : [26,29,24,None,None],
        'Gender': ['f','m','f','f','f'],
        'Job':['Programmer','Teacher','Programmer','Cook','Programmer']}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Name,Age,Gender,Job
0,Minjee,26.0,f,Programmer
1,Junyoung,29.0,m,Teacher
2,Eunjee,24.0,f,Programmer
3,Yuhee,,f,Cook
4,Haeun,,f,Programmer


In [20]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

enc.fit_transform(imp.fit_transform(dropper.fit_transform(df2)))

Unnamed: 0,Age,Gender,Programmer,Writer,Cook
0,26.0,1,0.0,1.0,0.0
1,29.0,0,0.0,0.0,1.0
2,24.0,1,0.0,1.0,0.0
3,26.333333,1,1.0,0.0,0.0
4,26.333333,1,0.0,1.0,0.0


In [21]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('dropper', NameDropper()),
    ('imp', AgeImputer()),
    ('enc', FeatureEncoder())
])

pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook
0,26.0,1,0.0,1.0,0.0
1,29.0,0,0.0,0.0,1.0
2,24.0,1,0.0,1.0,0.0
3,26.333333,1,1.0,0.0,0.0
4,26.333333,1,0.0,1.0,0.0
