numerical_cols

In [1]:
from pipeline_perso import CustomImputer
import numpy as np
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw table. The loaded frame is never mutated, so later cells can
# always get back to the data exactly as it was read from disk.
kidney_raw = pd.read_csv('kidney_disease.csv')

# Remove rows with missing target, separate target from predictors.
# Non-mutating calls (no `inplace=True`) keep each name bound to one stage.
kidney_labelled = kidney_raw.dropna(axis=0, subset=['classification'])
y = kidney_labelled['classification']
X_full = kidney_labelled.drop(columns=['classification'])

# Break off validation set from training data (80/20, fixed seed so the
# split is reproducible).
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

# "Cardinality" means the number of unique values in a column.
# Select categorical columns with relatively low cardinality (convenient but arbitrary).
# NOTE(review): values are used as read from the CSV; the X_train display shows
# a '\tno' entry in `cad`, so stray whitespace may need stripping -- confirm.
categorical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10
    and X_train_full[cname].dtype == "object"
]

# Select numerical columns.
# NOTE(review): this also selects `id`, which looks like a row identifier
# rather than a measurement -- confirm whether it should be a predictor.
numerical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Keep selected columns only; copies so later edits do not touch the full splits.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()


In [10]:
# One imputer per column group: the numeric one is configured with the
# 'median' strategy, the categorical one with 'most_frequent'. Both treat
# NaN as the missing-value marker.
num_imputer = CustomImputer(strategy='median', missing_values=np.nan)
cat_imputer = CustomImputer(strategy='most_frequent', missing_values=np.nan)

# Route each column group to its own imputer.
preprocessor = ColumnTransformer([
    ('num', num_imputer, numerical_cols),
    ('cat', cat_imputer, categorical_cols),
])


In [11]:

from sklearn.metrics import mean_absolute_error

# Wrap the preprocessing stage in a pipeline. Only the 'preprocessor' step is
# registered here; no model step is attached in this cell.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
])



In [12]:
# Fit the pipeline on the training split. `fit` is the cell's last expression,
# so the fitted pipeline's repr is what gets displayed.
my_pipeline.fit(X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  CustomImputer(strategy='median'),
                                                  ['id', 'age', 'bp', 'sg',
                                                   'al', 'su', 'bgr', 'bu',
                                                   'sc', 'sod', 'pot',
                                                   'hemo']),
                                                 ('cat',
                                                  CustomImputer(strategy='most_frequent'),
                                                  ['rbc', 'pc', 'pcc', 'ba',
                                                   'htn', 'dm', 'cad', 'appet',
                                                   'pe', 'ane'])]))])

In [13]:
# Example of how we can use the custom imputer on numerical data.
# ColumnTransformer fits clones of the transformers it is given, so the fitted
# numeric imputer is fetched from the fitted pipeline instead of relying on the
# module-level `num_imputer` having been fitted somewhere else.
fitted_num_imputer = my_pipeline.named_steps['preprocessor'].named_transformers_['num']
fitted_num_imputer.transform(X_train[numerical_cols])

array([[336. ,  25. ,  60. , ..., 138. ,   4.4,  15.2],
       [ 64. ,  55. ,  80. , ..., 138. ,   4.4,   9.8],
       [ 55. ,  35. ,  80. , ..., 138. ,   4.4,   9.5],
       ...,
       [117. ,  54. ,  70. , ..., 139. ,   3.7,  12.5],
       [ 47. ,  11. ,  80. , ..., 138. ,   4.4,  15. ],
       [172. ,  62. ,  80. , ..., 130. ,   2.5,  10.6]])

In [14]:
# Same example for the categorical columns. ColumnTransformer fits clones of
# the transformers it is given, so the fitted categorical imputer is fetched
# from the fitted pipeline instead of using the module-level `cat_imputer`.
fitted_cat_imputer = my_pipeline.named_steps['preprocessor'].named_transformers_['cat']
fitted_cat_imputer.transform(X_train[categorical_cols])

array([['normal', 'normal', 'notpresent', ..., 'good', 'no', 'no'],
       ['normal', 'normal', 'notpresent', ..., 'good', 'no', 'no'],
       ['abnormal', 'normal', 'notpresent', ..., 'good', 'yes', 'no'],
       ...,
       ['normal', 'normal', 'notpresent', ..., 'good', 'no', 'no'],
       ['normal', 'normal', 'notpresent', ..., 'good', 'no', 'no'],
       ['normal', 'normal', 'notpresent', ..., 'good', 'no', 'no']],
      dtype=object)

In [8]:
# Display the names of the object-dtype columns with fewer than 10 unique
# values that were selected as categorical features.
categorical_cols

['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

In [9]:
# Display the training predictors restricted to the selected categorical and
# numerical columns (values are still un-imputed here, so NaNs are visible).
X_train

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,...,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
336,normal,normal,notpresent,notpresent,no,no,no,good,no,no,...,60.0,1.020,0.0,0.0,119.0,27.0,0.5,,,15.2
64,,normal,notpresent,notpresent,no,no,\tno,good,no,no,...,80.0,1.010,0.0,0.0,146.0,,,,,9.8
55,abnormal,normal,notpresent,notpresent,no,no,no,good,yes,no,...,80.0,1.005,3.0,0.0,,,,,,9.5
106,,,notpresent,notpresent,yes,yes,no,good,yes,yes,...,90.0,,,,89.0,118.0,6.1,127.0,4.4,6.0
300,normal,normal,,,no,no,no,good,no,no,...,60.0,1.020,0.0,0.0,114.0,26.0,0.7,141.0,4.2,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,normal,normal,notpresent,notpresent,no,no,no,good,no,no,...,80.0,1.025,0.0,0.0,130.0,30.0,1.1,143.0,5.0,15.9
192,,normal,notpresent,notpresent,no,no,no,good,no,no,...,110.0,1.015,0.0,0.0,130.0,16.0,0.9,,,
117,,,notpresent,notpresent,no,no,no,good,no,no,...,70.0,1.020,0.0,0.0,219.0,36.0,1.3,139.0,3.7,12.5
47,,normal,notpresent,notpresent,no,no,no,good,no,no,...,80.0,1.010,3.0,0.0,,17.0,0.8,,,15.0
