### **Predict Kidney Disease**

In [1]:
import pandas as pd

X = pd.read_csv('./datasets/chronic_kidney_X.csv')
y = pd.read_csv('./datasets/chronic_kidney_y.csv').to_numpy().ravel()

In [2]:
# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

age        9
bp        12
sg        47
al        46
su        49
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wc       106
rc       131
rbc      152
pc        65
pcc        4
ba         4
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64


We can see that there are some `NaN` value. Let's impute them.

In [3]:
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.impute import SimpleImputer

# Create a boolean mask for categorical columns
categorical_feature_mask = (X.dtypes == object)

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer using median strategy
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature], SimpleImputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer. The default strategy here is "mode"
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

Now let's combine the imputed data

In [4]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])

Now it's time for a full pipeline

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# Define Dictifier class to turn df into dictionary as part of pipeline
class Dictifier(BaseEstimator, TransformerMixin):       
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if type(X) == pd.core.frame.DataFrame:
            return X.to_dict("records")
        else:
            return pd.DataFrame(X).to_dict("records")

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
import numpy as np
import xgboost as xgb

# Create full pipeline
pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union), # output in df
                     ("dictifier", Dictifier()), # convert df into lists
                     ("vectorizer", DictVectorizer(sort=False)), # convert features lists into scipy format
                     ("clf", xgb.XGBClassifier(use_label_encoder=False,eval_metric='logloss',max_depth=3))
                    ])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=10)

# Print avg. AUC
print("10-fold AUC: ", np.mean(cross_val_scores))

10-fold AUC:  0.9992000000000001


99.92% is pretty good! Let's see what happened if we tune the model

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Create the parameter grid
gbm_param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05),
    'clf__max_depth': np.arange(3, 10, 1),
    'clf__n_estimators': np.arange(50, 200, 50)
}

# Perform RandomizedSearchCV
randomized_roc_auc = RandomizedSearchCV(estimator=pipeline,param_distributions=gbm_param_grid,scoring='roc_auc',verbose=1,cv=10)

# Fit the estimator
randomized_roc_auc.fit(X,y)

# Compute metrics
print("Best rmse: ", np.sqrt(np.abs(randomized_roc_auc.best_score_)))
print("Best model: ", randomized_roc_auc.best_estimator_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best rmse:  0.999599919967984
Best model:  Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('num_mapper',
                                                 DataFrameMapper(df_out=True,
                                                                 features=[(['age'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['bp'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['sg'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['al'],
                                                  

0.0399% improvement in exchange of 40 secs runtime. Is it worth it?