In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import xgboost as xgb
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.impute import SimpleImputer

# Ames, Iowa Housing Prices Dataset

In [5]:
# Load the Ames housing data from the working directory, then show its shape and first rows.
ames = pd.read_csv('./ames.csv')
print(ames.shape)
ames.head()

(1460, 21)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,...,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,Fireplaces,GarageArea,PavedDrive,SalePrice
0,60,RL,65.0,8450,CollgCr,1Fam,2Story,7,5,2003,...,1710,1,0,2,1,3,0,548,Y,208500
1,20,RL,80.0,9600,Veenker,1Fam,1Story,6,8,1976,...,1262,0,1,2,0,3,1,460,Y,181500
2,60,RL,68.0,11250,CollgCr,1Fam,2Story,7,5,2001,...,1786,1,0,2,1,3,1,608,Y,223500
3,70,RL,60.0,9550,Crawfor,1Fam,2Story,7,5,1915,...,1717,1,0,1,0,3,1,642,Y,140000
4,60,RL,84.0,14260,NoRidge,1Fam,2Story,8,5,2000,...,2198,1,0,2,1,4,1,836,Y,250000


In [6]:
# Per-column dtype and non-null count; any column below the row count has missing values.
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    1460 non-null   int64  
 1   MSZoning      1460 non-null   object 
 2   LotFrontage   1201 non-null   float64
 3   LotArea       1460 non-null   int64  
 4   Neighborhood  1460 non-null   object 
 5   BldgType      1460 non-null   object 
 6   HouseStyle    1460 non-null   object 
 7   OverallQual   1460 non-null   int64  
 8   OverallCond   1460 non-null   int64  
 9   YearBuilt     1460 non-null   int64  
 10  Remodeled     1460 non-null   int64  
 11  GrLivArea     1460 non-null   int64  
 12  BsmtFullBath  1460 non-null   int64  
 13  BsmtHalfBath  1460 non-null   int64  
 14  FullBath      1460 non-null   int64  
 15  HalfBath      1460 non-null   int64  
 16  BedroomAbvGr  1460 non-null   int64  
 17  Fireplaces    1460 non-null   int64  
 18  GarageArea    1460 non-null 

## Encoding categorical columns with `LabelEncoder`

In [31]:
# Work on an independent copy so the raw `ames` frame stays untouched.
ames_ = ames.copy(deep=True)

In [32]:
# Replace missing LotFrontage entries with 0 (only this column is touched)
ames_ = ames_.fillna({'LotFrontage': 0})

In [33]:
# boolean Series indexed by column name: True where the column dtype is object
categorical_mask = ames_.dtypes.eq('object')

In [34]:
# names of the object-dtype columns, as a plain Python list
categorical_columns = list(ames_.columns[categorical_mask])

In [35]:
# preview the categorical columns before encoding
ames_.loc[:, categorical_columns].head()

Unnamed: 0,MSZoning,Neighborhood,BldgType,HouseStyle,PavedDrive
0,RL,CollgCr,1Fam,2Story,Y
1,RL,Veenker,1Fam,1Story,Y
2,RL,CollgCr,1Fam,2Story,Y
3,RL,Crawfor,1Fam,2Story,Y
4,RL,NoRidge,1Fam,2Story,Y


In [36]:
# create a single LabelEncoder instance; it is reused (and refit) for each categorical column below
le = LabelEncoder()

In [37]:
# Integer-encode each categorical column in turn.
# `le` is refit on every column, so afterwards it only holds the classes of the last one.
ames_[categorical_columns] = ames_[categorical_columns].apply(le.fit_transform)

In [38]:
# same preview after label encoding: the strings are now integer codes
ames_.loc[:, categorical_columns].head()

Unnamed: 0,MSZoning,Neighborhood,BldgType,HouseStyle,PavedDrive
0,3,5,0,5,2
1,3,24,0,2,2
2,3,5,0,5,2
3,3,6,0,5,2
4,3,15,0,5,2


## Encoding categorical columns next with `OneHotEncoder`

The `categorical_features` keyword of `OneHotEncoder` was deprecated in scikit-learn 0.20. Instead, wrap `OneHotEncoder` in a `ColumnTransformer` that selects the categorical columns and passes the remaining columns through unchanged.

In [58]:
# One-hot encode only the categorical columns; every other column is passed through as-is.
onehot_spec = ('onehot', OneHotEncoder(), categorical_columns)
ohe = ColumnTransformer(transformers=[onehot_spec], remainder='passthrough')

In [59]:
# fit the transformer on the label-encoded frame and encode it in one step
# (the categorical columns are one-hot encoded, the rest are passed through)
df_encoded = ohe.fit_transform(ames_)

In [64]:
# check which container type the transformer returned
type(df_encoded)

numpy.ndarray

In [65]:
# first five encoded rows, all columns
print(df_encoded[:5])

[[0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 6.000e+01 6.500e+01 8.450e+03
  7.000e+00 5.000e+00 2.003e+03 0.000e+00 1.710e+03 1.000e+00 0.000e+00
  2.000e+00 1.000e+00 3.000e+00 0.000e+00 5.480e+02 2.085e+05]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 1.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 

In [66]:
# shape of the label-encoded frame, for comparison with the one-hot encoded array
ames_.shape

(1460, 21)

In [67]:
# shape after one-hot encoding: same number of rows, more columns
df_encoded.shape

(1460, 62)

## Encoding categorical columns with `DictVectorizer`
The two-step process of `LabelEncoder` followed by `OneHotEncoder` can be replaced by a single `DictVectorizer`, which one-hot encodes string values and leaves numeric values unchanged.

In [69]:
# Start again from the raw data, with missing LotFrontage values replaced by 0.
ames_ = ames.assign(LotFrontage=ames['LotFrontage'].fillna(0))

In [70]:
# one dict per row, keyed by column name (the input format DictVectorizer expects)
df_dict = ames_.to_dict(orient='records')

In [71]:
# create DictVectorizer object; sparse=False makes it return a dense array
dv = DictVectorizer(sparse=False)

In [72]:
# learn the feature vocabulary from the row dicts and encode them in one step
df_encoded = dv.fit_transform(df_dict)

In [73]:
# first five encoded rows, all columns
print(df_encoded[:5])

[[3.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 2.000e+00 5.480e+02 1.710e+03 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  8.450e+03 6.500e+01 6.000e+01 0.000e+00 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.000e+00 7.000e+00
  0.000e+00 0.000e+00 1.000e+00 0.000e+00 2.085e+05 2.003e+03]
 [3.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  1.000e+00 1.000e+00 2.000e+00 4.600e+02 1.262e+03 0.000e+00 0.000e+00
  0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  9.600e+03 8.000e+01 2.000e+01 0.000e+00 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 

In [74]:
# feature name -> column index in the encoded array ('column=value' entries are one-hot columns)
dv.vocabulary_

{'MSSubClass': 23,
 'MSZoning=RL': 27,
 'LotFrontage': 22,
 'LotArea': 21,
 'Neighborhood=CollgCr': 34,
 'BldgType=1Fam': 1,
 'HouseStyle=2Story': 18,
 'OverallQual': 55,
 'OverallCond': 54,
 'YearBuilt': 61,
 'Remodeled': 59,
 'GrLivArea': 11,
 'BsmtFullBath': 6,
 'BsmtHalfBath': 7,
 'FullBath': 9,
 'HalfBath': 12,
 'BedroomAbvGr': 0,
 'Fireplaces': 8,
 'GarageArea': 10,
 'PavedDrive=Y': 58,
 'SalePrice': 60,
 'Neighborhood=Veenker': 53,
 'HouseStyle=1Story': 15,
 'Neighborhood=Crawfor': 35,
 'Neighborhood=NoRidge': 44,
 'Neighborhood=Mitchel': 40,
 'HouseStyle=1.5Fin': 13,
 'Neighborhood=Somerst': 50,
 'Neighborhood=NWAmes': 43,
 'MSZoning=RM': 28,
 'Neighborhood=OldTown': 46,
 'Neighborhood=BrkSide': 32,
 'BldgType=2fmCon': 2,
 'HouseStyle=1.5Unf': 14,
 'Neighborhood=Sawyer': 48,
 'Neighborhood=NridgHt': 45,
 'Neighborhood=NAmes': 41,
 'BldgType=Duplex': 3,
 'Neighborhood=SawyerW': 49,
 'Neighborhood=IDOTRR': 38,
 'PavedDrive=N': 56,
 'Neighborhood=MeadowV': 39,
 'BldgType=TwnhsE': 

## Preprocessing within a pipeline

In [79]:
# Fresh frame derived from the raw data, with missing LotFrontage values replaced by 0.
ames_ = ames.fillna({'LotFrontage': 0})

In [80]:
# target is the sale price; every other column is a feature
y = ames_['SalePrice']
X = ames_.drop(columns=['SalePrice'])

In [81]:
# feature matrix: one column fewer than ames_ (SalePrice removed)
X.shape

(1460, 20)

In [82]:
# target vector: one value per row of X
y.shape

(1460,)

In [85]:
# pipeline stages: vectorize the row dicts, then fit a default XGBoost regressor
vectorizer = DictVectorizer(sparse=False)
regressor = xgb.XGBRegressor()
steps = [('ohe_onestep', vectorizer), ('xgb_model', regressor)]

In [86]:
# chain the stages into a single estimator
xgb_pipeline = Pipeline(steps=steps)

In [87]:
# Fit on the whole data set.
# The first stage is a DictVectorizer, so X must be passed as a list of row dicts.
xgb_pipeline.fit(X.to_dict(orient='records'), y)



Pipeline(steps=[('ohe_onestep', DictVectorizer(sparse=False)),
                ('xgb_model', XGBRegressor())])

## Cross-validating an `XGBoost` model

In [89]:
# setup pipeline steps
# 'reg:squarederror' is the current name of the squared-error regression objective;
# the former alias 'reg:linear' is deprecated in XGBoost.
steps = [('ohe_onestep', DictVectorizer(sparse=False)),
         ('xgb_model', xgb.XGBRegressor(max_depth=2,
                                        objective='reg:squarederror'))]

# create pipeline model
xgb_pipeline = Pipeline(steps)

In [90]:
# 10-fold cross-validation; scores are negated MSE values (higher is better)
records = X.to_dict('records')
cross_val_scores = cross_val_score(xgb_pipeline, records, y,
                                   scoring='neg_mean_squared_error', cv=10)



In [91]:
# turn each fold's negated MSE into an RMSE, then average across folds
fold_rmse = np.sqrt(np.abs(cross_val_scores))
print(f'10-fold RMSE: {np.mean(fold_rmse)}')

10-fold RMSE: 29867.603720688923


# [Chronic Kidney Disease](https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease) Dataset

In [5]:
# show every column and every row when a frame is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
# Column names are supplied explicitly via `names=`; '?' entries are read as missing values.
names = [
    'age', 'bp', 'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba',
    'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc',
    'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
    'class',
]

kidney = pd.read_csv(
    './chronic_kidney_disease.csv',
    names=names,
    na_values=['?'],
)
print(kidney.shape)
kidney.head()

(400, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [7]:
# Predictor columns, in the order they are fed to the model, followed by the label column.
kidney_feature_names = [
    'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu',
    'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc',
    'rbc', 'pc', 'pcc', 'ba', 'htn',
    'dm', 'cad', 'appet', 'pe', 'ane',
]
kidney_target_name = 'class'

In [8]:
# Features as a frame; labels as a numeric array with 'ckd' -> 0 and 'notckd' -> 1.
# Any label outside this mapping becomes NaN.
class_codes = {'ckd': 0, 'notckd': 1}
X = kidney.loc[:, kidney_feature_names]
y = kidney[kidney_target_name].map(class_codes).to_numpy()

In [9]:
# shape and first rows of the feature frame (missing values are still present at this point)
print(X.shape)
X.head()

(400, 24)


Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,,,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,normal,normal,notpresent,notpresent,no,no,no,good,no,no


In [10]:
# count the missing entries in every feature column
nulls_per_columns = X.isnull().sum(axis=0)
nulls_per_columns

age        9
bp        12
sg        47
al        46
su        49
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wc       106
rc       131
rbc      152
pc        65
pcc        4
ba         4
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64

## Imputation of numerical and categorical features

In [11]:
# True for object-dtype (string-valued) columns, False for the numeric ones
categorical_feature_mask = X.dtypes.eq(object)

# split the column names into the two groups, keeping their original order
categorical_columns = list(X.columns[categorical_feature_mask])
non_categorical_columns = list(X.columns[~categorical_feature_mask])

In [12]:
# Median-impute each numeric column with its own SimpleImputer.
# The column is selected as a one-element list so the imputer receives 2-D input.
numeric_imputers = [
    ([numeric_feature], SimpleImputer(missing_values=np.nan, strategy='median'))
    for numeric_feature in non_categorical_columns
]
numeric_imputation_mapper = DataFrameMapper(numeric_imputers, input_df=True, df_out=True)

In [13]:
# Impute each categorical column with its own CategoricalImputer.
# NOTE(review): CategoricalImputer appears to have been removed from newer sklearn-pandas
# releases — confirm the installed version still provides it.
categorical_imputers = [
    (category_feature, CategoricalImputer())
    for category_feature in categorical_columns
]
categorical_imputation_mapper = DataFrameMapper(categorical_imputers, input_df=True, df_out=True)

## Feature Union

In [14]:
# run both imputation mappers on the same input and concatenate their outputs column-wise
union_members = [
    ('num_mapper', numeric_imputation_mapper),
    ('cat_mapper', categorical_imputation_mapper),
]
num_cat_union = FeatureUnion(transformer_list=union_members)

## Full pipeline

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

# Define Dictifier class to turn df into dictionary as part of pipeline
class Dictifier(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that converts tabular input into a list of row dicts.

    The output (one ``{column: value}`` dict per row) is the input format
    expected by ``DictVectorizer``.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` as a list of per-row dictionaries.

        Parameters
        ----------
        X : pandas.DataFrame or anything accepted by ``pd.DataFrame(...)``
            A DataFrame is used as-is (keys are its column labels); any other
            input is first wrapped in a DataFrame, so keys are whatever labels
            the DataFrame constructor assigns.

        Returns
        -------
        list of dict
            One dictionary per row.
        """
        # isinstance (rather than an exact type comparison) also accepts DataFrame subclasses.
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return frame.to_dict("records")

In [16]:
# pipeline stages in execution order: impute -> row dicts -> vectorize -> classify
steps = [
    ('featureunion', num_cat_union),
    ('dictifier', Dictifier()),
    ('vectorizer', DictVectorizer(sort=False)),
    ('clf', xgb.XGBClassifier(max_depth=3)),
]

pipeline = Pipeline(steps=steps)

In [17]:
# 3-fold cross-validation of the full pipeline, scored by ROC AUC
cv_settings = dict(scoring='roc_auc', cv=3)
cv_scores = cross_val_score(pipeline, X, y, **cv_settings)

In [19]:
# average the per-fold AUC values
print(f'3-fold AUC: {cv_scores.mean()}')

3-fold AUC: 0.998637406769937


## Tuning the pipeline with random search

In [20]:
# Candidate values for the classifier step; the 'clf__' prefix routes each
# parameter to the pipeline step named 'clf'.
learning_rates = np.arange(0.05, 1, 0.05)
param_grid = {
    'clf__learning_rate': learning_rates,
    'clf__max_depth': range(3, 10),
    'clf__n_estimators': range(50, 200, 50),
}

In [26]:
# Randomized search over `param_grid`: 6 sampled candidates, 2-fold CV, scored by ROC AUC.
# random_state fixes which candidates are sampled, so a re-run of the notebook
# evaluates the same parameter combinations.
random_roc_auc = RandomizedSearchCV(estimator=pipeline,
                                    param_distributions=param_grid,
                                    n_iter=6,
                                    cv=2,
                                    scoring='roc_auc',
                                    verbose=1,
                                    n_jobs=-1,
                                    random_state=42)

In [27]:
# run the search: the whole pipeline is refit for every sampled candidate on every fold
random_roc_auc.fit(X, y)

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   13.4s finished


RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('featureunion',
                                              FeatureUnion(transformer_list=[('num_mapper',
                                                                              DataFrameMapper(df_out=True,
                                                                                              features=[(['age'],
                                                                                                         SimpleImputer(strategy='median')),
                                                                                                        (['bp'],
                                                                                                         SimpleImputer(strategy='median')),
                                                                                                        (['sg'],
                                                                                              

In [28]:
# the pipeline configured with the best-scoring parameter combination
print(random_roc_auc.best_estimator_)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('num_mapper',
                                                 DataFrameMapper(df_out=True,
                                                                 features=[(['age'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['bp'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['sg'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['al'],
                                                                            SimpleImputer(strategy='median')),
                                             

In [29]:
# mean cross-validated ROC AUC of the best candidate
print(f'Best score: {random_roc_auc.best_score_}')

Best score: 0.9980266666666666
