# Clean Data: The Foundation of Machine Learning

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from missingpy import MissForest



## Load Dataset

In [2]:
missing_values = ["n/a", "na", "--", "?"]

In [3]:
df = pd.read_csv('Data/heart_disease.csv', na_values = missing_values)

## Results of EDA

In [4]:
numeric_features_final = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
nominal_features_final = ['sex', 'cp', 'exang', 'slope', 'thal', 'fbs', 'restecg']
discrete_features_final = ['ca']

### Nominal Features

* Apply OneHot Encoder 

### Discrete Features

* Apply Standerd Scaler

### Numeric Features

* Try Power Transformer with Normalizer
* Try Power Transformer with Robust scaler

### Experiment with

* PCA + RFECV
* Outlier + PCA + RFECV

## Delete Columns That Contain a Single Value

In [5]:
# get number of unique values for each column
counts = df.nunique()
# record columns to delete
to_del = [i for i,v in enumerate(counts) if v == 1]
print (to_del)
print(df.shape)
# drop useless columns
df.drop(to_del, axis=1, inplace=True)
print(df.shape)

[]
(303, 14)
(303, 14)


## Delete Duplicate Rows

In [6]:
df.duplicated().sum()

0

## Missing Data Imputation

In [7]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,male,typical angina,145.0,233.0,True,2.0,150.0,no,2.3,downsloping,0.0,fixed defect,0
1,67.0,male,asymptomatic,160.0,286.0,False,2.0,108.0,yes,1.5,flat,3.0,normal,1
2,67.0,male,asymptomatic,120.0,229.0,False,2.0,129.0,yes,2.6,flat,2.0,reversable defect,1
3,37.0,male,non-anginal pain,130.0,250.0,False,0.0,187.0,no,3.5,downsloping,0.0,normal,0
4,41.0,female,atypical angina,130.0,204.0,False,2.0,172.0,no,1.4,upsloping,0.0,normal,0


In [8]:
df.isna().sum().sum()

6

In [9]:
null_columns = {}

all_columns = df.isnull().sum().sort_values(ascending=False)
for item in all_columns.index:
    if all_columns[item] > 0:
        null_columns[item] = 100* all_columns[item]/len(df)
        
null_columns

{'ca': 1.3201320132013201, 'thal': 0.6600660066006601}

In [10]:
df["ca"].value_counts()

0.0    176
1.0     65
2.0     38
3.0     20
Name: ca, dtype: int64

In [11]:
df["thal"].value_counts()

normal               166
reversable defect    117
fixed defect          18
Name: thal, dtype: int64

thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

In [12]:
df['thal'] = df['thal'].map({'normal':3, 'fixed defect':6, 'reversable defect':7})

In [13]:
df["thal"].value_counts()

3.0    166
7.0    117
6.0     18
Name: thal, dtype: int64

In [14]:
X = df.drop("num", axis=1)
y = df["num"]

In [15]:
cat_features = nominal_features_final.copy()

In [16]:
nominal_features_final

['sex', 'cp', 'exang', 'slope', 'thal', 'fbs', 'restecg']

In [17]:
cat_features.remove("thal")

In [18]:
cat_features

['sex', 'cp', 'exang', 'slope', 'fbs', 'restecg']

### Statistical Imputation

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

In [20]:
strategies = ['mean', 'median', 'most_frequent', 'constant']

In [21]:
for s in strategies:
    transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(drop='first', sparse=False), 
                                                   cat_features)], remainder='passthrough')
    # create the modeling pipeline
    pipeline = Pipeline(steps=[("trans", transformer), 
                               ('i', SimpleImputer(strategy=s)), 
                               ('m', RandomForestClassifier())])
    # evaluate the model
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))

>mean 0.821 (0.062)
>median 0.809 (0.064)
>most_frequent 0.814 (0.074)
>constant 0.816 (0.072)


### KNN Imputer

In [22]:
from sklearn.impute import KNNImputer

In [23]:
strategies = [i for i in range(1, 21)]

In [24]:
for s in strategies:
    transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(drop='first', sparse=False), 
                                                   cat_features)], remainder='passthrough')
    # create the modeling pipeline
    pipeline = Pipeline(steps=[("trans", transformer),
                               ('i', KNNImputer(n_neighbors=int(s))), 
                               ('m', RandomForestClassifier())])
    # evaluate the model
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))

>1 0.809 (0.067)
>2 0.816 (0.067)
>3 0.814 (0.069)
>4 0.804 (0.067)
>5 0.801 (0.074)
>6 0.795 (0.078)
>7 0.817 (0.065)
>8 0.809 (0.068)
>9 0.803 (0.075)
>10 0.799 (0.073)
>11 0.812 (0.071)
>12 0.809 (0.071)
>13 0.804 (0.067)
>14 0.814 (0.070)
>15 0.807 (0.072)
>16 0.812 (0.071)
>17 0.821 (0.076)
>18 0.813 (0.069)
>19 0.812 (0.066)
>20 0.803 (0.065)


### Iterative Imputation

In [25]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [26]:
strategies = ['ascending', 'descending', 'roman', 'arabic', 'random']

In [27]:
for s in strategies:
    transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(drop='first', sparse=False), 
                                                   cat_features)], remainder='passthrough')
    # create the modeling pipeline
    pipeline = Pipeline(steps=[("trans", transformer),
                               ('i', IterativeImputer(imputation_order=s)), 
                               ('m', RandomForestClassifier())])
    # evaluate the model
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))

>ascending 0.815 (0.070)
>descending 0.811 (0.064)
>roman 0.820 (0.063)
>arabic 0.816 (0.062)
>random 0.811 (0.062)


### MissForest Imputation

In [28]:
from missingpy import MissForest

In [29]:
transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(drop='first', sparse=False), 
                                                   cat_features)], remainder='passthrough')
# create the modeling pipeline
pipeline = Pipeline(steps=[("trans", transformer),
                               ('i', MissForest()), 
                               ('m', RandomForestClassifier())])
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('%.3f (%.3f)' % (np.mean(scores), np.std(scores)))

nan (nan)


### Missing Data Imputation Results

In [30]:
imputer = IterativeImputer(imputation_order='random')
df_null_imputation = X[["ca", "thal"]]
# fit on the dataset
imputer.fit(df_null_imputation)
# transform the dataset
Xtrans = imputer.transform(df_null_imputation)

In [31]:
df_new = pd.DataFrame(Xtrans, columns=["ca", "thal"])

In [32]:
df_new.head()

Unnamed: 0,ca,thal
0,0.0,6.0
1,3.0,3.0
2,2.0,7.0
3,0.0,3.0
4,0.0,3.0


In [33]:
df_new["thal"].value_counts()

3.000000    166
7.000000    117
6.000000     18
4.388078      2
Name: thal, dtype: int64

In [34]:
df_new["thal_new"] = df_new["thal"].apply(lambda x: 3.0 if x == 4.388078 else x)

In [35]:
idx = np.where((df_new["thal_new"] > 3.0) & (df_new["thal_new"]<5.0))

In [36]:
idx

(array([ 87, 266]),)

In [37]:
df_new.loc[87]["thal_new"] = 3.0
df_new.loc[266]["thal_new"] = 3.0

In [38]:
df_new["thal_new"].value_counts()

3.0    168
7.0    117
6.0     18
Name: thal_new, dtype: int64

In [39]:
df_new["ca"].value_counts()

0.000000    176
1.000000     65
2.000000     38
3.000000     20
0.941760      2
0.467182      2
Name: ca, dtype: int64

In [45]:
df_new["ca_new"] = df_new["ca"].copy()

In [46]:
idx = np.where((df_new["ca_new"] > 0.0) & (df_new["ca_new"]<1.0))

In [47]:
df_new.loc[idx]

Unnamed: 0,ca,thal,thal_new,ca_new
166,0.467182,3.0,3.0,0.467182
192,0.94176,7.0,7.0,0.94176
287,0.94176,7.0,7.0,0.94176
302,0.467182,3.0,3.0,0.467182


In [48]:
df_new.loc[166]["ca_new"] = 0.0
df_new.loc[302]["ca_new"] = 0.0
df_new.loc[192]["ca_new"] = 1.0
df_new.loc[287]["ca_new"] = 1.0

In [49]:
df_new["ca_new"].value_counts()

0.0    178
1.0     67
2.0     38
3.0     20
Name: ca_new, dtype: int64

In [50]:
df["ca"] = df_new["ca_new"]

In [51]:
df["thal"] = df_new["thal_new"]

In [52]:
df.isnull().sum().sum()

0

##  Minimum = 0, in Numeric Column

In [53]:
statistics = df.describe()
min_value_zero_columns = [item for item in statistics if statistics[item]['min'] == 0]
min_value_zero_columns

['restecg', 'oldpeak', 'ca', 'num']

ca: number of major vessels (0-3) colored by flourosopy.
"ca" can contain zero (0) value as per definition

num: diagnosis of heart disease (angiographic disease status).
simply attemp to distinguish presence (values 1) from absence (value 0)
So, by defintion "num" can contain zero (0) value

restecg: resting electrocardiographic results -- Value 0: normal. So, "restecg" can contain zero (0) valjues

oldpeak = ST depression induced by exercise relative to rest

In [54]:
df["oldpeak"].value_counts()

0.0    99
1.2    17
1.0    14
0.6    14
0.8    13
1.4    13
0.2    12
1.6    11
1.8    10
2.0     9
0.4     9
0.1     7
2.8     6
2.6     6
0.5     5
3.0     5
1.5     5
1.9     5
2.2     4
3.6     4
3.4     3
0.3     3
2.4     3
0.9     3
4.0     3
4.2     2
2.3     2
3.2     2
2.5     2
1.1     2
3.5     1
1.3     1
5.6     1
0.7     1
2.9     1
3.1     1
3.8     1
2.1     1
6.2     1
4.4     1
Name: oldpeak, dtype: int64

"Need to consult with domain expert regarding zero (0) values"

## Data Preprocessing Results

In [55]:
numeric_features_final = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
nominal_features_final = ['sex', 'cp', 'exang', 'slope', 'thal', 'fbs', 'restecg']
discrete_features_final = ['ca']

In [56]:
df.shape

(303, 14)

In [57]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,male,typical angina,145.0,233.0,True,2.0,150.0,no,2.3,downsloping,0.0,6.0,0
1,67.0,male,asymptomatic,160.0,286.0,False,2.0,108.0,yes,1.5,flat,3.0,3.0,1
2,67.0,male,asymptomatic,120.0,229.0,False,2.0,129.0,yes,2.6,flat,2.0,7.0,1
3,37.0,male,non-anginal pain,130.0,250.0,False,0.0,187.0,no,3.5,downsloping,0.0,3.0,0
4,41.0,female,atypical angina,130.0,204.0,False,2.0,172.0,no,1.4,upsloping,0.0,3.0,0


### Save Clean Data

In [58]:
#saving the dataframe 
df.to_csv('Data/heart_disease_clean.csv', index=None) 

### Data Prparation Pipeline

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from xgboost import XGBClassifier

In [60]:
t = [('cat', OneHotEncoder(drop='first', sparse=False), nominal_features_final), \
     ('discrete', StandardScaler(), discrete_features_final),
     ('power', PowerTransformer(), numeric_features_final),
     ('min_max', MinMaxScaler(), numeric_features_final)]

col_transform = ColumnTransformer(transformers=t)

pipeline = Pipeline(steps=[('prep', col_transform), ('m', RandomForestClassifier())])

In [61]:
X = df.drop("num", axis=1)
y = df["num"]

In [62]:
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('%.3f (%.3f)' % (np.mean(scores), np.std(scores)))

0.796 (0.072)
