## Previous Notebook in this Project: [02_EDA](https://github.com/omi-singh17/Cardio-Disease-Prediction/blob/main/02%20Exploratory%20Data%20Analysis/02_EDA.ipynb "Exploratory Data Anlysis")


# Imports

In [124]:
# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

#import pickle
import cloudpickle


import os

# Data Load

In [125]:
cvd = pd.read_csv('../Post EDA Data/cvd.csv')

In [126]:
cvd.head()

Unnamed: 0,gender,age,ethnicity,income,household_size,marital_status,bp_regularity,systolic_bp,diastolic_bp,bmi,...,total_cholesterol,diabetes_diagnosis,diabetes_diagnosis_age,kidney_fail,fam_heart_issues,cig_smoker,cig_quit,heart_issue,cig_quit_days_clean,bad_cholesterol
0,2.0,85.0,3.0,4.0,1.0,2.0,1.0,,,,...,,2.0,-99.0,2.0,2.0,2.0,-99.0,2.0,-99.0,
1,2.0,44.0,4.0,11.0,4.0,1.0,1.0,144.0,74.0,30.9,...,105.0,2.0,-99.0,2.0,1.0,2.0,-99.0,1.0,-99.0,66.0
2,1.0,70.0,3.0,11.0,2.0,1.0,1.0,138.0,60.0,24.74,...,147.0,1.0,63.0,2.0,2.0,2.0,-99.0,2.0,-99.0,88.0
3,1.0,73.0,3.0,12.0,2.0,1.0,1.0,130.0,68.0,30.63,...,186.0,2.0,-99.0,2.0,2.0,2.0,-99.0,2.0,-99.0,137.0
4,2.0,41.0,4.0,7.0,1.0,5.0,,,,,...,,2.0,-99.0,2.0,2.0,2.0,-99.0,2.0,-99.0,


# Data Cleaning

We start by doing some additional cleaning steps continuing from the first notebook. We handle categorical data and bring uniformity across features representing yes/no during this step.

In [127]:
df = cvd.copy()

In [128]:
df['heart_issue'] = df['heart_issue'].replace(2.0,0)

In [129]:
#df['gender'] = df['gender'].replace([1.0,2.0],['M','F'])


df['income'] = df['income'].replace([1.0,2.0,3.0,4.0,13.0],'< 20,000')
df['income'] = df['income'].replace([5.0,6.0,12.0],'20,000 to < 35,000')
df['income'] = df['income'].replace([7.0,8.0,9.0],'35,000 to < 65,000')
df['income'] = df['income'].replace([10.0,11.0,14.0],'65,000 to 100,000')
df['income'] = df['income'].replace([15.0],'100,000 <')



In [130]:
df['marital_status'] = df['marital_status'].replace([1.0,2.0,3.0,4.0,5.0,6.0],['Married','Widowed'
                                                                             ,'Divorced','Separated','Never Married','Living with Partner'])

df['ethnicity'] = df['ethnicity'].replace([1.0,2.0,3.0,4.0,5.0],['Mexcian American','Other Hispanic'
                                                                             ,'Non-Hispanic White','Non-Hispanic Black','Other or Multi'])

In [131]:
#df['diabetes_diagnosis'] = df['diabetes_diagnosis'].replace([1.0,2.0,3.0],['Yes','No','At Risk'])
df['diabetes_diagnosis_age'] = df['diabetes_diagnosis_age'].replace([666.0],0.0) #indicates person was less than 1 year old

#df['kidney_fail'] = df['kidney_fail'].replace([1.0,2.0],['Yes','No'])
#df['cig_smoker'] = df['cig_smoker'].replace([1.0,2.0],['Yes','No'])
df['cig_quit'] = df['cig_quit'].replace([1.0,2.0],2.0)
df['cig_quit'] = df['cig_quit'].replace([3.0],1.0)

In [132]:
df['fam_heart_issues'] = df['fam_heart_issues'].replace([2.0,7.0,9.0],0)

# Train-Test Split

Before proceeding with any data manipulation steps we split our data into train and test sets to make sure that test data is left untouched.

In [133]:
#Dividing data into features X and Label y
X = df.drop(['heart_issue'] , axis=1)
y = df['heart_issue']

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

# Imputation

This is where we impute missing values grouped by gender, using mode for categorical and median for numerical values.
Note: The cells have been commented out as individual steps have been combined into pipeline at end.

In [135]:
# def custom_imputer(df):
# ### Replcae missing values with mode and median for categorical and numerical values respectively, grouped by gender ###


#     cat_miss_cols = ['gender', 'ethnicity', 'income', 'household_size','marital_status', 'bp_regularity', 'diabetes_diagnosis','kidney_fail',  'fam_heart_issues', 'cig_smoker', 'cig_quit']


#     for col in cat_miss_cols:
#         #df[col].fillna(df.groupby('gender')[col].transform(lambda x:x.value_counts().index[0]), inplace=True)
#         df[col] = df[col].fillna(df.groupby('gender')[col].transform(lambda x:x.value_counts().index[0]))
        
        
        
#     cont_miss_cols = ['age','systolic_bp', 'diastolic_bp', 'bmi','waist_size', 'good_cholesterol', 'total_cholesterol', 'diabetes_diagnosis_age', 'cig_quit_days_clean','bad_cholesterol']

#     for col2 in cont_miss_cols:
#         df[col2] = df[col2].fillna(df.groupby('gender')[col2].transform('median'))
        
#     return df
        
        

In [136]:
# custom_imputation_transfromer = FunctionTransformer(custom_imputer)

In [137]:
# custom_imputation_transfromer.fit_transform(X_train)

# Encoding

We have both ordinal and non-ordinal data. For income brackets OrdinalEncoder has been used, while for ethnicity and marital_status OneHotEncoder has been used.

In [138]:
# incomes =['< 20,000', '20,000 to < 35,000', '35,000 to < 65,000', '65,000 to 100,000', '100,000 <']

# ordinal_enc = OrdinalEncoder(categories = [incomes])

In [139]:
# X_train['income'] = ordinal_enc.fit_transform(X_train[['income']])

In [140]:
# onehot_enc = OneHotEncoder(drop='first', sparse_output = False).set_output(transform = 'pandas')

In [141]:
# onehot_transform = onehot_enc.fit_transform(X_train[['ethnicity', 'marital_status']])

In [142]:
# X_train = pd.concat([X_train, onehot_transform], axis = 1).drop(columns = ['ethnicity', 'marital_status'])

# Scaling

We use StandardScaler to scale our data here as during EDA we noticed the spread and skewness in some of features. This scaling method while bringing uniformity to features ensures that spread is captured as well.

In [143]:
# std_scaler = StandardScaler()

In [144]:
# X_train = pd.DataFrame(std_scaler.fit_transform(X_train), columns = X_train.columns)

In [145]:
# X_train.head(5).T

# Pipeline

We create our preprocessing pipeline below, the above steps have been combined into pipeline for ease of use and reproducibility

In [146]:
from sklearn.base import BaseEstimator, TransformerMixin

class imputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        cat_miss_cols = ['gender', 'ethnicity', 'income', 'household_size','marital_status', 'bp_regularity', 'diabetes_diagnosis','kidney_fail',  'fam_heart_issues', 'cig_smoker', 'cig_quit']


        for col in cat_miss_cols:
        #df[col].fillna(df.groupby('gender')[col].transform(lambda x:x.value_counts().index[0]), inplace=True)
            X[col] = X[col].fillna(X.groupby('gender')[col].transform(lambda x:x.value_counts().index[0]))
        
        cont_miss_cols = ['age','systolic_bp', 'diastolic_bp', 'bmi','waist_size', 'good_cholesterol', 'total_cholesterol', 'diabetes_diagnosis_age', 'cig_quit_days_clean','bad_cholesterol']

        for col2 in cont_miss_cols:
            X[col2] = X[col2].fillna(X.groupby('gender')[col2].transform('median'))
        
        return X
        
    
class encoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        incomes =['< 20,000', '20,000 to < 35,000', '35,000 to < 65,000', '65,000 to 100,000', '100,000 <']
        ordinal_enc = OrdinalEncoder(categories = [incomes])
        X['income'] = ordinal_enc.fit_transform(X[['income']])
                
        onehot_enc = OneHotEncoder(drop='first', sparse_output = False).set_output(transform = 'pandas')
        onehot_transform = onehot_enc.fit_transform(X[['ethnicity', 'marital_status']])
        X = pd.concat([X , onehot_transform], axis = 1).drop(columns = ['ethnicity', 'marital_status'])
        
        return X
    
class scaler():
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        std_scaler = StandardScaler()
        X = pd.DataFrame(std_scaler.fit_transform(X), columns = X.columns)
        
        return X       
        
    

In [147]:
preprocessing_pipeline = Pipeline(steps=[
    ('gender_grouped_mode_and_median_imputer',imputer())
    , ('ordinal_and_onehot_encoder', encoder())
    , ('standard_scaler',scaler())
    ])

In [148]:
type(preprocessing_pipeline)

sklearn.pipeline.Pipeline

In [149]:
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)

In [150]:
X_train_transformed

Unnamed: 0,gender,age,income,household_size,bp_regularity,systolic_bp,diastolic_bp,bmi,waist_size,good_cholesterol,...,bad_cholesterol,ethnicity_Non-Hispanic Black,ethnicity_Non-Hispanic White,ethnicity_Other Hispanic,ethnicity_Other or Multi,marital_status_Living with Partner,marital_status_Married,marital_status_Never Married,marital_status_Separated,marital_status_Widowed
0,-1.035002,0.478358,1.050456,-0.702410,-0.164303,0.770005,1.945523,0.620986,1.154963,-0.762502,...,0.952701,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372
1,0.966181,0.256886,0.314348,-0.106857,-0.164303,-0.118809,0.159853,0.372786,-0.305283,0.065140,...,1.348559,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372
2,-1.035002,1.696456,-0.421759,-0.702410,-0.164303,-0.007707,0.159853,-0.791392,0.037164,-1.781139,...,0.730031,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,-1.026080,-0.472269,-0.183771,3.242832
3,0.966181,0.865934,-0.421759,-0.106857,-0.164303,0.214496,-5.684161,1.709818,2.169381,0.192470,...,-0.135907,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372
4,0.966181,-1.404157,-1.157866,-0.106857,-0.164303,-0.896520,0.809187,-0.255102,-0.337589,0.638124,...,0.334174,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23810,-1.035002,-1.071948,0.314348,-1.297964,-0.164303,-0.007707,0.159853,-0.185665,0.037164,-0.444178,...,-0.036943,1.922820,-0.866588,-0.329471,-0.337169,-0.293856,-1.026080,2.117436,-0.183771,-0.308372
23811,0.966181,-1.514893,-0.421759,-0.106857,-0.164303,-1.118723,-0.651816,-0.794347,-0.854490,-0.125854,...,-1.348221,1.922820,-0.866588,-0.329471,-0.337169,-0.293856,-1.026080,2.117436,-0.183771,-0.308372
23812,0.966181,0.699830,0.314348,-0.702410,-0.164303,-0.118809,0.322186,0.158565,0.476530,1.593096,...,-1.719337,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372
23813,-1.035002,-0.186059,1.786563,1.084250,-0.164303,-1.229825,-0.489482,-0.343745,0.121161,0.574459,...,1.224853,-0.520069,1.153951,-0.329471,-0.337169,-0.293856,0.974583,-0.472269,-0.183771,-0.308372


# Preprocessing Pipeline

In [151]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23815 entries, 0 to 23814
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   gender                              23815 non-null  float64
 1   age                                 23815 non-null  float64
 2   income                              23815 non-null  float64
 3   household_size                      23815 non-null  float64
 4   bp_regularity                       23815 non-null  float64
 5   systolic_bp                         23815 non-null  float64
 6   diastolic_bp                        23815 non-null  float64
 7   bmi                                 23815 non-null  float64
 8   waist_size                          23815 non-null  float64
 9   good_cholesterol                    23815 non-null  float64
 10  total_cholesterol                   23815 non-null  float64
 11  diabetes_diagnosis                  23815

In [152]:
X_train_transformed.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
gender,-1.035002,0.966181,-1.035002,0.966181,0.966181,-1.035002,0.966181,0.966181,0.966181,0.966181
age,0.478358,0.256886,1.696456,0.865934,-1.404157,1.364247,0.146149,1.696456,-0.241427,-0.352163
income,1.050456,0.314348,-0.421759,-0.421759,-1.157866,-0.421759,0.314348,-1.157866,1.050456,-0.421759
household_size,-0.70241,-0.106857,-0.70241,-0.106857,-0.106857,-1.297964,1.679803,-1.297964,1.08425,1.679803
bp_regularity,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303,-0.164303
systolic_bp,0.770005,-0.118809,-0.007707,0.214496,-0.89652,0.214496,-1.56313,0.325598,-0.674317,-0.563215
diastolic_bp,1.945523,0.159853,0.159853,-5.684161,0.809187,-0.81415,-0.327149,1.296189,-0.002481,-0.002481
bmi,0.620986,0.372786,-0.791392,1.709818,-0.255102,-0.949472,0.956352,1.089317,0.971126,0.372786
waist_size,1.154963,-0.305283,0.037164,2.169381,-0.337589,-0.49912,0.502375,-0.195441,0.857745,0.541143
good_cholesterol,-0.762502,0.06514,-1.781139,0.19247,0.638124,1.593096,-0.571508,-0.635172,-0.316848,0.3198


# Saving Data and Preprocessing Pipeline

In [153]:
# save the X and y files
datapath = '../Post Preprocessing'

datapath_X = os.path.join(datapath, 'X.csv')
datapath_y = os.path.join(datapath, 'y.csv')
X.to_csv(datapath_X, index=False)
y.to_csv(datapath_y, index=False)

In [154]:
#save the preprocessing pipeline for Modeling step

cloudpickle.dump(preprocessing_pipeline, open('../Post Preprocessing/preprocessing_pipeline', 'wb'))

## Next Notebook in this Project: [04_Modeling](https://github.com/omi-singh17/Cardio-Disease-Prediction/blob/main/04%20Modeling/04_Modeling.ipynb)