In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import LinearSVC


In [2]:
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [3]:
path = '../../raw_data/CVD_cleaned.csv'

data = pd.read_csv(path)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [5]:
data.drop_duplicates(inplace=True)

In [6]:
data.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [7]:
data['Checkup'].value_counts()

Checkup
Within the past year       239295
Within the past 2 years     37210
Within the past 5 years     17442
5 or more years ago         13420
Never                        1407
Name: count, dtype: int64

In [8]:
data['Age_Category'].value_counts()

Age_Category
65-69    33425
60-64    32409
70-74    31099
55-59    28048
50-54    25090
80+      22269
40-44    21587
45-49    20963
75-79    20699
35-39    20598
18-24    18670
30-34    18425
25-29    15492
Name: count, dtype: int64

In [9]:
data.head(2)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0


In [10]:
# Preprocess Age_Category

data['Age_Category'].value_counts().sort_values()

def age_process(x) :
    
    if x in ['25-29', '18-24', '30-34'] :
        
        return 'young'
        
    elif x in ['35-39', '40-44', '45-49', '50-54', '55-59'] :
        
        return 'adult'
        
    else :
        
        return 'old'

In [11]:
data['age_category'] = data['Age_Category'].map(lambda x : age_process(x))

In [12]:
data.drop(columns=['Age_Category', 'Checkup', 'BMI'], inplace=True)

In [13]:
data.head()

Unnamed: 0,General_Health,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Height_(cm),Weight_(kg),Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,age_category
0,Poor,No,No,No,No,No,No,Yes,Female,150.0,32.66,Yes,0.0,30.0,16.0,12.0,old
1,Very Good,No,Yes,No,No,No,Yes,No,Female,165.0,77.11,No,0.0,30.0,0.0,4.0,old
2,Very Good,Yes,No,No,No,No,Yes,No,Female,163.0,88.45,No,4.0,12.0,3.0,16.0,old
3,Poor,Yes,Yes,No,No,No,Yes,No,Male,180.0,93.44,No,0.0,30.0,30.0,8.0,old
4,Good,No,No,No,No,No,No,No,Male,191.0,88.45,Yes,0.0,8.0,4.0,0.0,old


In [14]:
X = data.drop(columns=['Heart_Disease'],axis=1)
y = data[['Heart_Disease']]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

 We have three kinds of columns : 
 
 - Numerical, I'll use RobustScaler because of outliers
 
 - Categorical Binary (Yes/No) I'll use OneHotEncoder
 
 - Categorical Nominal with an inherent order, I'll use OrdinalEncoder

In [16]:
data.columns

Index(['General_Health', 'Exercise', 'Heart_Disease', 'Skin_Cancer',
       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',
       'Height_(cm)', 'Weight_(kg)', 'Smoking_History', 'Alcohol_Consumption',
       'Fruit_Consumption', 'Green_Vegetables_Consumption',
       'FriedPotato_Consumption', 'age_category'],
      dtype='object')

In [17]:
ord_cols = ['General_Health', 'age_category']
cat_cols = ['Smoking_History','Exercise', 'Skin_Cancer','Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex']

In [18]:
General_Health_sorted = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
age_category_sorted = ['young', 'adult', 'old']

In [19]:
robust_enc = RobustScaler()
num_cols = X_train.select_dtypes(exclude=['object']).columns
X_train.select_dtypes(exclude=['object'])

Unnamed: 0,Height_(cm),Weight_(kg),Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
70831,175.0,65.77,4.0,30.0,12.0,8.0
65402,150.0,107.50,3.0,60.0,0.0,1.0
129329,191.0,90.72,0.0,30.0,4.0,4.0
140174,160.0,52.16,0.0,15.0,16.0,4.0
153049,160.0,58.97,0.0,60.0,8.0,8.0
...,...,...,...,...,...,...
133006,178.0,120.20,15.0,4.0,4.0,15.0
8443,178.0,74.84,0.0,30.0,4.0,0.0
210281,185.0,68.04,5.0,16.0,4.0,4.0
23049,168.0,56.70,0.0,60.0,16.0,2.0


In [20]:
ordinal_enc = OrdinalEncoder(categories=[General_Health_sorted , age_category_sorted],
              handle_unknown='use_encoded_value',
              unknown_value=-1)

X_train[['General_Health', 'age_category']] = ordinal_enc.fit_transform(X_train[['General_Health', 'age_category']])
X_test[['General_Health', 'age_category']] = ordinal_enc.transform(X_test[['General_Health', 'age_category']])



In [21]:
data.select_dtypes(exclude=['object'])

Unnamed: 0,Height_(cm),Weight_(kg),Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,150.0,32.66,0.0,30.0,16.0,12.0
1,165.0,77.11,0.0,30.0,0.0,4.0
2,163.0,88.45,4.0,12.0,3.0,16.0
3,180.0,93.44,0.0,30.0,30.0,8.0
4,191.0,88.45,0.0,8.0,4.0,0.0
...,...,...,...,...,...,...
308849,168.0,81.65,4.0,30.0,8.0,0.0
308850,180.0,69.85,8.0,15.0,60.0,4.0
308851,157.0,61.23,4.0,40.0,8.0,4.0
308852,183.0,79.38,3.0,30.0,12.0,0.0


In [24]:
cat_columns = X_train.select_dtypes(include=['object']).columns
X_train.select_dtypes(include=['object'])

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Smoking_History
70831,Yes,No,No,No,No,No,Male,No
65402,No,No,No,No,"No, pre-diabetes or borderline diabetes",No,Female,No
129329,Yes,No,No,No,No,No,Male,Yes
140174,No,No,No,No,No,No,Female,Yes
153049,Yes,No,No,Yes,No,No,Female,No
...,...,...,...,...,...,...,...,...
133006,Yes,No,No,No,No,No,Male,Yes
8443,Yes,No,No,Yes,Yes,Yes,Female,Yes
210281,Yes,No,No,No,No,No,Male,No
23049,Yes,No,No,No,No,Yes,Male,No


In [25]:
ohe = OneHotEncoder(drop='if_binary', sparse=False, handle_unknown='ignore')
X_train[ohe.get_feature_names_out()] = ohe.fit_transform(X_train[cat_columns])
X_test[ohe.get_feature_names_out()] = ohe.transform(X_test[cat_columns])


df[ohe.get_feature_names_out()] = ohe.fit_transform(df[cat_columns])
X_test[ohe.get_feature_names_out()] = ohe.transform(X_test[cat_columns])



In [26]:
X_train = X_train.drop(columns=cat_columns)
X_test = X_test.drop(columns=cat_columns)

In [28]:
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2, min_samples_leaf=1)

In [30]:
y_train.value_counts()
dictio = {'Yes':1,'No':0}
y_train = y_train.replace(dictio)
y_test = y_test.replace(dictio)

In [31]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5)

In [32]:
scores

{'fit_time': array([0.20850468, 0.20364451, 0.20444727, 0.2069006 , 0.20870638]),
 'score_time': array([0.04149938, 0.04271269, 0.04182577, 0.04168153, 0.04249859]),
 'test_accuracy': array([0.91933656, 0.91963542, 0.92002869, 0.91968169, 0.91912649]),
 'test_precision': array([0.5026178 , 0.53125   , 0.57635468, 0.54651163, 0.48230088]),
 'test_recall': array([0.02752294, 0.03411697, 0.03354358, 0.02694954, 0.03125   ]),
 'test_f1': array([0.05218809, 0.06411638, 0.06339745, 0.05136612, 0.05869682]),
 'test_roc_auc': array([0.80877619, 0.81612625, 0.80898495, 0.81440559, 0.8176553 ])}