In [1]:
import pandas as pd
from pandas import DataFrame

# Relative path of the CSV file holding the raw cardio records.
file_path = 'cardio_train_dataset.csv'

# Parse the CSV into a DataFrame.
cardio_data_frame = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (the cell's displayed value).
cardio_data_frame.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,M,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,F,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,F,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,M,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,F,156,56.0,100,60,1,1,0,0,0,0


## Categorical Feature Mapping

We map the categorical `gender` feature to numerical values (`F` → 1, `M` → 2).

In [2]:
# Numeric codes assigned to the two gender labels.
conversion_dictionary = dict(F=1, M=2)

# Recode the "gender" column only; every other column is passed through as-is.
gender_recoding = {"gender": conversion_dictionary}
cardio_data_frame = cardio_data_frame.replace(to_replace=gender_recoding)

# Preview the recoded frame.
cardio_data_frame.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


## Data Cleaning and Feature Engineering


In [3]:
# One chained pass over the frame:
#   1. discard rows containing missing values,
#   2. remove the "id" column,
#   3. divide "age" by 365 and round to whole numbers,
#   4. add "bmi" = weight / (height / 100)^2, rounded to whole numbers.
cardio_data_frame = (
    cardio_data_frame
    .dropna()
    .drop(columns='id')
    .assign(
        age=lambda frame: (frame['age'] / 365).round(),
        bmi=lambda frame: (frame['weight'] / (frame['height'] / 100) ** 2).round(),
    )
)

cardio_data_frame.head()


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,2,168,62.0,110,80,1,1,0,0,1,0,22.0
1,55.0,1,156,85.0,140,90,3,1,0,0,1,1,35.0
2,52.0,1,165,64.0,130,70,3,1,0,0,0,1,24.0
3,48.0,2,169,82.0,150,100,1,1,0,0,1,1,29.0
4,48.0,1,156,56.0,100,60,1,1,0,0,0,0,23.0


# Feature Standardization

In [4]:
from sklearn import preprocessing

# Standardize the feature columns only. The binary target "cardio" is a class
# label, so it is excluded from scaling and copied through unchanged.
feature_columns = [column for column in cardio_data_frame.columns if column != 'cardio']

# Learn per-column mean and standard deviation from the feature columns.
scaler = preprocessing.StandardScaler().fit(cardio_data_frame[feature_columns])

# Keep the result as a labeled DataFrame (same index and column order as the
# input) instead of a bare ndarray, so columns stay addressable by name.
cardio_data_frame_std = pd.DataFrame(
    scaler.transform(cardio_data_frame[feature_columns]),
    columns=feature_columns,
    index=cardio_data_frame.index,
)
cardio_data_frame_std['cardio'] = cardio_data_frame['cardio']
cardio_data_frame_std = cardio_data_frame_std[list(cardio_data_frame.columns)]

# NOTE(review): the cells that follow build X and Y from the unscaled
# `cardio_data_frame`, so this standardized frame is not consumed by the
# models in this notebook — confirm whether that is intended.

In [5]:
# Columns used as model inputs (every column except the target "cardio").
cardio_features = [
    'age', 'gender', 'height', 'weight',
    'ap_hi', 'ap_lo',
    'cholesterol', 'gluc',
    'smoke', 'alco', 'active',
    'bmi',
]

# Feature matrix: all rows, restricted to the selected columns.
X = cardio_data_frame.loc[:, cardio_features]

# Summary statistics of the features (the cell's displayed value).
X.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.338686,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,27.548886
std,6.765294,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,6.099019
min,30.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,3.0
25%,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,24.0
50%,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,26.0
75%,58.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,30.0
max,65.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,299.0


### Target Variable (Y)
 - Presence or absence of cardiovascular disease | Target Variable | cardio | binary

In [6]:
# Target vector: the "cardio" column of the cleaned frame.
Y = cardio_data_frame['cardio']

# Summary statistics of the target (the cell's displayed value).
Y.describe()

count    70000.000000
mean         0.499700
std          0.500003
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: cardio, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split

# Split features and target into training and validation parts.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
(trainX, validationX,
 trainY, validationY) = train_test_split(X, Y, random_state=0)

 # 1. Decision Tree Model Implementation

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier; the fixed random_state gives the same tree on every run.
cardio_decision_tree_model = DecisionTreeClassifier(random_state=1)

# Train on the training split. `fit` is the last expression so the fitted
# estimator is what the cell displays.
cardio_decision_tree_model.fit(X=trainX, y=trainY)

DecisionTreeClassifier(random_state=1)

### Mean Absolute Error

In [9]:
# from sklearn.metrics import mean_absolute_error, roc_curve, roc_auc_score

# valPredictions = cardio_decision_tree_model.predict(validationX)
# mean_absolute_error(validationY, valPredictions)



 # 2. KNN Model Implementation

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier built with the library's default settings.
cardio_decision_knn_model = KNeighborsClassifier()

# Train on the training split. `fit` is the last expression so the fitted
# estimator is what the cell displays.
cardio_decision_knn_model.fit(X=trainX, y=trainY)


KNeighborsClassifier()

### Mean Absolute Error

In [11]:
# valPredictions = cardio_decision_knn_model.predict(validationX)
# mean_absolute_error(validationY, valPredictions)

 # 3. SVM Model Implementation

In [12]:
from sklearn.svm import SVC

# Support-vector classifier created with the library's default settings.
cardio_decision_svc_model = SVC()

# Training and prediction are left disabled in this cell, so the model is
# only constructed here and remains unfitted.
# cardio_decision_svc_model.fit(trainX, trainY)

# cardio_decision_svc_model.predict(validationX)


### Mean Absolute Error

In [13]:
# valPredictions = cardio_decision_svc_model.predict(validationX)
# mean_absolute_error(validationY, valPredictions)

 # 4. NN Model Implementation

In [14]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (25 and 14 units), an L2
# penalty of 1e-5, and a fixed seed for repeatable weight initialisation.
cardio_decision_nn_model = MLPClassifier(
    hidden_layer_sizes=(25, 14),
    alpha=1e-5,
    random_state=5,
)

# Train on the training split. `fit` is the last expression so the fitted
# estimator is what the cell displays.
cardio_decision_nn_model.fit(X=trainX, y=trainY)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(25, 14), random_state=5)

### Mean Absolute Error

In [15]:
# valPredictions = cardio_decision_nn_model.predict(validationX)
# mean_absolute_error(validationY, valPredictions)


 # 5. Logistic Regression Model Implementation

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the SAGA solver, seeded for repeatable results.
cardio_decision_lr_model = LogisticRegression(solver='saga', random_state=1)

# Train on the training split. `fit` is the last expression so the fitted
# estimator is what the cell displays.
cardio_decision_lr_model.fit(X=trainX, y=trainY)



LogisticRegression(random_state=1, solver='saga')

### Mean Absolute Error

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt
import numpy as np

# Class predictions of the logistic regression model on the validation split.
valPredictions = cardio_decision_lr_model.predict(validationX)

# Re-create the train/validation split (random_state=0) so this cell carries
# its own inputs.
trainX, validationX, trainY, validationY = train_test_split(X, Y, random_state=0)

# Classifiers compared on the ROC plot. Only estimators that expose
# `predict_proba` are listed.
models = [cardio_decision_lr_model,
          cardio_decision_nn_model,
          cardio_decision_knn_model,
          cardio_decision_tree_model]

# Collect one record per classifier and build the table once afterwards.
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call, so it is not used here.
roc_records = []
for cls in models:
    model = cls.fit(trainX, trainY)
    # Probability of the positive class (second column of predict_proba).
    yproba = model.predict_proba(validationX)[:, 1]

    fpr, tpr, _ = roc_curve(validationY, yproba)
    auc = roc_auc_score(validationY, yproba)

    roc_records.append({'classifiers': cls.__class__.__name__,
                        'fpr': fpr,
                        'tpr': tpr,
                        'auc': auc})

# Result table indexed by classifier name.
result_table = pd.DataFrame(
    roc_records, columns=['classifiers', 'fpr', 'tpr', 'auc']
).set_index('classifiers')

fig = plt.figure(figsize=(8, 6))

# One ROC curve per classifier, labelled with its AUC.
for classifier_name, row in result_table.iterrows():
    plt.plot(row['fpr'],
             row['tpr'],
             label="{}, AUC={:.3f}".format(classifier_name, row['auc']))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()




ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/umairsaeed/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-72d1d63fc2f0>", line 45, in <module>
    model = cls.fit(trainX, trainY)
  File "/Users/umairsaeed/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 179, in fit
    return self._fit(X, y)
  File "/Users/umairsaeed/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 503, in _fit
    self._tree = KDTree(X, self.leaf_size,
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/umairsaeed/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

Duri

TypeError: object of type 'NoneType' has no len()

### Model Comparison / Accuracy Score

In [None]:
from matplotlib import pyplot as plt

# Display name -> estimator for every classifier being compared.
models = {"Decision tree": cardio_decision_tree_model,
          "NN": cardio_decision_nn_model,
          "KNN": cardio_decision_knn_model,
          "SVM": cardio_decision_svc_model,
          "Logistic Regression": cardio_decision_lr_model}

scores = {}

# Fit each model and record its validation accuracy. The score is stored
# inside the loop so that every model gets an entry, not only the last one.
for key, model in models.items():
    model.fit(trainX, trainY)
    scores[key] = model.score(validationX, validationY)

# One row per model, best accuracy first.
scores_frame = (
    pd.DataFrame(scores, index=["Accuracy Score"]).T
    .sort_values(by="Accuracy Score", ascending=False)
)
print(scores_frame)

# Bar chart of the accuracies, drawn with matplotlib so the cell does not
# depend on a plotting library that is never imported in this notebook.
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(scores_frame.index, scores_frame["Accuracy Score"])
ax.set_ylabel("Accuracy Score")
plt.xticks(rotation=45)
plt.show()

### AUC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt
import numpy as np

# Classifiers compared on the ROC plot. Only estimators that expose
# `predict_proba` are listed.
models = [cardio_decision_lr_model,
          cardio_decision_nn_model,
          cardio_decision_knn_model,
          cardio_decision_tree_model]

# Train the models and record the results.
# `models` is a list, so it is iterated directly (a list has no `.items()`).
# Records are gathered in a plain list and turned into a DataFrame once,
# because `DataFrame.append` was removed in pandas 2.0.
roc_records = []
for cls in models:
    model = cls.fit(trainX, trainY)
    # Probability of the positive class (second column of predict_proba).
    yproba = model.predict_proba(validationX)[:, 1]

    fpr, tpr, _ = roc_curve(validationY, yproba)
    auc = roc_auc_score(validationY, yproba)

    roc_records.append({'classifiers': cls.__class__.__name__,
                        'fpr': fpr,
                        'tpr': tpr,
                        'auc': auc})

# Result table indexed by classifier name.
result_table = pd.DataFrame(
    roc_records, columns=['classifiers', 'fpr', 'tpr', 'auc']
).set_index('classifiers')

fig = plt.figure(figsize=(8, 6))

# One ROC curve per classifier, labelled with its AUC.
for classifier_name, row in result_table.iterrows():
    plt.plot(row['fpr'],
             row['tpr'],
             label="{}, AUC={:.3f}".format(classifier_name, row['auc']))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()