# Install Modules

In [16]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load Datasets

In [17]:
df = pd.read_csv('framingham_heart_disease.csv')
df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0


In [18]:
df.dropna(inplace=True)

In [19]:
X = df[['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
y = df['TenYearCHD']

# Split Data

In [20]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Build Model

In [25]:
dtc = DecisionTreeClassifier()

In [26]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [27]:
dtc.fit(x_train, y_train)

In [28]:
y_pred = dtc.predict(x_test)

# Metrix Analysis

In [29]:
confusion_matrix(y_test, y_pred)

array([[659, 121],
       [ 90,  44]], dtype=int64)

In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86       780
           1       0.27      0.33      0.29       134

    accuracy                           0.77       914
   macro avg       0.57      0.59      0.58       914
weighted avg       0.79      0.77      0.78       914



# Analysis

In [31]:
dtc.feature_importances_

array([0.01977787, 0.14310446, 0.04380903, 0.00801315, 0.04325371,
       0.00559482, 0.00233854, 0.00987184, 0.00244911, 0.16378239,
       0.11747545, 0.09941992, 0.12352005, 0.086618  , 0.13097165])

In [33]:
features = pd.DataFrame(dtc.feature_importances_, index = X.columns)

In [35]:
features

Unnamed: 0,0
male,0.019778
age,0.143104
education,0.043809
currentSmoker,0.008013
cigsPerDay,0.043254
BPMeds,0.005595
prevalentStroke,0.002339
prevalentHyp,0.009872
diabetes,0.002449
totChol,0.163782
