In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection, linear_model, metrics

In [4]:
# set pandas options for better viewing later
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


## Objective: 
Based on typically observed datapoints, predict if (an) individual(s) with varying characteristics is at risk for heart disease.

## Data Dictionary

- Age: age of the patient [years]
- Sex: sex of the patient [M: Male, F: Female] / [0: Female, 1: Male]
- ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
- RestingBP: resting blood pressure [mm Hg]
- Cholesterol: serum cholesterol [mm/dl]
- FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
- RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
- MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
- ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
- Oldpeak: oldpeak = ST [Numeric value measured in depression]
- ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
- HeartDisease: output class [1: heart disease, 0: Normal]

In [5]:
# import data
heart = pd.read_csv('C:/Users/Magnus/Desktop/School-Stuff/General-Assembly/GitBash/DS/unit-4_project/heart.csv')

In [6]:
# rename some columns for easier reference
heart = heart.rename(columns={
    'Age': 'age',
    'RestingBP': 'resting_bp',
    'Cholesterol': 'cholesterol',
    'FastingBS': 'fastingbs',
    'MaxHR': 'max_hr',
    'Oldpeak': 'oldpeak',
    'HeartDisease': 'heart_disease'
})

In [7]:
heart.head()

Unnamed: 0,age,Sex,ChestPainType,resting_bp,cholesterol,fastingbs,RestingECG,max_hr,ExerciseAngina,oldpeak,ST_Slope,heart_disease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
# create dummies for the codified non-ineger parameters
heart_sex_dummies = pd.get_dummies(heart['Sex'], prefix='sex')
heart_pain_dummies = pd.get_dummies(heart['ChestPainType'], prefix='pain_type')
heart_pain_ecg = pd.get_dummies(heart['RestingECG'], prefix='rest_ecg')
heart_pain_ex_angina = pd.get_dummies(heart['ExerciseAngina'], prefix='exercise_angina')
heart_pain_st_slope = pd.get_dummies(heart['ST_Slope'], prefix='st_slope')

In [9]:
# combine dummies into one dataframe
dummies = pd.concat([
    heart_sex_dummies, 
    heart_pain_dummies, 
    heart_pain_ecg, 
    heart_pain_ex_angina, 
    heart_pain_st_slope
], axis=1)

In [151]:
dummies.head(5)

Unnamed: 0,sex_F,sex_M,pain_type_ASY,pain_type_ATA,pain_type_NAP,pain_type_TA,rest_ecg_LVH,rest_ecg_Normal,rest_ecg_ST,exercise_angina_N,exercise_angina_Y,st_slope_Down,st_slope_Flat,st_slope_Up
0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [11]:
# combine original data with dummies
heart_dummies = pd.concat([heart, dummies], axis=1)
heart_dummies.head()

Unnamed: 0,age,Sex,ChestPainType,resting_bp,cholesterol,fastingbs,RestingECG,max_hr,ExerciseAngina,oldpeak,ST_Slope,heart_disease,sex_F,sex_M,pain_type_ASY,pain_type_ATA,pain_type_NAP,pain_type_TA,rest_ecg_LVH,rest_ecg_Normal,rest_ecg_ST,exercise_angina_N,exercise_angina_Y,st_slope_Down,st_slope_Flat,st_slope_Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [14]:
# some EDA looking at the numbers of the dummies
heart_dummies.groupby(by='heart_disease')[['sex_F', 'sex_M', 'pain_type_ASY', 'pain_type_ATA',
       'pain_type_NAP', 'pain_type_TA', 'rest_ecg_LVH', 'rest_ecg_Normal',
       'rest_ecg_ST', 'exercise_angina_N', 'exercise_angina_Y',
       'st_slope_Down', 'st_slope_Flat', 'st_slope_Up']].agg(['mean'])

Unnamed: 0_level_0,sex_F,sex_M,pain_type_ASY,pain_type_ATA,pain_type_NAP,pain_type_TA,rest_ecg_LVH,rest_ecg_Normal,rest_ecg_ST,exercise_angina_N,exercise_angina_Y,st_slope_Down,st_slope_Flat,st_slope_Up
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
heart_disease,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
0,0.34878,0.65122,0.253659,0.363415,0.319512,0.063415,0.2,0.65122,0.14878,0.865854,0.134146,0.034146,0.192683,0.773171
1,0.098425,0.901575,0.771654,0.047244,0.141732,0.03937,0.208661,0.561024,0.230315,0.377953,0.622047,0.096457,0.75,0.153543


In [15]:
# some EDA looking at the numbers of the non-dummies
heart_dummies.groupby(by='heart_disease')[['age', 'resting_bp',
                                           'cholesterol', 'fastingbs', 'max_hr',
                                          'oldpeak']].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,age,age,age,resting_bp,resting_bp,resting_bp,cholesterol,cholesterol,cholesterol,fastingbs,fastingbs,fastingbs,max_hr,max_hr,max_hr,oldpeak,oldpeak,oldpeak
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max
heart_disease,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0,50.55122,28,76,130.180488,80,190,227.121951,0,564,0.107317,0,1,148.15122,69,202,0.408049,-1.1,4.2
1,55.899606,31,77,134.185039,0,200,175.940945,0,603,0.334646,0,1,127.655512,60,195,1.274213,-2.6,6.2


In [16]:
# due to the over-representation of males with heart disease, I was curious about how much of the sample they represent
heart_dummies['Sex'].value_counts(normalize=True)

M    0.78976
F    0.21024
Name: Sex, dtype: float64

### Some observations:

55% of samples have heart_disease

Age Range is from 28 to 77, with no significant separation in average age

90% of heart disease cases are men despite making up only 79% of the samples

No significant separation in average resting Resting BP

heart disease cases typically have lower cholesterol

heart disease cases typically have lower max heart rates

33% of heart disease cases have fasting blood sugar over 120 mg/dl

77% of heart disease cases have ASY pain type

56% of heart disease cases have normal resting ECG

62%  of heart disease cases have Exercise Angina

Higher oldpeak positively associated with higher risk of heart disease

In [150]:
# We are trying to predict a binaric variable, so we will use a logistic regression
# I have used all of the parameters as features for my model
# for whatever reason, I had to increase max iterations to make the code work
logreg = LogisticRegression(max_iter=1000)

feature_cols = ['age', 'resting_bp','cholesterol', 'fastingbs', 'max_hr', 'oldpeak',
                'sex_F', 'sex_M', 'pain_type_ASY', 'pain_type_ATA', 'pain_type_NAP',
                'pain_type_TA', 'rest_ecg_LVH', 'rest_ecg_Normal', 'rest_ecg_ST',
                'exercise_angina_N', 'exercise_angina_Y', 'st_slope_Down', 'st_slope_Flat',
                'st_slope_Up'
               ]

X = heart_dummies[feature_cols]
y = heart_dummies['heart_disease']

# First I'll fit all of the data and see what results we get
logreg.fit(X,y)
pred = logreg.predict(X)
logreg.score(X,y)*100

87.25490196078431

## Observation:
With all of the data fitted, we have an accuracy of 87%

In [45]:
# Lets stack the predcitions and their probabilities against the actual data
heart_dummies['predict_prob'] = logreg.predict_proba(X)[:, 1]
heart_dummies['predict'] = logreg.predict(X)
heart_dummies[['heart_disease', 'predict', 'predict_prob']].head(10)

Unnamed: 0,heart_disease,predict,predict_prob
0,0,0,0.03493
1,1,0,0.254823
2,0,0,0.045156
3,1,1,0.839502
4,0,0,0.093832
5,0,0,0.030736
6,0,0,0.011728
7,0,0,0.061855
8,1,1,0.941827
9,0,0,0.012803


In [136]:
false_neg = heart_dummies[heart_dummies['heart_disease'] > heart_dummies['predict']]
false_neg.shape

(48, 28)

In [137]:
false_pos = heart_dummies[heart_dummies['heart_disease'] < heart_dummies['predict']]
false_pos.shape

(69, 28)

In [139]:
f'Percent false readings: {((false_neg.shape[0]+false_pos.shape[0])/heart_dummies.shape[0])*100} %'

'Percent false readings: 12.745098039215685 %'

In [140]:
# Going with a test-train-split:
logreg1 = LogisticRegression(max_iter=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

logreg1.fit(X_train, y_train);

In [152]:
# scoring the train-test-split model
y_pred = logreg1.predict(X_test)
logreg1.score(X_test, y_test)

0.8652173913043478

In [153]:
# Build a confusion matrix to send more potential false negative to coming back as positive
logit_simple = linear_model.LogisticRegression(max_iter=1000, C=1e9).fit(X_train, y_train)
logit_pred_proba = logit_simple.predict_proba(X_test)[:,1]
tn, fp, fn, tp = metrics.confusion_matrix(y_true=y_test, y_pred=logit_pred_proba > .3).ravel() 
(tn, fp, fn, tp)

(83, 15, 13, 119)

In [154]:
# percent of false readings
(fp+fn)/len(y_test)*100

12.173913043478262

In [155]:
# percent of 'true' readings
(tn+tp)/len(y_test)*100

87.82608695652175

In [118]:
print(np.array(['age', 'resting_bp', 'cholesterol', 'fastingbs', 'max_hr', 'oldpeak', 'sex f',
         'sex m', 'pain type asy', 'pain type ata', 'pain type nap', 'pain type ta',
         'rest ecg lvh', 'rest ecg normal', 'rest egc st', 'exercise angine n',
         'exercise andgina y', 'st slope down', 'st slope flat', 'st slope up']))

['age' 'resting_bp' 'cholesterol' 'fastingbs' 'max_hr' 'oldpeak' 'sex f'
 'sex m' 'pain type asy' 'pain type ata' 'pain type nap' 'pain type ta'
 'rest exg lvh' 'rest ecg normal' 'rest egc st' 'exercise angine n'
 'exercise andgina y' 'st slope down' 'st slop flat' 'st slope up']


In [135]:
me = [[31, 80, 160, 0, 190, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1]]
print(logit_simple.predict(me))
print(logit_simple.predict_proba(me))

[0]
[[0.75145737 0.24854263]]


## Final Thoughts:
Based on our findings with the confusion matrix possibly creating more false positives, I am personally not presently at risk for heart disease.

In the future, we could expand the number of observations and take in more health data such as:
- Is the patient a smoker?
- Do they exercise regularly?
- Body-mass Index
- etc.

Additionally, I could clean up the dummies to bring it closer to a binary. In particular:
- Sex could be 0=male 1=female
- Pain types could have three columns for the actual pains, with asymptomatic being all 0s
- same with resting ecg, st slop, exercise angina