In [1]:
#****************************************
# This Jupyter notebook was created for the project, Heart Disease Prediction with Neural Networks,
# from the Udemy course, Applied Machine Learning in Healthcare, found at:
#     https://www.udemy.com/course/applied-machine-learning-for-healthcare
#
# The dataset used is the processed Cleveland heart disease data set
# located in the UCI repository at
#    http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
#
#In this project we complete the following activities 
# 1. load libraries
# 2. import the data
# 3. Review the data
# 4. For missing data, indicated by a '?', replace the value with a NaN
# 5. Remove the dataframe rows that contain a NaN value (goes from 303 to 297 rows)
# 6. Transform the data to numeric to enable further data analysis (2 of the columns were
#          type object because of the '?' values that marked missing data in the
#          original dataset import)
#
#
#
#

#*****************************************************************

# import libraries and check versions
import sys  # Python
import pandas as pd # Use Pandas for data handling and importing
import numpy as np
import sklearn # machine learning tools
import matplotlib # for graphing
import keras # deep learning API to build the neural network

# Report the interpreter and library versions this notebook ran against.
# Each template is paired with its version value and filled in through str.format.
for fmt, ver in (
    ('Python: {}', sys.version),
    ('Pandas: {}', pd.__version__),
    ('Numpy: {}', np.__version__),
    ('Sklearn: {}', sklearn.__version__),
    ('Matplotlib: {}', matplotlib.__version__),
    ('Keras: {}', keras.__version__),
):
    print(fmt.format(ver))

# Python 2 equivalents of the print calls above (print-statement syntax), kept for reference:
#print 'Python: {}'.format(sys.version)
#print 'Pandas: {}'.format(pd.__version__)
#print 'Numpy: {}'.format(np.__version__)
#print 'Sklearn: {}'.format(sklearn.__version__)
#print 'Matplotlib: {}'.format(matplotlib.__version__)
#print 'Keras: {}'.format(keras.__version__)






Using Theano backend.


Python: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Pandas: 0.25.1
Numpy: 1.16.5
Sklearn: 0.21.3
Matplotlib: 3.1.1
Keras: 2.2.4


In [2]:
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix


In [6]:
# Location of the processed Cleveland heart-disease data in the UCI repository
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Column labels for the pandas DataFrame, held in a python list
# (taken from the file, heart-disease.names, in the UCI repository)
names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'class',
]

# Load the csv file; header=None spells out what pandas already infers when
# column names are supplied, so every line of the file is read as data
cleveland = pd.read_csv(url, header=None, names=names)

# Show the (rows, columns) shape, then the record stored under index label 1
print('Shape of DataFrame: {}'.format(cleveland.shape))
print(cleveland.loc[1])

Shape of DataFrame: (303, 14)
age          67
sex           1
cp            4
trestbps    160
chol        286
fbs           0
restecg       2
thalach     108
exang         1
oldpeak     1.5
slope         2
ca          3.0
thal        3.0
class         2
Name: 1, dtype: object


In [7]:
# Display every column for the rows from index label 280 through the end of the frame
cleveland.loc[280:, :]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class
280,57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2
281,47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
282,55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3
283,35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0
284,61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2
285,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4
286,58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
288,56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0
289,56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0


In [8]:
# Flag every cell that holds the missing-data marker "?"
is_missing = cleveland.isin(['?'])
# Keep the unflagged values as they are; each flagged cell becomes NaN.
# No rows are dropped in this step.
data = cleveland.where(~is_missing)
data.loc[280:, :]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class
280,57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2
281,47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
282,55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3
283,35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0
284,61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2
285,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4
286,58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0
288,56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0
289,56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0


In [9]:
# Discard every row of the DataFrame that contains at least one NaN value
data = data.dropna(axis='index', how='any')
# In the output, the index label of a dropped row is simply absent: the remaining
# rows keep their original labels and are not renumbered to close the gap.
data.loc[280:, :]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class
280,57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2
281,47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
282,55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3
283,35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0
284,61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2
285,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4
286,58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2
288,56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0
289,56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0
290,67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,1


In [11]:
# Report the DataFrame's dimensions, then list the data type of each column
n_rows, n_cols = data.shape
print('Shape of DataFrame: {}'.format((n_rows, n_cols)))
data.dtypes

Shape of DataFrame: (297, 14)


age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca           object
thal         object
class         int64
dtype: object

In [12]:
# The dtype listing from the previous cell reports ca and thal as type object; that is
# because those columns held '?' values when the data was originally imported
#
# Convert each column to a numeric dtype to enable further data analysis
data = data.apply(pd.to_numeric, axis='index')
data.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
class         int64
dtype: object