In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#source of data: https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/
heart_data = pd.read_csv('processed.cleveland.data.csv')

In [21]:
heart_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [22]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             303 non-null    object 
 12  thal           303 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [23]:
#looking for null values in dataset
heart_data[heart_data.isnull().any(axis=1)]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease


In [24]:
#summary statistics, describe method of df
heart_data.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,2.0


In [25]:
#ca is listed as object, and there are 5 unique values but it ranges from 0 - 3
heart_data.ca.unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [26]:
#looking at the unique values under thal column
heart_data.thal.unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [27]:
#change the ? values to something that python can recognize as missing
#np.nan using numpy library here, replacing all the ? to nan 
heart_data=heart_data.replace('?', np.nan)

In [28]:
#changing ca column
#changing the datatype
heart_data.ca=heart_data.ca.astype('float')

In [11]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(12), int64(1), object(1)
memory usage: 33.3+ KB


In [38]:
# any thing coded in number but it is categorical
# we are going to change the numbers to text so we understand cp variables better 
# cp: chest pain type
# - Value 1: typical angina
# - Value 2: atypical angina
# - Value 3: non-anginal pain
# - Value 4: asymptomatic
# using a dictionary to do this so key:value pair
# you should use inplace=True if you want current df to reflect changes you made with replace
# or you can do heart_data.cp = heart_data.cp.replace..
heart_data.cp.replace({1.0:'typical angina', 2.0:'atypical angina', 3.0:'non-anginal pain', 4.0:'asymptomatic'}, inplace=True)
#heart_data['cp'].dtypes

In [39]:
heart_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [41]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    object 
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [43]:
#top means what is the most in table
heart_data.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
unique,,,4,,,,,,,,,,3.0,
top,,,asymptomatic,,,,,,,,,,3.0,
freq,,,144,,,,,,,,,,166.0,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,,2.0


In [45]:
heart_data.slope.replace({1.0: 'upsloping', 2.0:'flat', 3.0:'downsloping'}, inplace=True)

In [49]:
# telling python this is a categorical 
# dual variable type saved
heart_data.slope = pd.Categorical(heart_data.slope, ['upsloping', 'flat', 'downplaying'], ordered=True)

In [50]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,flat,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,flat,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,,0.0,3.0,0
4,41.0,0.0,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,upsloping,0.0,3.0,0


In [51]:
heart_data.slope.cat.codes

0     -1
1      1
2      1
3     -1
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Length: 303, dtype: int8

In [52]:
heart_data.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,282,299.0,301.0,303.0
unique,,,4,,,,,,,,2,,3.0,
top,,,asymptomatic,,,,,,,,upsloping,,3.0,
freq,,,144,,,,,,,,142,,166.0,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,,0.672241,,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,,0.937438,,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,,0.0,,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,,0.0,,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,,0.0,,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,,1.0,,2.0


In [None]:
# 