In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [49]:
# Load the heart disease data (the Cleveland dataset as provided by Codecademy).
# NOTE: this is an absolute, machine-specific location; adjust it when running elsewhere.
DATA_PATH = r'/Users/mahinbindra/Downloads/processed.cleveland.data.csv'
df = pd.read_csv(DATA_PATH)

- age: age in years
- sex: 1=male, 0=female
- cp: chest pain type
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic
- trestbps: resting blood pressure (in mm Hg on admission to the hospital)
- chol: serum cholesterol in mg/dl
- fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg: resting electrocardiographic results
    - Value 0: normal
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- thalach: maximum heart rate achieved in an exercise test
- exang: exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
    - Value 1: upsloping
    - Value 2: flat
    - Value 3: downsloping
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal:
    - Value 3: normal
    - Value 6: fixed defect
    - Value 7: reversible defect
- heart_disease: diagnosis of heart disease (angiographic disease status)
    - Value 0: < 50% diameter narrowing
    - Value 1: > 50% diameter narrowing "[This field] refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0)."

In [50]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [51]:
# Descriptive statistics for every column. include='all' reports both the numeric
# summary (count, mean, std, min, quartiles) and the summary for non-numeric columns
# (unique, top = most frequent value, freq = how often it occurs).
df.describe(include = 'all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,2.0


In [52]:
# Inspect non-null counts and dtypes per column. Columns reported as `object`
# were not parsed as numbers and deserve a closer look.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             303 non-null    object 
 12  thal           303 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [53]:
# Distinct raw values of `ca`, to see why the column was read as object dtype.
df['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [54]:
# Distinct raw values of `thal`, the other object-typed column.
df['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [55]:
# Rows where `ca` holds the '?' placeholder instead of a number.
df.loc[df['ca'] == '?']

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [56]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df2 = df.copy(deep=True)

In [57]:
# Turn the '?' placeholder into a real missing value (NaN) in the two affected columns.
# Alternative: df2.replace('?', np.nan) would do the same across the entire dataset.
for column in ('ca', 'thal'):
    df2[column] = df2[column].replace('?', np.nan)

In [58]:
# '?' should no longer be among the distinct values of `ca`
# (use 'thal' instead of 'ca' to check the other column).
df2['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', nan], dtype=object)

In [59]:
# Verify the replacement: the non-null counts of `ca` and `thal` should now be
# lower than the number of rows, while their dtype is still object.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    object 
 12  thal           301 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [62]:
# With the placeholder gone, convert both columns from strings to floats
# (float rather than int because NaN must be representable).
df2 = df2.astype({'ca': float, 'thal': float})

In [63]:
# Confirm that `ca` and `thal` are now float64.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    float64
 13  heart_disease  303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [64]:
# Display only the rows that have a missing value in at least one column.
rows_with_nan = df2.isna().any(axis='columns')
df2.loc[rows_with_nan]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


- cp: chest pain type
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic

In [73]:
# Distinct chest-pain codes before they are mapped to labels.
df2['cp'].unique()

array([1., 4., 3., 2.])

In [None]:
# Map the numeric chest-pain codes to readable labels and assign the result back to
# the column. The previous form, `df2.cp.replace(..., inplace=True)`, mutates a
# temporary Series: recent pandas versions warn about this chained in-place pattern,
# and with Copy-on-Write enabled the DataFrame is left unchanged. Explicit assignment
# behaves the same on every pandas version.
# NOTE: the labels keep the notebook's existing spelling ("angine") so that recorded
# outputs and any later cells matching on these strings stay consistent.
cp_labels = {1.0: 'typical angine', 2.0: 'atypical angine', 3.0: 'non-anginal pain', 4.0: 'asymptomatic'}
df2['cp'] = df2['cp'].replace(cp_labels)

In [79]:
# Confirm that `cp` now holds string labels (object dtype) with no values lost.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    object 
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    float64
 13  heart_disease  303 non-null    int64  
dtypes: float64(12), int64(1), object(1)
memory usage: 33.3+ KB


In [80]:
# Distinct chest-pain values after the mapping; only labels should remain.
df2['cp'].unique()

array(['typical angine', 'asymptomatic', 'non-anginal pain',
       'atypical angine'], dtype=object)

In [81]:
# Preview the first five rows with the labelled `cp` column.
df2.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angine,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,atypical angine,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [82]:
# Summary statistics again: `cp` is now summarised by unique/top/freq instead of
# numerically, and `ca`/`thal` get numeric statistics computed over non-missing values.
df2.describe(include = 'all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
unique,,,4,,,,,,,,,,,
top,,,asymptomatic,,,,,,,,,,,
freq,,,144,,,,,,,,,,,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0


In [83]:
# Distinct slope codes before they are mapped to labels.
df2['slope'].unique()

array([3., 2., 1.])

In [87]:
# Map the numeric slope codes to readable labels and assign the result back to the
# column. The previous form, `df2.slope.replace(..., inplace=True)`, mutates a
# temporary Series: recent pandas versions warn about this chained in-place pattern,
# and with Copy-on-Write enabled the DataFrame is left unchanged. Explicit assignment
# behaves the same on every pandas version.
slope_labels = {1.0: 'upsloping', 2.0: 'flat', 3.0: 'downsloping'}
df2['slope'] = df2['slope'].replace(slope_labels)

In [91]:
# Distinct slope values after the mapping; only labels should remain.
df2['slope'].unique()

array(['downsloping', 'flat', 'upsloping'], dtype=object)

In [92]:
# Store `slope` as an ordered categorical so the labels carry the explicit order
# upsloping < flat < downsloping.
df2['slope'] = pd.Categorical(
    df2['slope'],
    categories=['upsloping', 'flat', 'downsloping'],
    ordered=True,
)

In [94]:
# Preview the first five rows with the categorical `slope` column.
df2.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angine,145.0,233.0,1.0,2.0,150.0,0.0,2.3,downsloping,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,flat,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,flat,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,downsloping,0.0,3.0,0
4,41.0,0.0,atypical angine,130.0,204.0,0.0,2.0,172.0,0.0,1.4,upsloping,0.0,3.0,0


In [95]:
# Integer codes behind the categorical: each code is the label's position in the
# category order defined for `slope`.
df2['slope'].cat.codes

0      2
1      1
2      1
3      2
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Length: 303, dtype: int8

In [96]:
# Confirm that `slope` is now stored with the category dtype.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   age            303 non-null    float64 
 1   sex            303 non-null    float64 
 2   cp             303 non-null    object  
 3   trestbps       303 non-null    float64 
 4   chol           303 non-null    float64 
 5   fbs            303 non-null    float64 
 6   restecg        303 non-null    float64 
 7   thalach        303 non-null    float64 
 8   exang          303 non-null    float64 
 9   oldpeak        303 non-null    float64 
 10  slope          303 non-null    category
 11  ca             299 non-null    float64 
 12  thal           301 non-null    float64 
 13  heart_disease  303 non-null    int64   
dtypes: category(1), float64(11), int64(1), object(1)
memory usage: 31.3+ KB


In [98]:
# Summary statistics after the slope conversion: `slope` is now summarised by
# unique/top/freq rather than by numeric statistics.
df2.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303,299.0,301.0,303.0
unique,,,4,,,,,,,,3,,,
top,,,asymptomatic,,,,,,,,upsloping,,,
freq,,,144,,,,,,,,142,,,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,,0.672241,4.734219,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,,0.937438,1.939706,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,,0.0,3.0,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,,0.0,3.0,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,,0.0,3.0,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,,1.0,7.0,2.0


In [99]:
# Re-check dtypes and non-null counts of the cleaned frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   age            303 non-null    float64 
 1   sex            303 non-null    float64 
 2   cp             303 non-null    object  
 3   trestbps       303 non-null    float64 
 4   chol           303 non-null    float64 
 5   fbs            303 non-null    float64 
 6   restecg        303 non-null    float64 
 7   thalach        303 non-null    float64 
 8   exang          303 non-null    float64 
 9   oldpeak        303 non-null    float64 
 10  slope          303 non-null    category
 11  ca             299 non-null    float64 
 12  thal           301 non-null    float64 
 13  heart_disease  303 non-null    int64   
dtypes: category(1), float64(11), int64(1), object(1)
memory usage: 31.3+ KB
