In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the raw CSV with default parsing. Missing entries in this file appear as the
# literal string '?', so they are read as ordinary text rather than NaN.
heart = pd.read_csv('heart.csv')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


Many '?' placeholders are visible in the first rows. The analysis below checks which columns can be salvaged and which, if any, should be dropped.

In [3]:
# Summary statistics; describe() covers only the numeric columns by default.
heart.describe()

Unnamed: 0,age,sex,cp,oldpeak,num
count,294.0,294.0,294.0,294.0,294.0
mean,47.826531,0.72449,2.982993,0.586054,0.360544
std,7.811812,0.447533,0.965117,0.908648,0.480977
min,28.0,0.0,1.0,0.0,0.0
25%,42.0,0.0,2.0,0.0,0.0
50%,49.0,1.0,3.0,0.0,0.0
75%,54.0,1.0,4.0,1.0,1.0
max,66.0,1.0,4.0,5.0,1.0


In [4]:
# Column dtypes and non-null counts for the raw frame.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         294 non-null    int64  
 1   sex         294 non-null    int64  
 2   cp          294 non-null    int64  
 3   trestbps    294 non-null    object 
 4   chol        294 non-null    object 
 5   fbs         294 non-null    object 
 6   restecg     294 non-null    object 
 7   thalach     294 non-null    object 
 8   exang       294 non-null    object 
 9   oldpeak     294 non-null    float64
 10  slope       294 non-null    object 
 11  ca          294 non-null    object 
 12  thal        294 non-null    object 
 13  num         294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


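Checking for missing data. Note that `isna()` only detects real NaN values, so the '?' placeholder strings are not reported as missing here.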

In [24]:
# Report, per column, whether pandas sees any NaN entries.
# `cols` is kept as a plain list of column names because other cells iterate over it.
cols = heart.columns.tolist()
for col in cols:
    has_nan = bool(heart[col].isna().any())
    print(col, has_nan)

age False
sex False
cp False
trestbps False
chol False
fbs False
restecg False
thalach False
exang False
oldpeak False
slope False
ca False
thal False
num        False


Counting the values in each column to see whether anything stands out

In [25]:
def _tally(series):
    """Return a dict mapping each distinct value in `series` to its occurrence count.

    Keys are inserted in order of first occurrence.
    """
    seen = {}
    for item in series:
        if item in seen:
            seen[item] += 1
        else:
            seen[item] = 1
    return seen


# Per-column frequency tables keyed by column name; one summary line is printed per column.
value_counts = {}
for col in cols:
    counts = _tally(heart[col])
    value_counts[col] = counts
    print(col, ':', counts)

age : {28: 1, 29: 2, 30: 1, 31: 2, 32: 4, 33: 2, 34: 4, 35: 5, 36: 5, 37: 8, 38: 7, 39: 11, 40: 7, 41: 11, 42: 7, 43: 12, 44: 7, 45: 8, 46: 13, 47: 10, 48: 19, 49: 15, 50: 12, 51: 9, 52: 17, 53: 12, 54: 25, 55: 15, 56: 10, 57: 5, 58: 9, 59: 8, 60: 2, 61: 2, 62: 2, 63: 1, 65: 3, 66: 1}
sex : {1: 213, 0: 81}
cp : {2: 106, 1: 11, 3: 54, 4: 123}
trestbps : {'130': 54, '120': 65, '140': 50, '170': 5, '100': 6, '105': 1, '110': 21, '125': 8, '150': 23, '98': 1, '112': 3, '145': 5, '190': 1, '160': 20, '115': 2, '142': 1, '180': 6, '132': 1, '135': 5, '?': 1, '108': 1, '124': 2, '113': 1, '122': 2, '92': 1, '118': 2, '106': 1, '200': 1, '138': 1, '136': 1, '128': 1, '155': 1}
chol : {'132': 1, '243': 2, '?': 23, '237': 4, '219': 3, '198': 3, '225': 3, '254': 2, '298': 2, '161': 1, '214': 2, '220': 3, '160': 3, '167': 1, '308': 3, '264': 3, '166': 1, '340': 1, '209': 2, '260': 4, '211': 4, '173': 1, '283': 1, '194': 2, '223': 3, '315': 1, '275': 5, '297': 3, '292': 3, '182': 3, '200': 2, '204'

Picking out columns that have more than half of the values set to '?'

In [37]:
# Select the columns in which the '?' placeholder makes up more than half of the rows.
# The threshold is derived from the frame itself rather than a hard-coded row count,
# so the cell stays correct if the input file changes size.
half_of_rows = len(heart) / 2
columns_drop = [
    col for col in cols
    # .get() treats a column that never contains '?' as having zero placeholders
    if value_counts[col].get('?', 0) > half_of_rows
]
columns_drop

['slope', 'ca', 'thal']

Creating a new DataFrame from `heart` without the columns selected for dropping

In [38]:
# Build the working frame without the columns listed in `columns_drop`.
# drop() returns a new DataFrame, so `heart` itself is left untouched.
dataset = heart.drop(columns=columns_drop)
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130,132,0,2,185,0,0.0,0
1,29,1,2,120,243,0,0,160,0,0.0,0
2,29,1,2,140,?,0,0,170,0,0.0,0
3,30,0,1,170,237,0,1,170,0,0.0,0
4,31,0,2,100,219,0,1,150,0,0.0,0


In [39]:
# The label column's name carries trailing spaces, so it must be addressed
# with the padded name exactly as it appears in the frame.
dataset['num       ']

0      0
1      0
2      0
3      0
4      0
      ..
289    1
290    1
291    1
292    1
293    1
Name: num       , Length: 294, dtype: int64

Renaming the target column (its name carries trailing whitespace) to `target`

In [41]:
# The label column is stored with trailing spaces in its name; map it to 'target'.
column_renames = {'num       ': 'target'}
dataset = dataset.rename(columns=column_renames)
dataset

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,28,1,2,130,132,0,2,185,0,0.0,0
1,29,1,2,120,243,0,0,160,0,0.0,0
2,29,1,2,140,?,0,0,170,0,0.0,0
3,30,0,1,170,237,0,1,170,0,0.0,0
4,31,0,2,100,219,0,1,150,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
289,52,1,4,160,331,0,0,94,1,2.5,1
290,54,0,3,130,294,0,1,100,1,0.0,1
291,56,1,4,155,342,1,0,150,1,3.0,1
292,58,0,2,180,393,0,0,110,1,1.0,1
