In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [6]:
# The raw file ships without a header row, so columns get positional labels 0..N-1
df = pd.read_csv(
    'data/raw/processed.cleveland.data',
    header=None,
)

In [9]:
# Report (rows, columns), then preview the first five records
print(f"Dataset shape: {df.shape}")
df.head(n=5)

Dataset shape: (303, 14)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [12]:
# Human-readable labels for the 14 positional columns, listed in file order
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
df.columns = columns
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [15]:
# Replace the '?' placeholder with NaN so every column can be parsed as numeric
df = df.replace('?', np.nan)

# Continuous numeric
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

# Categorical: parse the values as numbers first, then cast to category dtype.
# NOTE(review): pd.to_numeric does not force an integer dtype, so columns that
# contain NaN ('ca', 'thal') presumably end up with float-valued categories.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal','target']
for col in categorical_cols:
    df[col] = pd.to_numeric(df[col]).astype('category')

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() echoed a stray "None" after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       303 non-null    float64 
 1   sex       303 non-null    category
 2   cp        303 non-null    category
 3   trestbps  303 non-null    float64 
 4   chol      303 non-null    float64 
 5   fbs       303 non-null    category
 6   restecg   303 non-null    category
 7   thalach   303 non-null    float64 
 8   exang     303 non-null    category
 9   oldpeak   303 non-null    float64 
 10  slope     303 non-null    category
 11  ca        299 non-null    category
 12  thal      301 non-null    category
 13  target    303 non-null    category
dtypes: category(9), float64(5)
memory usage: 16.0 KB
None


In [16]:
# Per-column count of NaN entries (isna is the alias of isnull)
print(df.isna().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


In [18]:
# Boolean mask that is True for every row holding at least one NaN
has_missing = df.isna().any(axis='columns')
missing_rows = df.loc[has_missing]

print("\nRows with missing values:")
print(f"Total rows with missing values: {missing_rows.shape[0]}")
print(missing_rows)



Rows with missing values:
Total rows with missing values: 6
      age sex cp  trestbps   chol fbs restecg  thalach exang  oldpeak slope  \
87   53.0   0  3     128.0  216.0   0       2    115.0     0      0.0     1   
166  52.0   1  3     138.0  223.0   0       0    169.0     0      0.0     1   
192  43.0   1  4     132.0  247.0   1       2    143.0     1      0.1     2   
266  52.0   1  4     128.0  204.0   1       0    156.0     1      1.0     2   
287  58.0   1  2     125.0  220.0   0       0    144.0     0      0.4     2   
302  38.0   1  3     138.0  175.0   0       0    173.0     0      0.0     1   

      ca thal target  
87     0  NaN      0  
166  NaN    3      0  
192  NaN    7      1  
266    0  NaN      2  
287  NaN    7      0  
302  NaN    3      0  


In [20]:
# Keep only the rows in which every column is populated
df_clean = df.dropna(axis=0, how='any')

# Summarise how many rows the filter removed
n_before = len(df)
n_after = len(df_clean)
n_dropped = n_before - n_after
print(f"Original dataset: {n_before} rows")
print(f"Clean dataset: {n_after} rows")
print(f"Dropped: {n_dropped} rows ({n_dropped/n_before*100:.1f}%)")

Original dataset: 303 rows
Clean dataset: 297 rows
Dropped: 6 rows (2.0%)


In [21]:
# Sanity check: total NaN count across the whole cleaned frame
remaining_nans = df_clean.isna().sum().sum()
print(f"Missing values remaining: {remaining_nans}")

Missing values remaining: 0


In [22]:
# Save clean dataset (row index is not written)
output_path = Path('data/processed/heart_disease_clean.csv')
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise the save fails on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)