Link to dataset: https://archive.ics.uci.edu/dataset/45/heart+disease

CSV file: https://github.com/trojrobert/Classification-of-heart-disease-uci-data-/blob/master/heart.csv

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer    # # sklearn: library for ML
from sklearn.preprocessing import StandardScaler    # to scale numeric features

# Collecting Data
# Read the heart-disease CSV into the working DataFrame `df`.
# NOTE(review): the absolute "/content/..." path looks like a Colab upload
# location -- confirm or adjust when running elsewhere.
data_path = "/content/heart.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Displayed as the cell output: (number of rows, number of columns).
df.shape

(303, 14)

In [None]:
# Preview the first five rows of the raw data (head() defaults to n=5).
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Cleaning Data
# Remove exact duplicate rows, keeping the first occurrence of each.
# drop_duplicates() returns a new DataFrame that is rebound to `df`, instead of
# mutating the existing frame with inplace=True. The original row labels are
# kept (the index is not reset).
df = df.drop_duplicates()

# Displayed as the cell output: shape after de-duplication.
df.shape

(302, 14)

In [None]:
# Mark zero readings as missing, but only in the columns that the imputation
# step fills (age, chol, thalach), where 0 cannot be a real measurement.
# Replacing 0 across the whole frame would also wipe legitimate zeros in the
# categorical codes (sex, cp, fbs, ...), in oldpeak and in the target label.
# NaN is used rather than pd.NA so the columns stay numeric instead of turning
# into object dtype, and NaN is the missing marker SimpleImputer looks for by
# default.
zero_invalid_cols = ["age", "chol", "thalach"]
df[zero_invalid_cols] = df[zero_invalid_cols].replace(0, float("nan"))

In [None]:
# Handle missing values
# ("imputer" is pronounced im-PYOO-ter.)

# strategy="mean": every missing entry is replaced by the mean of its column.
imputer = SimpleImputer(strategy="mean")

# Each column is fitted and transformed on its own. The double brackets
# df[[column]] select a one-column DataFrame (2-D), which is the input shape
# fit_transform expects.
for column in ("age", "chol", "thalach"):
    df[column] = imputer.fit_transform(df[[column]])

In [None]:
# Inspect the frame after imputation. A bare expression uses the notebook's
# rich table display (truncated to the first and last rows) instead of the
# plain-text dump produced by print().
df

      age  sex  cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0    1   3       145  233.0    1        0    150.0      0      2.3   
1    37.0    1   2       130  250.0    0        1    187.0      0      3.5   
2    41.0    0   1       130  204.0    0        0    172.0      0      1.4   
3    56.0    1   1       120  236.0    0        1    178.0      0      0.8   
4    57.0    0   0       120  354.0    0        1    163.0      1      0.6   
..    ...  ...  ..       ...    ...  ...      ...      ...    ...      ...   
298  57.0    0   0       140  241.0    0        1    123.0      1      0.2   
299  45.0    1   3       110  264.0    0        1    132.0      0      1.2   
300  68.0    1   0       144  193.0    1        1    141.0      0      3.4   
301  57.0    1   0       130  131.0    0        1    115.0      1      1.2   
302  57.0    0   1       130  236.0    0        0    174.0      0      0.0   

     slope  ca  thal  target  
0        0   0     1       1  
1

In [None]:
# Integration and Transformation
# One-hot encode the categorical features: each listed column is replaced by
# one indicator column per distinct value (e.g. sex -> sex_0, sex_1).
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise produce True/False, which would then be
# written to the exported CSV as text booleans.
categorical_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)


In [None]:
# Inspect the frame after one-hot encoding. A bare expression uses the
# notebook's rich table display instead of the plain-text dump from print().
df

      age  trestbps   chol  thalach  oldpeak  target  sex_0  sex_1  cp_0  \
0    63.0       145  233.0    150.0      2.3       1      0      1     0   
1    37.0       130  250.0    187.0      3.5       1      0      1     0   
2    41.0       130  204.0    172.0      1.4       1      1      0     0   
3    56.0       120  236.0    178.0      0.8       1      0      1     0   
4    57.0       120  354.0    163.0      0.6       1      1      0     1   
..    ...       ...    ...      ...      ...     ...    ...    ...   ...   
298  57.0       140  241.0    123.0      0.2       0      1      0     1   
299  45.0       110  264.0    132.0      1.2       0      0      1     0   
300  68.0       144  193.0    141.0      3.4       0      0      1     1   
301  57.0       130  131.0    115.0      1.2       0      0      1     1   
302  57.0       130  236.0    174.0      0.0       0      1      0     0   

     cp_1  ...  slope_2  ca_0  ca_1  ca_2  ca_3  ca_4  thal_0  thal_1  thal_2  \
0     

In [None]:
# Scale numeric features
# The continuous columns are standardised together; the one-hot indicator
# columns and the target are left as they are.
numeric_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
# Inspect the frame after scaling. A bare expression uses the notebook's rich
# table display instead of the plain-text dump from print().
df

          age  trestbps      chol   thalach   oldpeak  target  sex_0  sex_1  \
0    0.949794  0.764066 -0.261285  0.018826  1.084022       1      0      1   
1   -1.928548 -0.091401  0.067741  1.636979  2.118926       1      0      1   
2   -1.485726 -0.091401 -0.822564  0.980971  0.307844       1      1      0   
3    0.174856 -0.661712 -0.203222  1.243374 -0.209608       1      0      1   
4    0.285561 -0.661712  2.080602  0.587366 -0.382092       1      1      0   
..        ...       ...       ...       ...       ...     ...    ...    ...   
298  0.285561  0.478910 -0.106449 -1.161988 -0.727060       0      1      0   
299 -1.042904 -1.232023  0.338703 -0.768384  0.135360       0      0      1   
300  1.503322  0.707035 -1.035462 -0.374779  2.032684       0      0      1   
301  0.285561 -0.091401 -2.235438 -1.511859  0.135360       0      0      1   
302  0.285561 -0.091401 -0.203222  1.068439 -0.899544       0      1      0   

     cp_0  cp_1  ...  slope_2  ca_0  ca_1  ca_2  ca

In [None]:
# Save cleaned and transformed data to a new CSV file.
# index=False leaves the DataFrame's row labels out of the file, so only the
# feature and target columns are written.
output_path = "cleaned_heart_disease_data.csv"
df.to_csv(output_path, index=False)

In [None]:
# Preview the first five rows of the cleaned, encoded and scaled data
# (head() defaults to n=5).
df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.949794,0.764066,-0.261285,0.018826,1.084022,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-1.928548,-0.091401,0.067741,1.636979,2.118926,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
2,-1.485726,-0.091401,-0.822564,0.980971,0.307844,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
3,0.174856,-0.661712,-0.203222,1.243374,-0.209608,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
4,0.285561,-0.661712,2.080602,0.587366,-0.382092,1,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
