In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load the dataset
df = pd.read_csv('heart.csv')

# Columns that are screened for outliers
numeric_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# A value counts as an outlier when it lies this many standard deviations
# (or more) away from its column mean.
Z_THRESHOLD = 3

# Absolute Z-score of every value, computed column by column (axis=0).
# scipy's zscore divides by the population standard deviation (ddof=0).
# nan_policy='omit' ignores missing values when computing each column's mean
# and standard deviation. With the default ('propagate') a single NaN would
# turn the whole column's Z-scores into NaN, and the comparison below would
# then silently discard every row.
z_scores = np.abs(zscore(df[numeric_cols], nan_policy='omit'))

# True for rows whose Z-scores are all below the threshold.
# NaN compares as False, so a row with a missing value in any screened
# column is dropped as well.
mask = (z_scores < Z_THRESHOLD).all(axis=1)

# Keep only the non-outlier rows and renumber them from 0
df_clean = df[mask].reset_index(drop=True)

# Sanity check: exactly the rows selected by the mask were kept
assert len(df_clean) == int(mask.sum())

# Display the shapes of the original and cleaned datasets
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_clean.shape)

# Preview the cleaned dataset
print(df_clean.head())



Original dataset shape: (918, 12)
Cleaned dataset shape: (899, 12)
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [2]:
# Inspect the absolute Z-scores computed in the cell above: a NumPy array with
# one row per record in df and one column per entry in numeric_cols.
# NOTE: a later cell reassigns z_scores (signed values, as a DataFrame), so
# what this cell shows depends on which cell ran last.
z_scores

array([[1.4331398 , 0.41090889, 0.82507026, 1.38292822, 0.83243239],
       [0.47848359, 1.49175234, 0.17196105, 0.75415714, 0.10566353],
       [1.75135854, 0.12951283, 0.7701878 , 1.52513802, 0.83243239],
       ...,
       [0.37009972, 0.12951283, 0.62016778, 0.85706875, 0.29328271],
       [0.37009972, 0.12951283, 0.34027522, 1.4615246 , 0.83243239],
       [1.64528563, 0.30282455, 0.21769643, 1.42222641, 0.83243239]],
      shape=(918, 5))

In [3]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('heart.csv')


# Columns that are screened for outliers
numeric_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# A value counts as an outlier when it lies this many standard deviations
# (or more) away from its column mean.
Z_THRESHOLD = 3

# Step 1: Manually calculate the Z-scores
# Per-column mean and standard deviation (pandas skips missing values here).
mean = df[numeric_cols].mean()
# ddof=0 gives the population standard deviation, which is what
# scipy.stats.zscore uses by default. pandas' own default (ddof=1) yields
# slightly smaller Z-scores and could disagree with the scipy-based filter
# for values sitting right at the threshold.
std_dev = df[numeric_cols].std(ddof=0)

# Z-score formula: (value - mean) / std_dev
z_scores = (df[numeric_cols] - mean) / std_dev

# Step 2: Build a mask that is True for rows to keep, i.e. rows whose
# Z-scores all lie strictly between -Z_THRESHOLD and +Z_THRESHOLD.
# NaN compares as False, so rows with a missing value in any screened
# column are dropped as well.
mask = (z_scores.abs() < Z_THRESHOLD).all(axis=1)

# Step 3: Filter the DataFrame using the mask and renumber the rows from 0
df_clean = df[mask].reset_index(drop=True)

# Sanity check: exactly the rows selected by the mask were kept
assert len(df_clean) == int(mask.sum())

# Step 4: Show before and after
print("Original shape:", df.shape)
print("Cleaned shape:", df_clean.shape)
print(df_clean.head())


Original shape: (918, 12)
Cleaned shape: (899, 12)
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [6]:
# Show the raw values of the columns that were screened for outliers
# (pandas abbreviates the display of long frames).
# NOTE(review): this cell's execution count (6) is higher than that of the
# cell after it (5), so the cells were run out of order; re-run the notebook
# top to bottom before relying on these outputs.
df[numeric_cols]

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,40,140,289,172,0.0
1,49,160,180,156,1.0
2,37,130,283,98,0.0
3,48,138,214,108,1.5
4,54,150,195,122,0.0
...,...,...,...,...,...
913,45,110,264,132,1.2
914,68,144,193,141,3.4
915,57,130,131,115,1.2
916,57,130,236,174,0.0


In [5]:
# Per-column means that the manual Z-score calculation subtracts from each value.
mean

Age             53.510893
RestingBP      132.396514
Cholesterol    198.799564
MaxHR          136.809368
Oldpeak          0.887364
dtype: float64