In [3]:
'''Data Discretization and Data Normalization. Use any suitable dataset (e.g. heart dataset https://www.kaggle.com/zhaoyingzhu/heartesv ). 
Perform following operations on given dataset suitable programming language. a) Find standard deviation, variance of every numerical attribute. b) Find covariance and perform Correlation analysis using Correlation coefficient. c) How many independent features are present in the given dataset? d) Can we identify unwanted features? e) Perform the data discretization using equi frequency binning method on age attribute f) Normalize RestBP, 
chol, and MaxHR attributes (considering above dataset) using min-max normalization, Z-score normalization, and decimal scaling normalization.'''
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\Asus\OneDrive\Desktop\Heart.csv")
print(df)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  target  
0         2   2     3       0  
1         0

In [21]:
#Standard Deviation and Variance
std_dev = df.std()
variance = df.var()
print("Standard Deviation:\n", std_dev)
print("\nVariance:\n", variance)

Standard Deviation:
 age          9.082101
sex          0.466011
cp           1.032052
trestbps    17.538143
chol        51.830751
fbs          0.356198
restecg      0.525860
thalach     22.905161
exang        0.469794
oldpeak      1.161075
slope        0.616226
ca           1.022606
thal         0.612277
target       0.498835
dtype: float64

Variance:
 age           82.484558
sex            0.217166
cp             1.065132
trestbps     307.586453
chol        2686.426748
fbs            0.126877
restecg        0.276528
thalach      524.646406
exang          0.220707
oldpeak        1.348095
slope          0.379735
ca             1.045724
thal           0.374883
target         0.248836
dtype: float64


In [23]:
#Covariance and Correlation
covariance = df.cov()
correlation = df.corr()
print("\nCovariance:\n", covariance)
print("\nCorrelation:\n", correlation)


Covariance:
                  age       sex        cp    trestbps         chol       fbs  \
age        82.484558 -0.416661 -0.643499   44.495902   100.585076  0.392433   
sex        -0.416661  0.217166 -0.023736   -0.463970    -4.780309  0.007475   
cp         -0.643499 -0.023736  1.065132    0.861714    -4.113774  0.034719   
trestbps   44.495902 -0.463970  0.861714  307.586453   111.967215  1.109042   
chol      100.585076 -4.780309 -4.113774  111.967215  2686.426748  0.245427   
fbs         0.392433  0.007475  0.034719    1.109042     0.245427  0.126877   
restecg    -0.555013 -0.014261  0.024108   -1.052324    -4.116703 -0.015769   
thalach   -82.903318 -0.469871  6.991618  -18.759131   -11.800494 -0.069897   
exang       0.413022  0.031014 -0.191168    0.557111     1.631991  0.004295   
oldpeak     2.214583  0.051993 -0.178821    3.934486     3.246794  0.002377   
slope      -0.944791 -0.008819  0.076137   -1.312832    -0.128964 -0.013147   
ca          2.566356  0.056357 -0.1910

In [27]:
#Number of Independent Features
num_features = df.shape[1]
print("\nNumber of independent features:", num_features)


Number of independent features: 14


In [35]:
#
numerical_columns = ['age', 'trestbps', 'chol', 'thalach']  
for col in numerical_columns:
    if col in df.columns:
        # Calculate mean
        mean = df[col].sum() / len(df[col])
        
        # Calculate variance
        variance = ((df[col] - mean) ** 2).sum() / len(df[col])
        
        # Calculate standard deviation
        std_dev = variance ** 0.5
        
        print(f"Column: {col}")
        print(f"Mean: {mean}")
        print(f"Variance: {variance}")
        print(f"Standard Deviation: {std_dev}\n")
    else:
        print(f"Column '{col}' not found in the dataset.")

Column: age
Mean: 54.43414634146342
Variance: 82.22615110053539
Standard Deviation: 9.067863645894516

Column: trestbps
Mean: 131.61170731707318
Variance: 306.53605806067816
Standard Deviation: 17.508171179785688

Column: chol
Mean: 246.0
Variance: 2659.190243902439
Standard Deviation: 51.56733698672483

Column: thalach
Mean: 149.11414634146342
Variance: 528.7469706127305
Standard Deviation: 22.994498703227485



In [5]:
# Z-Score Normalization
if 'chol' in df.columns:
    mean = df['chol'].sum() / len(df['chol'])
    std_dev = ((df['chol'] - mean) * 2).sum() * 0.5 / len(df['chol'])
    df['chol_zscore'] = (df['chol'] - mean) / std_dev
    print("Z-Score Normalized 'chol':\n", df['chol_zscore'].head())

Z-Score Normalized 'chol':
 0   -inf
1   -inf
2   -inf
3   -inf
4    inf
Name: chol_zscore, dtype: float64


In [7]:
# Decimal Scaling Normalization
if 'thalach' in df.columns:
    max_abs_value = df['thalach'].abs().max()
    scale = 10 ** (len(str(int(max_abs_value))))
    df['thalach_decimal'] = df['thalach'] / scale
    print("Decimal Scaling Normalized 'thalach':\n", df['thalach_decimal'].head())

Decimal Scaling Normalized 'thalach':
 0    0.168
1    0.155
2    0.125
3    0.161
4    0.106
Name: thalach_decimal, dtype: float64


In [9]:
# Min-Max Normalization
if 'trestbps' in df.columns:
    min_val = df['trestbps'].min()
    max_val = df['trestbps'].max()
    df['trestbps_minmax'] = (df['trestbps'] - min_val) / (max_val - min_val)
    print("Min-Max Normalized 'trestbps':\n", df['trestbps_minmax'].head())


Min-Max Normalized 'trestbps':
 0    0.292453
1    0.433962
2    0.481132
3    0.509434
4    0.415094
Name: trestbps_minmax, dtype: float64
