# Label Encoding, One Hot Encoding, Scaling

In [1]:
# Previous
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Loading the dataset
df = pd.read_csv('../dataset/heart disease classification dataset.csv')

df1 = df.copy()

In [2]:
# Dropping Null Values
df1_dropped = df1.dropna()
print("After dropping null values, shape:", df1_dropped.shape)

After dropping null values, shape: (293, 15)


In [3]:
# Finding number of outliers (for 'oldpeak' column)
Q1 = df1['oldpeak'].quantile(0.25)
Q3 = df1['oldpeak'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df1[(df1['oldpeak'] < lower_bound) | (df1['oldpeak'] > upper_bound)]
print(f"Number of outliers in 'oldpeak': {len(outliers)}")

Number of outliers in 'oldpeak': 5


In [4]:
# Removing Outliers
df1_no_outliers = df1[(df1['oldpeak'] >= lower_bound) & (df1['oldpeak'] <= upper_bound)]

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [6]:
print("Label Encoding:")
# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Apply label encoding to the 'sex' column
df['sex'] = label_encoder.fit_transform(df['sex'])
print("After label encoding 'sex' column:")
print(df['sex'].head())  # Display the first few values of the encoded column


Label Encoding:
After label encoding 'sex' column:
0    1
1    1
2    0
3    1
4    0
Name: sex, dtype: int64


In [7]:
print("One Hot Encoding:")
# Perform one-hot encoding on the 'target' column
# The drop_first=True option drops the first category to avoid multicollinearity
df_encoded = pd.get_dummies(df, columns=['target'], drop_first=True)
print("After one-hot encoding 'target' column:")
print(df_encoded.head())  # Display the first few rows of the encoded DataFrame


One Hot Encoding:
After one-hot encoding 'target' column:
   Unnamed: 0  age  sex  cp  trestbps   chol  fbs  restecg  thalach  exang  \
0           0   63    1   3     145.0  233.0    1        0    150.0      0   
1           1   37    1   2     130.0  250.0    0        1    187.0      0   
2           2   41    0   1     130.0  204.0    0        0    172.0      0   
3           3   56    1   1     120.0  236.0    0        1    178.0      0   
4           4   57    0   0       NaN  354.0    0        1    163.0      1   

   oldpeak  slope  ca  thal  target_yes  
0      2.3      0   0     1        True  
1      3.5      0   0     2        True  
2      1.4      2   0     2        True  
3      0.8      2   0     2        True  
4      0.6      2   0     2        True  


In [9]:
print("Scaling:")
# Select only numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Create a MinMaxScaler instance
min_max_scaler = MinMaxScaler()

# Apply MinMax scaling only to numerical columns
df[numerical_cols] = min_max_scaler.fit_transform(df[numerical_cols])

print("After scaling numerical columns:")
print(df.head())  # Display the first few rows of the scaled DataFrame


Scaling:
After scaling numerical columns:
   Unnamed: 0       age  sex        cp  trestbps      chol  fbs  restecg  \
0    0.000000  0.708333  1.0  1.000000  0.481132  0.244292  1.0      0.0   
1    0.003311  0.166667  1.0  0.666667  0.339623  0.283105  0.0      0.5   
2    0.006623  0.250000  0.0  0.333333  0.339623  0.178082  0.0      0.0   
3    0.009934  0.562500  1.0  0.333333  0.245283  0.251142  0.0      0.5   
4    0.013245  0.583333  0.0  0.000000       NaN  0.520548  0.0      0.5   

    thalach  exang   oldpeak  slope   ca      thal target  
0  0.603053    0.0  0.370968    0.0  0.0  0.333333    yes  
1  0.885496    0.0  0.564516    0.0  0.0  0.666667    yes  
2  0.770992    0.0  0.225806    1.0  0.0  0.666667    yes  
3  0.816794    0.0  0.129032    1.0  0.0  0.666667    yes  
4  0.702290    1.0  0.096774    1.0  0.0  0.666667    yes  


In [10]:
df

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.000000,0.708333,1.0,1.000000,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,0.00,0.333333,yes
1,0.003311,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,0.00,0.666667,yes
2,0.006623,0.250000,0.0,0.333333,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,1.0,0.00,0.666667,yes
3,0.009934,0.562500,1.0,0.333333,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,1.0,0.00,0.666667,yes
4,0.013245,0.583333,0.0,0.000000,,0.520548,0.0,0.5,0.702290,1.0,0.096774,1.0,0.00,0.666667,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.986755,0.583333,0.0,0.000000,0.433962,0.262557,0.0,0.5,0.396947,1.0,0.032258,0.5,0.00,1.000000,no
299,0.990066,0.333333,1.0,1.000000,0.150943,0.315068,0.0,0.5,0.465649,0.0,0.193548,0.5,0.00,1.000000,no
300,0.993377,0.812500,1.0,0.000000,0.471698,0.152968,1.0,0.5,0.534351,0.0,0.548387,0.5,0.50,1.000000,no
301,0.996689,0.583333,1.0,0.000000,,0.011416,0.0,0.5,0.335878,1.0,0.193548,0.5,0.25,1.000000,no
