In [116]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from tensorboard.notebook import display

In [None]:
df = pd.read_csv('Titanic-Dataset.csv')
df.head()

In [None]:
df.duplicated().sum()

##### Handling missing values

In [None]:
df['Age'].isna().sum()

In [None]:
df.nunique()

In [None]:
plt.figure(figsize=(8,7))
sns.histplot(data=df,x=df['Age'],kde=True,bins=40)
plt.title('Age distribution of Passengers')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid()
plt.show()

In [None]:
# Age is right skewed, thus we will use median for filling missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
sns.histplot(data=df,x=df['Age'],kde=True,bins=40)

In [None]:
# Distribution of Embarked column
sns.countplot(data=df,x=df['Embarked'])
plt.title('Distribution of Embarked column')
plt.xlabel('Embarked')
plt.ylabel('Frequency')
plt.grid()
plt.show()

In [None]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
sns.countplot(data=df,x=df['Embarked'])

In [None]:
df.drop(columns=['Cabin'],inplace=True)

In [None]:
df.isna().sum()

##### Encoding

In [None]:
df = pd.read_csv('heart.csv')

In [None]:
categorical = []
numerical = []
for c in df.columns:
    if df[c].dtype == 'object':
        categorical.append(c)
    else:
        numerical.append(c)
categorical,numerical

In [None]:
# Categoricat feature exploration
for c in categorical:
    df[c].value_counts().plot(kind='bar')
    plt.title(f'Value counts for {c}')
    plt.xlabel(c)
    plt.ylabel('Counts')
    plt.grid()
    plt.tight_layout()
    plt.show()


In [None]:
# Label Encoding for binary categories(Sex,ExerciseAngina)
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df

In [None]:
# OneHot for the others
df_encoded = pd.get_dummies(df,columns=['ST_Slope','ChestPainType', 'RestingECG'],dtype=int)
df_encoded

##### Normalization

In [100]:
target = 'HeartDisease'
X = df_encoded.drop(columns=[target])
y = df_encoded[target]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [111]:
# Standard Scalling
scaled_sd = StandardScaler()
X_train_std = scaled_sd.fit_transform(X_train) # mean and std will only be calculated from train dataset
X_test_std = scaled_sd.transform(X_test) # mean and std will only be calculated from test dataset
X_test_std_df = pd.DataFrame(X_train_std,columns=X_train.columns,index=X_train.index)

In [112]:
display(X_test_std_df)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
155,0.239292,0.537019,1.183802,1.307314,1.877849,0.551672,1.182037,1.900458,-0.254981,0.962897,-0.851064,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,0.816002,-0.494088
362,0.239292,0.537019,1.183802,-1.878000,-0.532524,-1.486343,-0.845997,-0.834739,-0.254981,0.962897,-0.851064,-1.104244,-0.480384,1.885813,-0.213524,-0.505445,-1.225487,2.023931
869,0.558968,0.537019,0.913811,0.096522,1.877849,0.831400,-0.845997,0.624033,-0.254981,-1.038533,1.174999,-1.104244,-0.480384,1.885813,-0.213524,-0.505445,0.816002,-0.494088
101,-0.293501,0.537019,-0.166155,-0.210833,-0.532524,-1.446382,-0.845997,-0.834739,-0.254981,-1.038533,1.174999,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,0.816002,-0.494088
199,0.345851,-1.862131,-0.166155,0.990645,-0.532524,-1.526304,-0.845997,0.076994,-0.254981,0.962897,-0.851064,-1.104244,-0.480384,-0.530275,4.683304,-0.505445,0.816002,-0.494088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.613176,-1.862131,-0.706137,0.487701,-0.532524,-1.046771,-0.845997,-0.834739,-0.254981,-1.038533,1.174999,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,-1.225487,2.023931
270,-0.932852,0.537019,-0.706137,0.217601,-0.532524,0.152061,-0.845997,-0.834739,-0.254981,-1.038533,1.174999,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,0.816002,-0.494088
860,0.665526,0.537019,-0.166155,0.478387,-0.532524,0.311906,1.182037,0.441687,-0.254981,-1.038533,1.174999,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,0.816002,-0.494088
435,0.665526,0.537019,1.021807,-1.878000,-0.532524,-0.727083,1.182037,-0.834739,-0.254981,-1.038533,1.174999,0.905597,-0.480384,-0.530275,-0.213524,-0.505445,-1.225487,2.023931


In [113]:
# MinMax Scaling
min_max = MinMaxScaler()
X_train_min_max = min_max.fit_transform(X_train)
X_test_min_max = min_max.transform(X_test)
X_train_min_max_df = pd.DataFrame(X_train_min_max,columns=X_train.columns,index=X_train.index)

In [114]:
X_train_min_max_df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
155,0.562500,1.0,0.775,0.567164,1.0,0.674419,1.0,0.636364,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
362,0.562500,1.0,0.775,0.000000,0.0,0.279070,0.0,0.295455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
869,0.625000,1.0,0.750,0.351575,1.0,0.728682,0.0,0.477273,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
101,0.458333,1.0,0.650,0.296849,0.0,0.286822,0.0,0.295455,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
199,0.583333,0.0,0.650,0.510779,0.0,0.271318,0.0,0.409091,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.395833,0.0,0.600,0.421227,0.0,0.364341,0.0,0.295455,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
270,0.333333,1.0,0.600,0.373134,0.0,0.596899,0.0,0.295455,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.645833,1.0,0.650,0.419569,0.0,0.627907,1.0,0.454545,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
435,0.645833,1.0,0.760,0.000000,0.0,0.426357,1.0,0.295455,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [119]:
# Robust Scaler
rs = RobustScaler()
X_train_rs = rs.fit_transform(X_train)
X_test_rs = rs.transform(X_test)
X_train_rs_df = pd.DataFrame(X_train_rs,columns=X_train.columns,index = X_train.index)

In [120]:
X_train_rs_df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
155,0.153846,0.0,1.25,1.307479,1.0,0.324324,1.0,1.500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
362,0.153846,0.0,1.25,-2.481994,0.0,-1.054054,0.0,-0.375,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,-1.0,1.0
869,0.384615,0.0,1.00,-0.132964,1.0,0.513514,0.0,0.625,0.0,-1.0,1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0
101,-0.230769,0.0,0.00,-0.498615,0.0,-1.027027,0.0,-0.375,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,0.230769,-1.0,0.00,0.930748,0.0,-1.081081,0.0,0.250,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.461538,-1.0,-0.50,0.332410,0.0,-0.756757,0.0,-0.375,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0
270,-0.692308,0.0,-0.50,0.011080,0.0,0.054054,0.0,-0.375,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.461538,0.0,0.00,0.321330,0.0,0.162162,1.0,0.500,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.461538,0.0,1.10,-2.481994,0.0,-0.540541,1.0,-0.375,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0
