In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [101]:
health = pd.read_csv('healthcare_datasets.csv')

In [102]:
health.head()

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84,Male,95,157,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40,Male,163,285,
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72,Male,NAN,168,
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39,Male,118,288,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28,Female,128,229,Diabetes


In [103]:
# Condition "None" is imported as "NaN" in the pandas dataframe

In [104]:
health.shape

(3021, 6)

## Removing Duplicates

In [105]:
# total number of duplicates
no_of_duplicates = int(health.duplicated().sum())
no_of_duplicates

20

In [106]:
health = health.drop_duplicates()
health.shape

(3001, 6)

In [107]:
# removed 20 duplicate rows

## Handling Missing Values

In [108]:
# Patient_ID is not required to train the model
health = health.drop(columns = ['Patient_ID'])

In [109]:
health.head(15)

Unnamed: 0,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,84,Male,95,157,Hypertension
1,40,Male,163,285,
2,72,Male,NAN,168,
3,39,Male,118,288,Diabetes
4,28,Female,128,229,Diabetes
5,60,Male,91,156,
6,18,Female,117,119,
7,45,Female,101,170,Hypertension
8,77,Female,99,278,Hypertension
9,90,Male,120,133,


In [110]:
list(health['Condition'].unique())

['Hypertension', nan, 'Diabetes']

In [111]:
health['Condition'].fillna('No Condition', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  health['Condition'].fillna('No Condition', inplace = True)


In [112]:
# there are missing values with "NAN" which is not considered as null

In [113]:
health.replace("NAN", np.nan, inplace = True)

In [114]:
health.isnull().sum()

Age               2
Gender            4
Blood_Pressure    3
Cholesterol       2
Condition         0
dtype: int64

In [115]:
int(health.isnull().sum().sum())

11

In [116]:
#percentage_missing = (health.isnull().sum().sum() / health.shape[0]) * 100
#float(percentage_missing)

In [117]:
# we need to impute the missing values

## Data Splitting

In [118]:
x = health.drop(columns = ['Condition'])
y = health['Condition']

In [119]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 21)

In [120]:
x_train.shape

(2400, 4)

In [121]:
x_test.shape

(601, 4)

In [122]:
num_columns = ['Age', 'Blood_Pressure', 'Cholesterol']
cat_columns = ['Gender']

In [123]:
#imputing the numeric columns with median
# imputing the categorical columns with mode

In [124]:
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy = 'median')
x_train[num_columns] = num_imputer.fit_transform(x_train[num_columns])
x_test[num_columns] = num_imputer.transform(x_test[num_columns])

cat_imputer = SimpleImputer(strategy = 'most_frequent')
x_train[cat_columns] = cat_imputer.fit_transform(x_train[cat_columns])
x_test[cat_columns] = cat_imputer.transform(x_test[cat_columns])

In [125]:
x_train.shape

(2400, 4)

In [126]:
#check for missing values
x_train.isnull().sum()

Age               0
Gender            0
Blood_Pressure    0
Cholesterol       0
dtype: int64

In [127]:
x_test.isnull().sum()

Age               0
Gender            0
Blood_Pressure    0
Cholesterol       0
dtype: int64

## Handling Outliers

In [128]:
# handling outliers using IQR method
# calculating the IQR bounds
for col in num_columns:
    q1 = x_train[col].quantile(0.25)
    q3 = x_train[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # capping the outliers in the training set
    x_train[col] = np.clip(x_train[col], lower_bound, upper_bound)
    # capping the outliers in the test set
    x_test[col] = np.clip(x_test[col], lower_bound, upper_bound)

## Data Transformation

In [129]:
# checking skweness of the numeric columns
for col in num_columns:
    print(f"{col}: skewness = {x_train[col].skew()}")

Age: skewness = 0.005263680140173695
Blood_Pressure: skewness = -0.03304568058052449
Cholesterol: skewness = 0.0766066698333047


In [130]:
# skewness is close to 0
# so there is no need to transform the data

## Data Scaling

In [131]:
# we will scale the data now using StandardScaler

In [132]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train[num_columns] = scaler.fit_transform(x_train[num_columns])
x_test[num_columns] = scaler.transform(x_test[num_columns])

In [133]:
x_train.head(10)

Unnamed: 0,Age,Gender,Blood_Pressure,Cholesterol
157,-0.628928,Female,-1.40643,0.830976
1905,-1.154544,Female,1.441341,1.418183
543,1.234621,Female,0.926684,1.176392
1492,0.709005,Female,0.446337,-1.500579
804,-1.584594,Male,1.647204,1.280017
1063,0.804571,Female,-0.582978,-0.308895
4,-1.250111,Female,-0.102631,0.537373
2005,-1.727944,Female,-0.72022,1.193663
2256,1.282404,Female,0.343405,-1.586933
1186,1.616887,Male,-1.646603,1.262746


In [134]:
x_test.head(10)

Unnamed: 0,Age,Gender,Blood_Pressure,Cholesterol
485,-0.390011,Female,0.137542,-1.06881
924,0.804571,Female,-0.754531,1.487266
2356,1.473538,Male,1.304099,0.174687
1838,-1.584594,Female,0.755131,-1.29333
2847,0.087822,Female,0.583579,1.159121
331,-0.676711,Female,1.269788,1.452725
1682,-0.867844,Female,-1.166257,-0.654311
1942,-0.963411,Male,-0.205563,1.487266
1910,1.377971,Male,0.034611,-1.206976
510,0.040038,Male,-1.612293,-0.740665


## Encoding

In [None]:
# using one_hot encoding

In [None]:
x_train = pd.get_dummies(x_train, columns = cat_columns, drop_first = True)

In [136]:
x_test = pd.get_dummies(x_test, columns = cat_columns, drop_first = True)

In [137]:
x_train.shape

(2400, 4)

In [138]:
x_test.shape

(601, 4)

In [None]:
# we did not loose any important columns during data preprocessing