In [45]:
#Import Libaries
import os
import pandas as pd
import numpy as np

In [46]:
#Load Dataset
df = pd.read_csv("../data/raw/healthcare_dataset.csv")

print("Shape:", df.shape)
df.head()

Shape: (5000, 26)


Unnamed: 0,age,gender,admission_type,insurance_type,bmi,systolic_bp,diastolic_bp,blood_glucose,cholesterol,heart_rate,...,num_conditions,num_procedures,num_medications,icu_admission,risk_score,risk_category,length_of_stay,total_claim_cost,payer_coverage,patient_feedback
0,69,Male,Elective,Government,24.198234,152.149985,91.246475,165.344609,210.342343,88.465785,...,3,0,3,0,12,Medium,10.54815,21709.929543,0.6,Good treatment but waiting time was long.
1,32,Female,Elective,Government,26.123209,156.112911,74.901063,138.279525,225.434062,65.557307,...,2,0,6,0,7,Medium,7.630583,15452.94621,0.6,Unsatisfied with hospital service.
2,89,Male,Elective,Government,27.658751,94.454724,73.024929,99.496088,153.370278,98.588967,...,1,1,5,0,4,Low,4.674591,9315.500867,0.6,Average experience overall.
3,78,Male,Emergency,Private,28.812254,139.520216,85.082641,70.0,,68.357545,...,0,1,4,0,4,Low,9.685285,20720.812733,0.8,Average experience overall.
4,38,Male,Emergency,Private,26.925373,103.863573,70.255702,70.0,167.883664,73.221997,...,1,0,3,0,4,Low,9.780789,18763.532501,0.8,Unsatisfied with hospital service.


In [47]:
#Check for missing values
missing_percent = df.isnull().mean() * 100
print("Missing %:\n")
print(missing_percent[missing_percent > 0])

Missing %:

bmi              14.58
blood_glucose    14.84
cholesterol      15.04
hemoglobin       15.26
creatinine       14.72
dtype: float64


In [48]:
#Separate numerical and categorical columns
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

print("Numeric Columns:", len(numeric_cols))
print("Categorical Columns:", len(categorical_cols))

Numeric Columns: 20
Categorical Columns: 6


In [49]:
#Handle missing values
#For numeric columns, we can fill missing values with the median
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())
    #For categorical columns, we can fill missing values with the mode
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [50]:
#confirm no missing values remain
print("Remaining Missing:"),
df.isnull().sum()

Remaining Missing:


age                 0
gender              0
admission_type      0
insurance_type      0
bmi                 0
systolic_bp         0
diastolic_bp        0
blood_glucose       0
cholesterol         0
heart_rate          0
hemoglobin          0
creatinine          0
diabetes            0
hypertension        0
heart_disease       0
smoking_status      0
num_conditions      0
num_procedures      0
num_medications     0
icu_admission       0
risk_score          0
risk_category       0
length_of_stay      0
total_claim_cost    0
payer_coverage      0
patient_feedback    0
dtype: int64

In [51]:
#Outlier Capping (IQR method)
def cap_outliers(data, cols):
    for col in cols:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        
        data[col] = np.where(data[col] < lower, lower,
                      np.where(data[col] > upper, upper,
                               data[col]))
    return data

cols_to_cap = [col for col in numeric_cols if col not in ["length_of_stay"]]

df = cap_outliers(df, cols_to_cap)

print("Outlier capping completed.")

Outlier capping completed.


In [52]:
#Feature Engineering: Create new features
df["pulse_pressure"] = df["systolic_bp"] - df["diastolic_bp"]

In [53]:
#Clinical severity score (simple sum of abnormal values)
df["clinical_severity"] = (
    (df["systolic_bp"] > 150).astype(int) +
    (df["blood_glucose"] > 180).astype(int) +
    (df["creatinine"] > 1.4).astype(int)
)

In [54]:
#Final validation
print("Final Shape:", df.shape)
df.describe()

Final Shape: (5000, 28)


Unnamed: 0,age,bmi,systolic_bp,diastolic_bp,blood_glucose,cholesterol,heart_rate,hemoglobin,creatinine,diabetes,...,num_conditions,num_procedures,num_medications,icu_admission,risk_score,length_of_stay,total_claim_cost,payer_coverage,pulse_pressure,clinical_severity
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,53.299,27.147156,129.942771,80.002693,132.246074,198.703958,80.592537,13.504797,1.009944,0.2814,...,1.2499,1.4267,4.9612,0.0,6.2876,9.32361,20618.116692,0.6539,49.940078,0.3794
std,20.646851,4.527984,19.604935,11.670394,41.839578,35.59118,14.817683,1.323883,0.262766,0.449727,...,0.927347,1.095453,2.182716,0.0,3.828762,4.65667,9186.188903,0.175929,22.918818,0.578206
min,18.0,16.0,90.0,50.0,70.0,120.0,50.0,10.378678,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,500.0,0.3,-21.964775,0.0
25%,36.0,24.335094,115.892906,71.85334,102.361366,176.365058,70.125595,12.733692,0.843332,0.0,...,1.0,1.0,3.0,0.0,4.0,6.184493,14096.883475,0.6,33.858757,0.0
50%,53.0,27.151418,129.766795,80.090015,129.980502,199.10721,80.477799,13.50852,1.008332,0.0,...,1.0,1.0,5.0,0.0,6.0,8.532612,19377.151987,0.8,49.215468,0.0
75%,71.0,29.896743,143.56541,87.897914,158.698601,220.229083,90.912797,14.303702,1.166279,1.0,...,2.0,2.0,6.0,0.0,8.0,11.130218,25116.859285,0.8,65.182431,1.0
max,89.0,38.239218,185.074166,111.964775,243.204454,286.02512,122.093599,16.658717,1.6507,1.0,...,3.5,3.5,10.5,0.0,14.0,27.431176,41646.823,0.8,130.693228,3.0


In [55]:
os.makedirs("../data/processed", exist_ok=True)

df.to_csv("../data/processed/cleaned_dataset.csv", index=False)

print("Cleaned dataset saved successfully.")

Cleaned dataset saved successfully.
