## 1. Importing Required Libraries

In [10]:
from google.colab import drive
drive.mount('/content/drive')

from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Loading the Dataset

In [3]:
filePath = '/content/drive/My Drive/DataSets/healthcare_dataset.csv'
data = pd.read_csv(filePath)
data.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


## 3. Data Cleaning

## 3.1. Handling Inconsistent Casing

In [4]:
data['Name'] = data['Name'].str.title()
data['Doctor'] = data['Doctor'].str.title()
data['Hospital'] = data['Hospital'].str.title()

print(data[['Name', 'Doctor', 'Hospital']].head())

            Name            Doctor                    Hospital
0  Bobby Jackson     Matthew Smith             Sons And Miller
1   Leslie Terry   Samantha Davies                     Kim Inc
2    Danny Smith  Tiffany Mitchell                    Cook Plc
3   Andrew Watts       Kevin Wells  Hernandez Rogers And Vang,
4  Adrienne Bell    Kathleen Hanna                 White-White


## 3.2. Convert Date Columns to datetime Format

In [5]:
data['Date of Admission'] = pd.to_datetime(data['Date of Admission'])
data['Discharge Date'] = pd.to_datetime(data['Discharge Date'])

print(data[['Date of Admission', 'Discharge Date']].dtypes)

Date of Admission    datetime64[ns]
Discharge Date       datetime64[ns]
dtype: object


## 4. Feature Engineering

## 4.1. Calculate Hospital Stay Duration

In [6]:
data['Stay Duration'] = (data['Discharge Date'] - data['Date of Admission']).dt.days

print(data[['Date of Admission', 'Discharge Date', 'Stay Duration']].head())

  Date of Admission Discharge Date  Stay Duration
0        2024-01-31     2024-02-02              2
1        2019-08-20     2019-08-26              6
2        2022-09-22     2022-10-07             15
3        2020-11-18     2020-12-18             30
4        2022-09-19     2022-10-09             20


## 4.2. Creating Age Groups

In [7]:
ageBins = [0, 18, 35, 50, 65, 100]
ageLabels = ['0-18', '19-35', '36-50', '51-65', '66+']

data['Age Group'] = pd.cut(data['Age'], bins = ageBins, labels = ageLabels, right = False)

print(data[['Age', 'Age Group']].head())

   Age Age Group
0   30     19-35
1   62     51-65
2   76       66+
3   28     19-35
4   43     36-50


## 5. Handling Categorical Data

## 5.1. Encoding Categorical Variables

In [8]:
dataEncoded = pd.get_dummies(data, columns=['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Test Results'])

print(dataEncoded.head())

            Name  Age Date of Admission            Doctor  \
0  Bobby Jackson   30        2024-01-31     Matthew Smith   
1   Leslie Terry   62        2019-08-20   Samantha Davies   
2    Danny Smith   76        2022-09-22  Tiffany Mitchell   
3   Andrew Watts   28        2020-11-18       Kevin Wells   
4  Adrienne Bell   43        2022-09-19    Kathleen Hanna   

                     Hospital Insurance Provider  Billing Amount  Room Number  \
0             Sons And Miller         Blue Cross    18856.281306          328   
1                     Kim Inc           Medicare    33643.327287          265   
2                    Cook Plc              Aetna    27955.096079          205   
3  Hernandez Rogers And Vang,           Medicare    37909.782410          450   
4                 White-White              Aetna    14238.317814          458   

  Discharge Date   Medication  ...  Medical Condition_Cancer  \
0     2024-02-02  Paracetamol  ...                      True   
1     2019-08-26  

## 6. Handling Missing Data

In [9]:
print(data.isnull().sum())

dataCleaned = data.dropna()

dataFilled = data.fillna(method = 'ffill')

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
Stay Duration         0
Age Group             0
dtype: int64


  dataFilled = data.fillna(method = 'ffill')


## 7. Scaling Numerical Data

In [11]:
scaler = MinMaxScaler()

data['Billing Amount Scaled'] = scaler.fit_transform(data[['Billing Amount']])

print(data[['Billing Amount', 'Billing Amount Scaled']].head())

   Billing Amount  Billing Amount Scaled
0    18856.281306               0.380933
1    33643.327287               0.650904
2    27955.096079               0.547053
3    37909.782410               0.728798
4    14238.317814               0.296622


## 8. Handling Duplicate Data

In [12]:
print(data.duplicated().sum())

data = data.drop_duplicates()

534


## Final Processed Data

In [13]:
data.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,Stay Duration,Age Group,Billing Amount Scaled
0,Bobby Jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons And Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal,2,19-35,0.380933
1,Leslie Terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive,6,51-65,0.650904
2,Danny Smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook Plc,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal,15,66+,0.547053
3,Andrew Watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers And Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal,30,19-35,0.728798
4,Adrienne Bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal,20,36-50,0.296622
