# Importing Libraries and Loading Dataset



In [1]:
# Importing necessary libraries
import os
import re
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from adjustText import adjust_text
from datasets import load_dataset
%matplotlib inline

# Loading the dataset.
# The CSV location can be overridden with the HEALTHCARE_CSV environment variable so the
# notebook is not tied to one machine; the original local path stays the default.
DATA_PATH = os.environ.get(
    "HEALTHCARE_CSV",
    r"C:\Users\DELL\Desktop\Regonet_project\healthcare_dataset.csv",
)
health = pd.read_csv(DATA_PATH)

# Creating a copy of the dataset so the raw frame `health` is left untouched
health_1 = health.copy()

# Fixed random_state so the same 10 rows are shown on every run
health_1.sample(10, random_state=42)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
31641,mIchAEl thOrnTon mD,57,Male,O+,Diabetes,9/15/2023,Jason Hanson,Thornton-Roy,Medicare,3616.89845,339,Elective,10/2/2023,Aspirin,Inconclusive
9246,mattheW HUTcHiNsOn,51,Female,A+,Diabetes,10/7/2023,Jesse Gonzalez,Wilkerson-Lewis,Aetna,36970.07548,372,Emergency,10/14/2023,Penicillin,Abnormal
1583,RoNald paRK,20,Male,A+,Asthma,9/9/2019,Sarah Hernandez,Brown-Hughes,Blue Cross,44393.00135,148,Elective,10/8/2019,Penicillin,Inconclusive
36506,Jeff BroOkS,74,Female,B+,Obesity,9/14/2020,Cathy Sanchez,"Wilson, Alexander Wolf and",Aetna,27554.92371,135,Emergency,9/21/2020,Ibuprofen,Abnormal
11259,TAnya THoMPsOn,56,Male,AB-,Obesity,2/1/2023,Nancy Lee,"Winters, Blackburn Chandler and",Aetna,27466.31857,284,Emergency,2/7/2023,Paracetamol,Inconclusive
8972,EThan MItcHELL,46,Male,B+,Cancer,12/30/2021,Christina Hart,Rocha-Wagner,Medicare,4802.620714,175,Elective,1/27/2022,Paracetamol,Abnormal
36078,AMBer WRiGHt,44,Female,O+,Asthma,5/13/2023,Melissa Stephens,Friedman-Douglas,Blue Cross,40207.46322,336,Elective,6/9/2023,Paracetamol,Inconclusive
42659,mORGAn lAWreNce,46,Male,B-,Diabetes,6/20/2023,Paul Hansen,"Wells and Davila Cooper,",Medicare,30065.2606,212,Elective,7/16/2023,Ibuprofen,Abnormal
6545,MackEnZiE MAxwEll,64,Female,AB-,Arthritis,8/3/2019,Thomas Pratt,"and Alvarez, Cox Powers",Cigna,392.913548,113,Urgent,8/15/2019,Ibuprofen,Inconclusive
35448,Cindy ROGeRS,68,Male,B-,Cancer,3/15/2022,Calvin George,"Conley Jackson, and Hill",Medicare,20886.34215,274,Urgent,4/14/2022,Penicillin,Normal


# Dataset Inspection

In [2]:
# Summary of the frame: number of entries, per-column non-null counts and dtypes
health_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [3]:
# Displaying the column names as loaded from the CSV
health_1.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

# Data Cleaning

## Removing leading/trailing spaces and collapsing repeated whitespace in text-based columns

In [4]:
# Normalising whitespace in every text-based (object dtype) column:
# each run of whitespace becomes a single space, then leading/trailing spaces are stripped.
_WHITESPACE_RUN = re.compile(r'\s+')


def _normalise_spaces(value):
    """Collapse whitespace runs to one space and trim the ends; non-strings pass through unchanged."""
    if not isinstance(value, str):
        return value
    return _WHITESPACE_RUN.sub(' ', value).strip()


text_columns = health_1.select_dtypes(include='object').columns
for text_col in text_columns:
    health_1[text_col] = health_1[text_col].map(_normalise_spaces)

## Renaming columns to comply with Python Naming Convention

In [5]:
# Replace the spaces in multi-word column names with underscores.
# The result of rename() is re-bound instead of using inplace=True: the step stays
# chainable and the cell is safe to re-run, because rename() ignores mapping keys
# that are no longer present (default errors='ignore').
column_renames = {
    'Blood Type': 'Blood_Type',
    'Medical Condition': 'Medical_Condition',
    'Date of Admission': 'Date_of_Admission',
    'Insurance Provider': 'Insurance_Provider',
    'Billing Amount': 'Billing_Amount',
    'Room Number': 'Room_Number',
    'Admission Type': 'Admission_Type',
    'Discharge Date': 'Discharge_Date',
    'Test Results': 'Test_Results',
}
health_1 = health_1.rename(columns=column_renames)

## Checking the raw data in health dataset

In [6]:
# Print the first 20 records field by field. Every value is rendered with repr()
# (the !r conversion) so stray whitespace and inconsistent casing are visible.
inspect_columns = ['Name', 'Age', 'Gender', 'Blood_Type', 'Medical_Condition',
                   'Date_of_Admission', 'Doctor', 'Hospital', 'Insurance_Provider',
                   'Billing_Amount', 'Room_Number', 'Admission_Type', 'Discharge_Date',
                   'Medication', 'Test_Results']

# Continuation lines of each printed record are indented by 11 spaces.
continuation_indent = ' ' * 11
record_layout = ('\n' + continuation_indent).join([
    "Name: {Name!r}, Age: {Age!r}, Gender: {Gender!r},",
    "Blood_Type: {Blood_Type!r}, Medical_Condition: {Medical_Condition!r},",
    "Date_of_Admission: {Date_of_Admission!r}, Doctor: {Doctor!r},",
    "Hospital: {Hospital!r}, Insurance_Provider: {Insurance_Provider!r},",
    "Billing_Amount: {Billing_Amount!r}, Room_Number: {Room_Number!r},",
    "Admission_Type: {Admission_Type!r},Discharge_Date: {Discharge_Date!r},",
    "Medication: {Medication!r}, Test_Results: {Test_Results!r}",
])

for _, record in health_1[inspect_columns].head(20).iterrows():
    # format_map looks each field up by column label on the row Series
    print(record_layout.format_map(record))

Name: 'Bobby JacksOn', Age: 30, Gender: 'Male',
           Blood_Type: 'B-', Medical_Condition: 'Cancer',
           Date_of_Admission: '1/31/2024', Doctor: 'Matthew Smith',
           Hospital: 'Sons and Miller', Insurance_Provider: 'Blue Cross',
           Billing_Amount: 18856.28131, Room_Number: 328,
           Admission_Type: 'Urgent',Discharge_Date: '2/2/2024',
           Medication: 'Paracetamol', Test_Results: 'Normal'
Name: 'LesLie TErRy', Age: 62, Gender: 'Male',
           Blood_Type: 'A+', Medical_Condition: 'Obesity',
           Date_of_Admission: '8/20/2019', Doctor: 'Samantha Davies',
           Hospital: 'Kim Inc', Insurance_Provider: 'Medicare',
           Billing_Amount: 33643.32729, Room_Number: 265,
           Admission_Type: 'Emergency',Discharge_Date: '8/26/2019',
           Medication: 'Ibuprofen', Test_Results: 'Inconclusive'
Name: 'DaNnY sMitH', Age: 76, Gender: 'Female',
           Blood_Type: 'A-', Medical_Condition: 'Obesity',
           Date_of_Admission: '

In [7]:
# Confirm that the multi-word column names now use underscores instead of spaces
health_1.columns

Index(['Name', 'Age', 'Gender', 'Blood_Type', 'Medical_Condition',
       'Date_of_Admission', 'Doctor', 'Hospital', 'Insurance_Provider',
       'Billing_Amount', 'Room_Number', 'Admission_Type', 'Discharge_Date',
       'Medication', 'Test_Results'],
      dtype='object')

## Converting the Name column to title case

In [8]:
# Converting the Name column to title case (raw values have mixed casing, e.g. 'LesLie TErRy')
# NOTE(review): str.title() also changes suffixes such as 'mD' to 'Md'
# (the cleaned sample shows 'Michael Thornton Md') — confirm this is acceptable.
health_1['Name'] = health_1['Name'].str.title()

health_1['Name'].head(20)

0           Bobby Jackson
1            Leslie Terry
2             Danny Smith
3            Andrew Watts
4           Adrienne Bell
5           Emily Johnson
6          Edward Edwards
7      Christina Martinez
8         Jasmine Aguilar
9        Christopher Berg
10       Michelle Daniels
11         Aaron Martinez
12          Connor Hansen
13           Robert Bauer
14           Brooke Brady
15     Ms. Natalie Gamble
16          Haley Perkins
17    Mrs. Jamie Campbell
18           Luke Burgess
19         Daniel Schmidt
Name: Name, dtype: object

## Converting the Date of Admission and Discharge Date to ISO (YYYY-MM-DD) date format

In [9]:
# Parse both date columns and write them back as 'YYYY-MM-DD' text.
# errors='coerce' turns values that cannot be parsed into missing values instead of raising.
# Note: because of strftime the columns end up holding strings (object dtype), not datetime64.
for date_col in ('Date_of_Admission', 'Discharge_Date'):
    parsed_dates = pd.to_datetime(health_1[date_col], errors='coerce')
    health_1[date_col] = parsed_dates.dt.strftime('%Y-%m-%d')
print(health_1[['Date_of_Admission','Discharge_Date']].head(20))

   Date_of_Admission Discharge_Date
0         2024-01-31     2024-02-02
1         2019-08-20     2019-08-26
2         2022-09-22     2022-10-07
3         2020-11-18     2020-12-18
4         2022-09-19     2022-10-09
5         2023-12-20     2023-12-24
6         2020-11-03     2020-11-15
7         2021-12-28     2022-01-07
8         2020-07-01     2020-07-14
9         2021-05-23     2021-06-22
10        2020-04-19     2020-04-22
11        2023-08-13     2023-09-05
12        2019-12-12     2019-12-28
13        2020-05-22     2020-06-19
14        2021-10-08     2021-10-13
15        2023-01-01     2023-01-11
16        2020-06-23     2020-07-14
17        2020-03-08     2020-04-02
18        2021-03-04     2021-03-14
19        2022-11-15     2022-11-22


##  Cleaning Hospital Column

In [10]:
# Printing the raw values of the Hospital column; repr() makes stray commas and spaces visible
first_hospitals = health_1['Hospital'].head(20)
for raw_name in first_hospitals:
    print(repr(raw_name))

'Sons and Miller'
'Kim Inc'
'Cook PLC'
'Hernandez Rogers and Vang,'
'White-White'
'Nunez-Humphrey'
'Group Middleton'
'Powell Robinson and Valdez,'
'Sons Rich and'
'Padilla-Walker'
'Schaefer-Porter'
'Lyons-Blair'
'Powers Miller, and Flores'
'Rivera-Gutierrez'
'Morris-Arellano'
'Cline-Williams'
'Cervantes-Wells'
'Torres, and Harrison Jones'
'Houston PLC'
'Hammond Ltd'


In [11]:
# Cleaning the Hospital column in two steps: drop every comma, then trim the ends.
# NOTE(review): a leading/trailing 'and' (e.g. 'Sons Rich and') is left as is by this
# step — confirm whether it should be removed as well.
hospital_without_commas = health_1['Hospital'].str.replace(',', '', regex=False)
health_1['Hospital'] = hospital_without_commas.str.strip()

health_1['Hospital'].head(20)

0                Sons and Miller
1                        Kim Inc
2                       Cook PLC
3      Hernandez Rogers and Vang
4                    White-White
5                 Nunez-Humphrey
6                Group Middleton
7     Powell Robinson and Valdez
8                  Sons Rich and
9                 Padilla-Walker
10               Schaefer-Porter
11                   Lyons-Blair
12      Powers Miller and Flores
13              Rivera-Gutierrez
14               Morris-Arellano
15                Cline-Williams
16               Cervantes-Wells
17     Torres and Harrison Jones
18                   Houston PLC
19                   Hammond Ltd
Name: Hospital, dtype: object

## Converting the Billing Amount Column to 2 Decimal Places

In [12]:
# Round every billing amount to two decimal places, then preview the first 10 values
rounded_billing = health_1['Billing_Amount'].round(decimals=2)
health_1['Billing_Amount'] = rounded_billing
health_1['Billing_Amount'].head(10)

0    18856.28
1    33643.33
2    27955.10
3    37909.78
4    14238.32
5    48145.11
6    19580.87
7    45820.46
8    50119.22
9    19784.63
Name: Billing_Amount, dtype: float64

## Checking the Final Healthcare Dataset After Cleaning

In [13]:
# Random sample of 30 cleaned rows; the fixed random_state makes the selection reproducible
health_1.sample(30, random_state=42)

Unnamed: 0,Name,Age,Gender,Blood_Type,Medical_Condition,Date_of_Admission,Doctor,Hospital,Insurance_Provider,Billing_Amount,Room_Number,Admission_Type,Discharge_Date,Medication,Test_Results
31641,Michael Thornton Md,57,Male,O+,Diabetes,2023-09-15,Jason Hanson,Thornton-Roy,Medicare,3616.9,339,Elective,2023-10-02,Aspirin,Inconclusive
9246,Matthew Hutchinson,51,Female,A+,Diabetes,2023-10-07,Jesse Gonzalez,Wilkerson-Lewis,Aetna,36970.08,372,Emergency,2023-10-14,Penicillin,Abnormal
1583,Ronald Park,20,Male,A+,Asthma,2019-09-09,Sarah Hernandez,Brown-Hughes,Blue Cross,44393.0,148,Elective,2019-10-08,Penicillin,Inconclusive
36506,Jeff Brooks,74,Female,B+,Obesity,2020-09-14,Cathy Sanchez,Wilson Alexander Wolf and,Aetna,27554.92,135,Emergency,2020-09-21,Ibuprofen,Abnormal
11259,Tanya Thompson,56,Male,AB-,Obesity,2023-02-01,Nancy Lee,Winters Blackburn Chandler and,Aetna,27466.32,284,Emergency,2023-02-07,Paracetamol,Inconclusive
8972,Ethan Mitchell,46,Male,B+,Cancer,2021-12-30,Christina Hart,Rocha-Wagner,Medicare,4802.62,175,Elective,2022-01-27,Paracetamol,Abnormal
36078,Amber Wright,44,Female,O+,Asthma,2023-05-13,Melissa Stephens,Friedman-Douglas,Blue Cross,40207.46,336,Elective,2023-06-09,Paracetamol,Inconclusive
42659,Morgan Lawrence,46,Male,B-,Diabetes,2023-06-20,Paul Hansen,Wells and Davila Cooper,Medicare,30065.26,212,Elective,2023-07-16,Ibuprofen,Abnormal
6545,Mackenzie Maxwell,64,Female,AB-,Arthritis,2019-08-03,Thomas Pratt,and Alvarez Cox Powers,Cigna,392.91,113,Urgent,2019-08-15,Ibuprofen,Inconclusive
35448,Cindy Rogers,68,Male,B-,Cancer,2022-03-15,Calvin George,Conley Jackson and Hill,Medicare,20886.34,274,Urgent,2022-04-14,Penicillin,Normal


In [14]:
# Re-check the structure after cleaning: non-null counts and dtypes per column.
# The two date columns remain object dtype because they were written back as strings via strftime.
health_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood_Type          55500 non-null  object 
 4   Medical_Condition   55500 non-null  object 
 5   Date_of_Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance_Provider  55500 non-null  object 
 9   Billing_Amount      55500 non-null  float64
 10  Room_Number         55500 non-null  int64  
 11  Admission_Type      55500 non-null  object 
 12  Discharge_Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test_Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4