In [1]:
# @title
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Path-Tracing Tools for Line/Curve-Drawing
import matplotlib.path as mpath

# Tools for Text Imposition
from matplotlib.text import Text

# Line/Curve-Styling Tools
import matplotlib.lines as mlines

# Specialized Line-Drawing Utility
from matplotlib.lines import Line2D

# 2D Polygon-Drawing Tools
import matplotlib.patches as mpatches

# Grid Structuring Tools
import matplotlib.gridspec as gridspec

# 3D Plotting Utilities
from mpl_toolkits.mplot3d import Axes3D

# Tick Locating/Formatting Tools
from matplotlib.ticker import FuncFormatter

# Specialized Polygon Imposition Tools
from matplotlib.patches import Rectangle, Polygon

# Collection of Prebuilt Patch Objects
from matplotlib.collections import PatchCollection

In [2]:
# Relative path of the CSV file holding the healthcare records
DATAPATH = "healthcare_dataset.csv"

# Parse the CSV into a DataFrame bound to `dataset`
dataset = pd.read_csv(filepath_or_buffer=DATAPATH)


In [3]:
# Column names of the loaded dataset, as a plain Python list
FEATURES = dataset.columns.to_list()

# Echo the list as the cell's output
FEATURES

['Name',
 'Age',
 'Gender',
 'Blood Type',
 'Medical Condition',
 'Date of Admission',
 'Doctor',
 'Hospital',
 'Insurance Provider',
 'Billing Amount',
 'Room Number',
 'Admission Type',
 'Discharge Date',
 'Medication',
 'Test Results']

In [4]:
# Relative path of the CSV file used to persist the data dictionary
REFPATH = "healthcare_reference.csv"

# Human-readable description of each dataset column, keyed by column name.
# Entries are listed in dataset column order so that DESCRIPTORS (below)
# lines up position-by-position with FEATURES.
_DESCRIPTOR_BY_FEATURE = {
    "Name":
        "Name of the patient associated with the healthcare record",
    "Age":
        "Age of the patient at the time of admission, expressed in years",
    "Gender":
        "Gender of the patient, either 'Male' or 'Female'",
    "Blood Type":
        "Patient's blood type, which can be one of the common blood types (e.g., 'A+', 'O-', etc.)",
    "Medical Condition":
        "Specifies the primary medical condition or diagnosis associated with the patient, such as 'Diabetes', 'Hypertension', 'Asthma', and more",
    "Date of Admission":
        "Date on which the patient was admitted to the healthcare facility",
    "Doctor":
        "The name of the doctor responsible for the patient's care during their admission",
    "Hospital":
        "Identifies the healthcare facility or hospital where the patient was admitted",
    "Insurance Provider":
        "Indicates the patient's insurance provider, which can be one of several options, including 'Aetna', 'Blue Cross', 'Cigna', 'UnitedHealthcare', and 'Medicare'",
    "Billing Amount":
        "Amount of money billed for the patient's healthcare services during their admission. This is expressed as a floating-point number",
    "Room Number":
        "The room number where the patient was accommodated during their admission",
    "Admission Type":
        "Specifies the type of admission, which can be 'Emergency', 'Elective', or 'Urgent', reflecting the circumstances of the admission",
    "Discharge Date":
        "The date on which the patient was discharged from the healthcare facility, based on the admission date and a random number of days within a realistic range",
    "Medication":
        "Identifies a medication prescribed or administered to the patient during their admission. Examples include 'Aspirin', 'Ibuprofen', 'Penicillin', 'Paracetamol', and 'Lipitor'",
    "Test Results":
        "Describes the results of a medical test conducted during the patient's admission. Possible values include 'Normal', 'Abnormal', or 'Inconclusive', indicating the outcome of the test",
}

# Ordered list of the descriptions alone (dicts preserve insertion order)
DESCRIPTORS = list(_DESCRIPTOR_BY_FEATURE.values())

In [5]:
# @title
def operate_data_dictionary(features, descriptors, method="set", refpath=None):
  """Create ("set") or load ("get") the data dictionary reference table.

  Args:
    features: Iterable of feature (column) names. Used only when
      method="set".
    descriptors: Iterable of descriptions, one per feature and in the same
      order as `features`. Used only when method="set".
    method: "set" builds a one-row table with one column per feature and,
      when `refpath` is a string, writes it to that path as CSV. "get" reads
      the table back from `refpath`.
    refpath: Path of the CSV file, given as a string.

  Returns:
    The DataFrame read from `refpath` when method="get"; None when
    method="set".

  Raises:
    ValueError: If `method` is neither "set" nor "get", or if `features` and
      `descriptors` differ in length.
    TypeError: If method="get" and `refpath` is not a string.
  """
  # Fail loudly on a mistyped method instead of silently doing nothing
  if method not in ("set", "get"):
    raise ValueError(
        "Unknown method {!r}; expected 'set' or 'get'.".format(method))

  has_refpath = isinstance(refpath, str)

  if method == "set":
    # Use the arguments passed by the caller, not notebook-level globals
    features = list(features)
    descriptors = list(descriptors)
    # zip() would silently truncate to the shorter input, dropping entries
    if len(features) != len(descriptors):
      raise ValueError(
          "features and descriptors must have the same length "
          "(got {} and {}).".format(len(features), len(descriptors)))
    # Produce dictionary-wrapped key-value associations of feature summaries
    data_dictionary = dict(zip(features, descriptors))
    # Convert data dictionary to a one-row reference table
    reference = pd.DataFrame(data_dictionary, index=[0])
    # Save reference table for future access
    if has_refpath:
      reference.to_csv(refpath, index=False)
    return None

  # method == "get": read the reference table from the saved data dictionary
  if not has_refpath:
    raise TypeError("Saved file for data dictionary not found.")
  return pd.read_csv(refpath)

def encode_categorical_feature(dataset, feature, encoding="label"):
  """Encode one categorical feature of `dataset` in place; return the encoder.

  Args:
    dataset: Container indexed by `feature` (presumably a pandas DataFrame);
      `dataset[feature]` is overwritten with the encoded values.
    feature: Key/column name of the feature to encode.
    encoding: Name of the encoding scheme. Only "label" is supported.

  Returns:
    The fitted encoder instance, so the mapping can be reused later.

  Raises:
    ValueError: If `encoding` is not a supported scheme.
  """
  # Reject unsupported schemes up front: otherwise `encoder` would never be
  # bound and the transform below would fail with an UnboundLocalError.
  if encoding != "label":
    raise ValueError(
        "Unsupported encoding {!r}; expected 'label'.".format(encoding))
  # NOTE(review): LabelEncoder is not imported in the visible notebook cells;
  # presumably sklearn.preprocessing.LabelEncoder. Confirm it is imported
  # before this function is called.
  encoder = LabelEncoder()
  # Transform dataset feature using labeling schema (overwrites the column)
  dataset[feature] = encoder.fit_transform(dataset[feature])
  # Hand back the fitted encoder
  return encoder

In [6]:
# Build the data dictionary and write it to REFPATH as a reference table
operate_data_dictionary(FEATURES, DESCRIPTORS, method="set", refpath=REFPATH)

In [7]:
# Read the data dictionary from REFPATH into a reference table
reference = operate_data_dictionary(FEATURES, DESCRIPTORS, method="get", refpath=REFPATH)

In [8]:
# Display the reference table transposed, so each feature becomes a row
reference.transpose()

Unnamed: 0,0
Name,Name of the patient associated with the health...
Age,"Age of the patient at the time of admission, e..."
Gender,"Gender of the patient, either 'Male' or 'Female'"
Blood Type,"Patient's blood type, which can be one of the ..."
Medical Condition,Specifies the primary medical condition or dia...
Date of Admission,Date on which the patient was admitted to the ...
Doctor,The name of the doctor responsible for the pat...
Hospital,Identifies the healthcare facility or hospital...
Insurance Provider,"Indicates the patient's insurance provider, wh..."
Billing Amount,Amount of money billed for the patient's healt...


In [9]:
# Read both dimensions from the DataFrame's shape in one step: (rows, columns)
rows, cols = dataset.shape

print("Number of Rows: ", rows)
print("Number of Columns: ", cols)

Number of Rows:  55500
Number of Columns:  15


In [10]:
# Tally missing values per column. The result is bound to a name because a
# bare expression in the middle of a cell is evaluated and then discarded.
missing_counts = dataset.isna().sum()

# Names of columns to remove before analysis; empty means "drop nothing".
# NOTE: this must be a list of column labels. Passing the builtin `all` as
# `columns` makes pandas raise a KeyError, so no column would be dropped.
COLUMNS_TO_DROP = []

# errors="ignore" skips labels that are already absent, so the cell can be
# re-run safely without a try/except that would also hide real errors.
# dropna(axis=0) then removes every row containing a missing value.
dataset = (
    dataset
    .drop(columns=COLUMNS_TO_DROP, errors="ignore")
    .dropna(axis=0)
)

# Summarize column dtypes and non-null counts of the cleaned dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4