In [None]:
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import re

In [None]:
# File paths
balanced_path = 'balance_data.csv'
balanced_data = pd.read_csv(balanced_path)

In [None]:
# Step 1: Replace separators to ensure consistent delimiter
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'[\\/]', ';', regex=True)
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r',', ';', regex=True)

# Step 2: Apply corrections for known misspellings and abbreviations
corrections = {
    r'\brenal failiure\b': 'renal failure',
    r'\baromegley\b': 'acromegaly',
    r'\bvf arrest\b': 'ventricular fibrillation arrest',
    r'\bs/p\b': 'status post',
    r'\bcath\b': 'catheterization',
    r'\bmi\b': 'myocardial infarction',
    r'\bchf\b': 'congestive heart failure',
    r'\boa\b': 'osteoarthritis'
}
for pattern, replacement in corrections.items():
    balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(pattern, replacement, regex=True)

# Step 3: Map similar diagnoses to standard terms using word boundaries to prevent partial matches
standard_terms = {
    r'\bsepsis\b': 'sepsis',
    r'\burinary tract infection\b|\buti\b': 'urinary tract infection',
    r'\bheart failure\b|\bcongestive heart failure\b': 'heart failure',
    r'\bstroke\b|\bcva\b|\btia\b': 'stroke',
    r'\bcopd\b|\bchronic obstructive pulmonary disease\b': 'COPD',
    r'\bkidney failure\b|\brenal failure\b': 'renal failure',
    r'\bventricular fibrillation arrest\b': 'cardiac arrest',
    r'\bhypertension\b|\bhigh blood pressure\b': 'hypertension',
    r'\bpneumonia\b': 'pneumonia',
    r'\bbrain metastasis\b|\bmetastatic brain tumor\b': 'brain metastasis',
    r'\bmyocardial infarction\b': 'myocardial infarction'
}
for pattern, replacement in standard_terms.items():
    balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(pattern, replacement, regex=True)

# Step 4: Remove extra spaces and set to lowercase
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

# Step 5: Remove duplicate diagnoses within each entry (e.g., "sepsis; sepsis" becomes "sepsis")
def remove_duplicates(diagnosis):
    conditions = diagnosis.split(';')
    unique_conditions = sorted(set([cond.strip() for cond in conditions if cond]))
    return '; '.join(unique_conditions)

balanced_data['diagnosis'] = balanced_data['diagnosis'].apply(remove_duplicates)

# Display unique values to verify
print(balanced_data['diagnosis'].unique())

['sepsis' 'hepatitis b' 'humeral fracture' 'alcoholic hepatitis' 'stroke'
 'coronary artery bypass graft with mvr ? mitral valve replacement; coronary artery disease; mitral regurgitation; sda'
 'syncope; telemetry' 'right humerous fracture'
 'renal failure-syncope-hyperkalemia'
 'pre hydration; recurrent left carotid stenosis' 'failure to thrive'
 'catheterization; pulmonary edema' 'unstable angina'
 'respiratory distress' 'brain metastasis; metastatic melanoma' 'fever'
 'brain metastases' 'lower gi bleed' 'catheterization; chest pain'
 'p fall; s; subdural hematoma' 'esophageal cancer; sda'
 'p motorcycle accident; s' 'seizure' 'gastrointestinal bleed'
 'lung cancer; shortness of breath' 'hypotension' 'heart failure'
 'urosepsis' 'basal ganglin bleed' 'overdose'
 'critical aortic stenosis; hypotension' 'sepsis; telemetry'
 'status post motor vehicle accident with injuries' 'tachypnea; telemetry'
 'chronic myelogenous leukemia; transfusion reaction'
 'hyponatremia; urinary tract infec

In [None]:
# Further corrections for remaining abbreviations and fragments
additional_corrections = {
    r'\bp fall\b': 'fall',
    r'\bp motorcycle accident\b': 'motorcycle accident',
    r'\bp motor vehicle accident\b': 'motor vehicle accident',
    r'\bs\b': '',  # Remove standalone "s" entries left from partial replacements
    r'\bhypoglcemia\b': 'hypoglycemia',
    r'\bmetastic\b': 'metastatic'
}

# Apply additional corrections
for pattern, replacement in additional_corrections.items():
    balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(pattern, replacement, regex=True)

# Remove extra spaces and lowercase for final cleanup
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

# Verify the final unique values
print(balanced_data['diagnosis'].unique())


['sepsis' 'hepatitis b' 'humeral fracture' 'alcoholic hepatitis' 'stroke'
 'coronary artery bypass graft with mvr ? mitral valve replacement; coronary artery disease; mitral regurgitation; sda'
 'syncope; telemetry' 'right humerous fracture'
 'renal failure-syncope-hyperkalemia'
 'pre hydration; recurrent left carotid stenosis' 'failure to thrive'
 'catheterization; pulmonary edema' 'unstable angina'
 'respiratory distress' 'brain metastasis; metastatic melanoma' 'fever'
 'brain metastases' 'lower gi bleed' 'catheterization; chest pain'
 'fall; ; subdural hematoma' 'esophageal cancer; sda'
 'motorcycle accident;' 'seizure' 'gastrointestinal bleed'
 'lung cancer; shortness of breath' 'hypotension' 'heart failure'
 'urosepsis' 'basal ganglin bleed' 'overdose'
 'critical aortic stenosis; hypotension' 'sepsis; telemetry'
 'status post motor vehicle accident with injuries' 'tachypnea; telemetry'
 'chronic myelogenous leukemia; transfusion reaction'
 'hyponatremia; urinary tract infection' '

In [None]:
# Remove trailing and isolated semicolons
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r';\s*$', '', regex=True)  # Remove trailing semicolons
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\b;\b', '', regex=True)  # Remove isolated semicolons

# Additional corrections for remaining misspellings and specific cases
final_corrections = {
    r'\bbasal ganglin bleed\b': 'basal ganglia bleed',
    r'\bhypotension-syncope-hyperkalemia\b': 'hypotension; syncope; hyperkalemia',
    r'\besophageal ca\b': 'esophageal cancer',
    r'\binferior myocardial infarction\b': 'myocardial infarction',
    r'\bacromegly\b': 'acromegaly'
}

# Apply the final corrections
for pattern, replacement in final_corrections.items():
    balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(pattern, replacement, regex=True)

# Final whitespace cleanup
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

# Verify the final unique values
print(balanced_data['diagnosis'].unique())

['sepsis' 'hepatitis b' 'humeral fracture' 'alcoholic hepatitis' 'stroke'
 'coronary artery bypass graft with mvr ? mitral valve replacement; coronary artery disease; mitral regurgitation; sda'
 'syncope; telemetry' 'right humerous fracture'
 'renal failure-syncope-hyperkalemia'
 'pre hydration; recurrent left carotid stenosis' 'failure to thrive'
 'catheterization; pulmonary edema' 'unstable angina'
 'respiratory distress' 'brain metastasis; metastatic melanoma' 'fever'
 'brain metastases' 'lower gi bleed' 'catheterization; chest pain'
 'fall; ; subdural hematoma' 'esophageal cancer; sda'
 'motorcycle accident' 'seizure' 'gastrointestinal bleed'
 'lung cancer; shortness of breath' 'hypotension' 'heart failure'
 'urosepsis' 'basal ganglia bleed' 'overdose'
 'critical aortic stenosis; hypotension' 'sepsis; telemetry'
 'status post motor vehicle accident with injuries' 'tachypnea; telemetry'
 'chronic myelogenous leukemia; transfusion reaction'
 'hyponatremia; urinary tract infection' 'h

In [None]:
# Dictionary of abbreviations to their full forms
abbreviation_mapping = {
    r'\bmvr\b': 'mitral valve replacement',
    r'\bcabg\b': 'coronary artery bypass graft',
    # Add other abbreviations if needed
}

# Replace abbreviations with their full forms
for abbr, full_form in abbreviation_mapping.items():
    balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(abbr, full_form, regex=True)

# Remove any lingering "?" and extra whitespace
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\?', '', regex=True)
balanced_data['diagnosis'] = balanced_data['diagnosis'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

# Verify final unique values
print(balanced_data['diagnosis'].unique())


['sepsis' 'hepatitis b' 'humeral fracture' 'alcoholic hepatitis' 'stroke'
 'coronary artery bypass graft with mitral valve replacement mitral valve replacement; coronary artery disease; mitral regurgitation; sda'
 'syncope; telemetry' 'right humerous fracture'
 'renal failure-syncope-hyperkalemia'
 'pre hydration; recurrent left carotid stenosis' 'failure to thrive'
 'catheterization; pulmonary edema' 'unstable angina'
 'respiratory distress' 'brain metastasis; metastatic melanoma' 'fever'
 'brain metastases' 'lower gi bleed' 'catheterization; chest pain'
 'fall; ; subdural hematoma' 'esophageal cancer; sda'
 'motorcycle accident' 'seizure' 'gastrointestinal bleed'
 'lung cancer; shortness of breath' 'hypotension' 'heart failure'
 'urosepsis' 'basal ganglia bleed' 'overdose'
 'critical aortic stenosis; hypotension' 'sepsis; telemetry'
 'status post motor vehicle accident with injuries' 'tachypnea; telemetry'
 'chronic myelogenous leukemia; transfusion reaction'
 'hyponatremia; urinary 

In [None]:
# Function to remove duplicate phrases within each diagnosis
def remove_duplicate_phrases(diagnosis):
    phrases = diagnosis.split(';')
    unique_phrases = sorted(set([phrase.strip() for phrase in phrases if phrase.strip()]))
    return '; '.join(unique_phrases)

# Apply the function to the 'diagnosis' column
balanced_data['diagnosis'] = balanced_data['diagnosis'].apply(remove_duplicate_phrases)

# Verify the final unique values to confirm duplicates are removed
print(balanced_data['diagnosis'].unique())


['sepsis' 'hepatitis b' 'humeral fracture' 'alcoholic hepatitis' 'stroke'
 'coronary artery bypass graft with mitral valve replacement mitral valve replacement; coronary artery disease; mitral regurgitation; sda'
 'syncope; telemetry' 'right humerous fracture'
 'renal failure-syncope-hyperkalemia'
 'pre hydration; recurrent left carotid stenosis' 'failure to thrive'
 'catheterization; pulmonary edema' 'unstable angina'
 'respiratory distress' 'brain metastasis; metastatic melanoma' 'fever'
 'brain metastases' 'lower gi bleed' 'catheterization; chest pain'
 'fall; subdural hematoma' 'esophageal cancer; sda' 'motorcycle accident'
 'seizure' 'gastrointestinal bleed' 'lung cancer; shortness of breath'
 'hypotension' 'heart failure' 'urosepsis' 'basal ganglia bleed'
 'overdose' 'critical aortic stenosis; hypotension' 'sepsis; telemetry'
 'status post motor vehicle accident with injuries' 'tachypnea; telemetry'
 'chronic myelogenous leukemia; transfusion reaction'
 'hyponatremia; urinary tra

In [None]:
balanced_data.emergency_duration_label

Unnamed: 0,emergency_duration_label
0,moderate duration of 6.283333333333333 hours
1,short duration of 0.0 hours
2,short duration of 0.0 hours
3,moderate duration of 7.616666666666666 hours
4,short duration of 0.0 hours
...,...
231,moderate duration of 7.65 hours
232,moderate duration of 4.183333333333334 hours
233,short duration of 0.0 hours
234,moderate duration of 6.716666666666667 hours


In [None]:
# Round the duration to two decimal places and update the label
balanced_data['emergency_duration_label'] = balanced_data['emergency_duration_label'].apply(
    lambda x: ' '.join([x.split()[0], x.split()[1], "of", f"{float(x.split()[3]):.2f}", "hours"])
)

# Display a few rows to verify
print(balanced_data['emergency_duration_label'].head())

0    moderate duration of 6.28 hours
1       short duration of 0.00 hours
2       short duration of 0.00 hours
3    moderate duration of 7.62 hours
4       short duration of 0.00 hours
Name: emergency_duration_label, dtype: object


In [None]:
# Function to format emergency duration label
def format_duration_label(label):
    parts = label.split()
    duration = float(parts[3])
    if duration == 0.0:
        return "none"
    else:
        return f"{parts[0]} {parts[1]} of {duration:.2f} hours"

# Apply the function to update the column
balanced_data['emergency_duration_label'] = balanced_data['emergency_duration_label'].apply(format_duration_label)

# Display a few rows to verify the result
print(balanced_data['emergency_duration_label'].head())


0    moderate duration of 6.28 hours
1                               none
2                               none
3    moderate duration of 7.62 hours
4                               none
Name: emergency_duration_label, dtype: object


In [None]:
balanced_data.head(10)

Unnamed: 0,subject_id,hadm_id,admission_type,admission_location,discharge_location,insurance,marital_status,diagnosis,has_chartevents_data,readmitted_within_30_days,...,vital_mean_223762,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,10006.0,142345.0,emergency,emergency room admission,home health care,medicare,separated,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 9 days,moderate medication diversity of 34 unique med...,moderate duration of 6.28 hours
1,10011.0,105331.0,emergency,transfer from another hospital or external fac...,deceased,private,single,hepatitis b,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 14 days,low medication diversity of 0 unique medications,none
2,10013.0,165520.0,emergency,transfer from another hospital or external fac...,deceased,medicare,unknown,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,short stay of 3 days,low medication diversity of 20 unique medications,none
3,10017.0,199207.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,humeral fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 8 days,high medication diversity of 52 unique medicat...,moderate duration of 7.62 hours
4,10019.0,177759.0,emergency,transfer from another hospital or external fac...,deceased,medicare,divorced,alcoholic hepatitis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,short stay of 1 days,moderate medication diversity of 32 unique med...,none
5,10026.0,103770.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,unknown,stroke,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,medium stay of 7 days,low medication diversity of 26 unique medications,moderate duration of 6.67 hours
6,10027.0,199395.0,elective,physician referral / normal delivery,skilled nursing facility,medicare,married,coronary artery bypass graft with mitral valve...,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 12 days,moderate medication diversity of 45 unique med...,none
7,10029.0,132349.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,syncope; telemetry,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 10 days,moderate medication diversity of 32 unique med...,moderate duration of 5.78 hours
8,10032.0,140372.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,widowed,right humerous fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 13 days,moderate medication diversity of 30 unique med...,moderate duration of 5.73 hours
9,10033.0,157235.0,emergency,emergency room admission,skilled nursing facility,medicare,married,renal failure-syncope-hyperkalemia,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,medium stay of 4 days,low medication diversity of 21 unique medications,moderate duration of 7.90 hours


In [None]:
# Function to format medication diversity label
def format_medication_diversity_label(label):
    parts = label.split()
    diversity_count = int(parts[4])
    if diversity_count == 0:
        return "none"
    else:
        return f"{parts[0]} {parts[1]} {parts[2]} of {diversity_count} unique medications"

# Apply the function to update the column
balanced_data['medication_diversity_label'] = balanced_data['medication_diversity_label'].apply(format_medication_diversity_label)

# Display a few rows to verify the result
print(balanced_data['medication_diversity_label'].head())

0    moderate medication diversity of 34 unique med...
1                                                 none
2    low medication diversity of 20 unique medications
3    high medication diversity of 52 unique medicat...
4    moderate medication diversity of 32 unique med...
Name: medication_diversity_label, dtype: object


In [None]:
balanced_data.head(10)

Unnamed: 0,subject_id,hadm_id,admission_type,admission_location,discharge_location,insurance,marital_status,diagnosis,has_chartevents_data,readmitted_within_30_days,...,vital_mean_223762,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,10006.0,142345.0,emergency,emergency room admission,home health care,medicare,separated,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 9 days,moderate medication diversity of 34 unique med...,moderate duration of 6.28 hours
1,10011.0,105331.0,emergency,transfer from another hospital or external fac...,deceased,private,single,hepatitis b,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 14 days,none,none
2,10013.0,165520.0,emergency,transfer from another hospital or external fac...,deceased,medicare,unknown,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,short stay of 3 days,low medication diversity of 20 unique medications,none
3,10017.0,199207.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,humeral fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 8 days,high medication diversity of 52 unique medicat...,moderate duration of 7.62 hours
4,10019.0,177759.0,emergency,transfer from another hospital or external fac...,deceased,medicare,divorced,alcoholic hepatitis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,short stay of 1 days,moderate medication diversity of 32 unique med...,none
5,10026.0,103770.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,unknown,stroke,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,medium stay of 7 days,low medication diversity of 26 unique medications,moderate duration of 6.67 hours
6,10027.0,199395.0,elective,physician referral / normal delivery,skilled nursing facility,medicare,married,coronary artery bypass graft with mitral valve...,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 12 days,moderate medication diversity of 45 unique med...,none
7,10029.0,132349.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,syncope; telemetry,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 10 days,moderate medication diversity of 32 unique med...,moderate duration of 5.78 hours
8,10032.0,140372.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,widowed,right humerous fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,long stay of 13 days,moderate medication diversity of 30 unique med...,moderate duration of 5.73 hours
9,10033.0,157235.0,emergency,emergency room admission,skilled nursing facility,medicare,married,renal failure-syncope-hyperkalemia,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,0.0,medium stay of 4 days,low medication diversity of 21 unique medications,moderate duration of 7.90 hours


In [None]:
# Function to format prior admissions
def format_prior_admissions(count):
    if count == 0:
        return "no"
    else:
        return f"{count} times"

# Apply the function to update the column
balanced_data['prior_admissions'] = balanced_data['prior_admissions'].apply(format_prior_admissions)

# Display a few rows to verify the result
print(balanced_data['prior_admissions'].head())


0    no
1    no
2    no
3    no
4    no
Name: prior_admissions, dtype: object


In [None]:
balanced_data.head(20)

Unnamed: 0,subject_id,hadm_id,admission_type,admission_location,discharge_location,insurance,marital_status,diagnosis,has_chartevents_data,readmitted_within_30_days,...,vital_mean_223762,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,10006.0,142345.0,emergency,emergency room admission,home health care,medicare,separated,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 9 days,moderate medication diversity of 34 unique med...,moderate duration of 6.28 hours
1,10011.0,105331.0,emergency,transfer from another hospital or external fac...,deceased,private,single,hepatitis b,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 14 days,none,none
2,10013.0,165520.0,emergency,transfer from another hospital or external fac...,deceased,medicare,unknown,sepsis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,short stay of 3 days,low medication diversity of 20 unique medications,none
3,10017.0,199207.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,humeral fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 8 days,high medication diversity of 52 unique medicat...,moderate duration of 7.62 hours
4,10019.0,177759.0,emergency,transfer from another hospital or external fac...,deceased,medicare,divorced,alcoholic hepatitis,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,short stay of 1 days,moderate medication diversity of 32 unique med...,none
5,10026.0,103770.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,unknown,stroke,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,medium stay of 7 days,low medication diversity of 26 unique medications,moderate duration of 6.67 hours
6,10027.0,199395.0,elective,physician referral / normal delivery,skilled nursing facility,medicare,married,coronary artery bypass graft with mitral valve...,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 12 days,moderate medication diversity of 45 unique med...,none
7,10029.0,132349.0,emergency,emergency room admission,skilled nursing facility,medicare,divorced,syncope; telemetry,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 10 days,moderate medication diversity of 32 unique med...,moderate duration of 5.78 hours
8,10032.0,140372.0,emergency,emergency room admission,rehabilitation or distinct part hospital,medicare,widowed,right humerous fracture,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,long stay of 13 days,moderate medication diversity of 30 unique med...,moderate duration of 5.73 hours
9,10033.0,157235.0,emergency,emergency room admission,skilled nursing facility,medicare,married,renal failure-syncope-hyperkalemia,yes,no,...,37.443333,20.0,79.5,115.0,36.3,18.0,no,medium stay of 4 days,low medication diversity of 21 unique medications,moderate duration of 7.90 hours


In [None]:
balanced_data.shape

(236, 32)

In [None]:
# Check for duplicate rows in the entire DataFrame
duplicates = balanced_data[balanced_data.duplicated()]

# Count of duplicate rows
print("Number of duplicate rows:", duplicates.shape[0])

Number of duplicate rows: 0


In [None]:
# Save cleaned dataset to CSV
balanced_data.to_csv('clean_balance_data.csv', index=False)