In [83]:
import pandas as pd

from pathlib import Path

input_file = Path("../data/raw/healthcare_dataset.csv")
output_file = Path("../data/processed/healthcare_dataset.csv")
if not output_file.exists():
    output_file.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(input_file)

# apparently poor randomization in synthetic generation 
# where 5.5k records are duplicates with different Age
# (it would be ok as typo representation, but not 10% of the dataset with the same problem)
cols_to_check = df.columns.drop('Age').tolist()
df = df.drop_duplicates(subset=cols_to_check)

df["Name"] = df["Name"].str.title()
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
# since we assume that we have age at the date of admission
df["Year of Birth"] = pd.to_datetime(
    df["Date of Admission"] - df["Age"].apply(lambda x: pd.DateOffset(years=x))
).dt.year
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])
df["Room Number"] = df["Room Number"].astype(int)

# to distinguish persons let's add id based on unique features that we have
df["Patient ID"] = pd.factorize(
    df[["Name", "Year of Birth", "Gender", "Blood Type"]]
    .astype(str)
    .agg("-".join, axis=1)
)[0]

df.drop_duplicates(inplace=True)
df.rename({"Age": "Age on Admission Date"}, axis=1)
df.rename(columns=lambda x: x.replace(" ", "_"), inplace=True)

columns = list(df.columns)
columns.insert(0, columns.pop())
columns.insert(2, columns.pop())

df = df[columns]
df.reset_index(drop=True, inplace=True)

df.to_csv(output_file, index=False)

  df["Date of Admission"] - df["Age"].apply(lambda x: pd.DateOffset(years=x))


In [84]:
df.head()

Unnamed: 0,Patient_ID,Name,Year_of_Birth,Age,Gender,Blood_Type,Medical_Condition,Date_of_Admission,Doctor,Hospital,Insurance_Provider,Billing_Amount,Room_Number,Admission_Type,Discharge_Date,Medication,Test_Results
0,0,Bobby Jackson,1994,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,1,Leslie Terry,1957,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,2,Danny Smith,1946,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,3,Andrew Watts,1992,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,4,Adrienne Bell,1979,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [85]:
df.shape

(50000, 17)

In [None]:
from ats.db_connector import load_df

In [None]:
df = load_df(output_file)

df.info()