In [None]:
import pandas as pd

from pathlib import Path

# Clean the raw healthcare dataset and write the processed copy to disk.
#
# Steps: normalise names and dtypes, derive a year of birth, assign a
# surrogate patient id, drop exact duplicate rows, rename columns to
# snake-ish case (spaces -> underscores), reorder, and save as CSV.
input_file = Path("../data/raw/healthcare_dataset.csv")
output_file = Path("../data/processed/healthcare_dataset.csv")
# `exist_ok=True` already makes this a no-op when the folder is present,
# so no extra existence check is needed.
output_file.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(input_file)
df["Name"] = df["Name"].str.title()
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])
df["Room Number"] = df["Room Number"].astype(int)

# "Age" is assumed to be the age on the admission date, so the year of birth
# is simply the admission year minus the age. This is vectorised and yields
# the same year as subtracting a per-row `pd.DateOffset(years=age)`.
df["Year of Birth"] = df["Date of Admission"].dt.year - df["Age"]

# There is no natural key in the data, so build a surrogate id: rows sharing
# the same name, year of birth, gender and blood type are treated as the same
# person and receive the same integer code.
patient_key = (
    df[["Name", "Year of Birth", "Gender", "Blood Type"]]
    .astype(str)
    .agg("-".join, axis=1)
)
df["Patient ID"] = pd.factorize(patient_key)[0]

# `rename` returns a new frame, so its result must be assigned for the
# "Age" -> "Age on Admission Date" rename to actually take effect.
df = (
    df.drop_duplicates()
    .rename(columns={"Age": "Age on Admission Date"})
    .rename(columns=lambda name: name.replace(" ", "_"))
)

# Put the id first and the year of birth at position 2; every other column
# keeps its original relative order.
derived = ("Patient_ID", "Year_of_Birth")
columns = [name for name in df.columns if name not in derived]
columns.insert(0, "Patient_ID")
columns.insert(2, "Year_of_Birth")

df = df[columns].reset_index(drop=True)

df.to_csv(output_file, index=False)

In [None]:
# Preview the first five rows of the current frame.
df.head(n=5)

In [None]:
from ats.db_connector import load_df

In [None]:
# Rebind `df` to whatever the project helper `load_df` returns for the
# processed CSV path.
# NOTE(review): `load_df` comes from `ats.db_connector`; presumably it loads
# the file (possibly via a database) and returns a DataFrame — confirm.
df = load_df(output_file)

# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()