In [3]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm


In [37]:
# 1.1 Data Transformation

def process_patient_file(file_path):
    """
    Read one patient file and resample its measurements onto an hourly grid.

    The file is a long-format CSV with ``Time`` ("HH:MM"), ``Parameter`` and
    ``Value`` columns. Measurements are pivoted to one column per parameter,
    linearly interpolated in time, and sampled at minutes 0, 60, ..., 2880.

    Parameters
    ----------
    file_path : str or path-like
        Path to a single patient file.

    Returns
    -------
    pandas.DataFrame
        49 rows (one per grid point). ``Time`` is relabelled as
        ``minutes // 60 + 1`` (1..49). Contains one column per non-static
        parameter, the static variables (Age, Gender, Height, ICUType)
        broadcast to every row, and a ``PatientID`` column. Grid points that
        precede a parameter's first observation stay NaN; grid points after
        its last observation carry the last observed value.

    Raises
    ------
    IndexError
        If the file contains no ``RecordID`` row.
    """
    static_params = ["Age", "Gender", "Height", "ICUType"]
    hourly_grid = pd.Index(range(0, 2881, 60), name="Time")  # minutes 0..2880

    df = pd.read_csv(file_path)

    # Convert "HH:MM" to minutes. The hour part is an elapsed-hours counter
    # (the grid runs to 48:00), not a clock hour.
    hh_mm = df["Time"].str.split(":", expand=True)
    df["Time"] = hh_mm[0].astype(int) * 60 + hh_mm[1].astype(int)

    # The pivot below needs a unique (Time, Parameter) key, so repeated entries
    # for the same parameter at the same minute are collapsed to the last one.
    df = df.drop_duplicates(subset=["Time", "Parameter"], keep="last")  # TODO: rethink this step

    # Extract PatientID
    patient_id = df.loc[df["Parameter"] == "RecordID", "Value"].values[0]

    # Static variables are taken from their entries recorded at 00:00.
    is_static = df["Parameter"].isin(static_params)
    static_rows = df[is_static & (df["Time"] == 0)]
    static_dict = dict(zip(static_rows["Parameter"], static_rows["Value"]))

    # Pivot the remaining rows: one row per observed minute, one column per
    # parameter. The RecordID row is not in the static list, so it also becomes
    # a column here.
    wide = df[~is_static].pivot(index="Time", columns="Parameter", values="Value")

    # Resample to the hourly grid. Interpolation runs over the union of the
    # observed minutes and the grid points, weighted by the real time gaps
    # (method="index"), so off-hour measurements contribute to the hourly
    # values. Reindexing straight to the grid first would discard every
    # measurement not taken exactly on a whole hour.
    wide = (
        wide.reindex(wide.index.union(hourly_grid))
        .interpolate(method="index")
        .reindex(hourly_grid)
        .rename_axis(index="Time")
        .reset_index()
    )

    # Relabel grid minutes as 1-based hour numbers (0 -> 1, ..., 2880 -> 49).
    wide["Time"] = (wide["Time"] // 60) + 1  # TODO double check this solution to rounding up time

    # Add static variables
    for key, value in static_dict.items():
        wide[key] = value

    # Add PatientID
    wide["PatientID"] = patient_id

    return wide

def process_dataset(set_folder):
    """
    Process all patient files in a given set (set-a, set-b, set-c).

    Every ``*.txt`` file directly inside ``set_folder`` is passed through
    ``process_patient_file`` and the per-patient frames are stacked.

    Parameters
    ----------
    set_folder : str or path-like
        Folder containing the patient ``*.txt`` files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-patient frames with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``*.txt`` files.
    """
    # Sort so the row order of the result does not depend on the order in
    # which the filesystem happens to list the directory.
    files = sorted(Path(set_folder).glob("*.txt"))
    if not files:
        raise ValueError(f"No patient files (*.txt) found in {set_folder}")

    all_data = [process_patient_file(patient_file) for patient_file in tqdm(files)]

    return pd.concat(all_data, ignore_index=True)

def main(data_path):
    """
    Export the outcome columns of every ``*.txt`` file in ``data_path`` to Parquet.

    Each file is read as CSV, reduced to its ``RecordID`` and
    ``In-hospital_death`` columns, and written to
    ``<cwd>/../../data/<name>.parquet``, where ``<name>`` is the file name up
    to its first dot. The output directory is created if it does not exist.

    The per-set time-series export (set-a, set-b, set-c via
    ``process_dataset``) is currently disabled and kept below as
    commented-out code.

    Parameters
    ----------
    data_path : str or path-like
        Folder containing the ``*.txt`` files to export.
    """
    # for set_name in ["set-a", "set-b", "set-c"]:
    #     set_path = os.path.join(data_path, set_name)
    #     print(f"Processing {set_name}...")
    #     df = process_dataset(set_path)
    #     save_path = "/".join(os.getcwd().split("/")[:-2] + ["data"]) # move two dirs out and into data (writable)
    #     parquet_file = os.path.join(save_path, f"{set_name}.parquet")
    #     df.to_parquet(parquet_file, engine="pyarrow")
    #     print(f"Saved {set_name} to {parquet_file}")

    # Move two directories out of the working directory and into data (writable).
    # Built with pathlib rather than by splitting on "/", so it does not depend
    # on the platform's path separator.
    save_dir = Path.cwd().parent.parent / "data"
    save_dir.mkdir(parents=True, exist_ok=True)

    for file_path in Path(data_path).glob("*.txt"):
        # File name up to its first dot, e.g. "name.txt" -> "name".
        base_name = file_path.name.split(".")[0]
        save_path = save_dir / (base_name + ".parquet")
        pd.read_csv(file_path)[["RecordID", "In-hospital_death"]].to_parquet(save_path, engine="pyarrow")
        
        

# Example usage: run the export over the data directory. The path is relative,
# so it resolves against the notebook's current working directory.
data_directory = "../../../ml4h_data/p1"
main(data_directory)


In [30]:
# Scratch check of the `minutes // 60 + 1` hour formula: minute 5 maps to hour 1.
5//60 + 1

1