# Preprocessing

## Import Libraries

In [1]:
########################### Import Requisite Libraries #########################
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys
import pprint

################################################################################
# Add the parent directory to sys.path so the 'python_scripts' package
# (python_scripts.functions and python_scripts.kfre) can be imported from here
sys.path.append(os.path.join(os.pardir))
# NOTE(review): wildcard imports hide which module each helper used below comes
# from; consider importing the needed names explicitly once they are confirmed.
from python_scripts.functions import *  # import generalized custom functions
from python_scripts.kfre import *  # import kfre-specific methods

## Read in the Data

In [2]:
# Project root: one level up from the 'notebooks' folder
base_path = os.path.join(os.pardir)

# Data and image output locations, all expressed relative to the project root
data_path = os.path.join(base_path, "data")
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run the project helper on each directory (data first, then the image folders)
# so they are confirmed to be available before anything is read or written
for directory in (data_path, image_path_png, image_path_svg):
    ensure_directory(directory)

# Load the source CSV into a DataFrame
csv_file = os.path.join(data_path, "12882_2021_2402_MOESM8_ESM.csv")
df = pd.read_csv(csv_file)

Directory exists: ..\data
Directory exists: ..\images\png_images
Directory exists: ..\images\svg_images


In [3]:
# Attach patient identifiers using the project helper; the fixed seed presumably
# keeps the generated IDs reproducible between runs — confirm in functions.py
df = add_patient_ids(df=df, seed=33)  # call the function on the df to add patient_ids

In [4]:
# Drop the last two rows by position and reassign the result to df
# (presumably non-data trailer rows in the CSV — confirm against the source file).
# .iloc makes the positional intent explicit, and .copy() gives df its own data so
# the column assignments in later cells do not raise SettingWithCopyWarning.
# NOTE: re-running this cell drops two more rows; run it once per fresh kernel.
df = df.iloc[:-2].copy()

## Standardize Attendance Date Format

In [5]:
# Parse every raw attendance date exactly once with the project's rule-based
# parser and reuse the result (previously the column was parsed twice and the
# first result was never used).
date_strings = df["Attendance date"].to_list()
standardized_dates_with_rule = [parse_date_with_rule(date) for date in date_strings]

# Store the parsed values as a new column; the list is in row order, so it
# lines up positionally with df
df["Standardized_Date"] = standardized_dates_with_rule

## Inspect Data Types and Missing Data

In [6]:
# Summarize each column with the project helper, then show only the rows of
# that summary whose null count is above zero
df_inspect = data_types(df)
has_missing = df_inspect["# of Nulls"].gt(0)
df_inspect.loc[has_missing]

Unnamed: 0,Column/Variable,Data Type,# of Nulls,Percent Null
14,RIP,float64,523,70.0
15,ESRD,float64,312,42.0


## Convert and Standardize Attendance Date to DateTime for EDA

In [7]:
# Parse the raw "Attendance date" strings as day/month/year into a datetime
# column; with an explicit format, values that do not match it raise an error
df["Att_date"] = pd.to_datetime(df["Attendance date"], format="%d/%m/%Y")

In [8]:
# Make sure the standardized dates are true datetimes before comparing them
df["Standardized_Date"] = pd.to_datetime(df["Standardized_Date"])

# Keep rows dated from 2011-09-01 through 2015-10-31 (both ends inclusive);
# df itself is left unfiltered
in_window = df["Standardized_Date"].between("2011-09-01", "2015-10-31")
filtered_df = df[in_window]

## Outcome Definition

In [9]:
# ESRD is the outcome variable: flag 1 where the recorded value equals 1; every
# other value, including missing, becomes 0 (missing is read as "does not have")
reached_esrd = df["ESRD"].eq(1)
df["ESRD"] = reached_esrd.astype("int64")

## Reclassify Renal Diseases by Category

In [10]:
# Translate the numeric renal-disease codes into short category labels.
# NOTE(review): code 4 is labelled "APKD" here while the source column header
# spells it "ADPKD"; the label is kept as-is because it determines the name of
# the one-hot column created from it.
renal_disease_labels = {1: "DM", 2: "HTN", 3: "GN", 4: "APKD", 5: "Other"}
renal_disease_codes = df["Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)"]
df["Renal_Disease"] = renal_disease_codes.map(renal_disease_labels)

## One Hot Encoding

In [11]:
# One-hot encode sex, ethnicity, and renal disease category as integer
# indicator columns, then add them to df alongside the original columns
categorical_columns = ["SEX", "ETHNICITY", "Renal_Disease"]
indicator_columns = pd.get_dummies(df[categorical_columns]).astype(int)
df = df.assign(**indicator_columns)

## Convert Specific Variables from mmol/L to mg/g and/or g/dL

In [12]:
# Map each conversion key understood by RiskPredictor to the df column that
# holds the corresponding measurement
conversion_columns = dict(
    uPCR_mmol="uPCR",
    calcium_mmol="Calcium (mmol/L)",
    phosphate_mmol="Phosphate (mmol/L)",
    albumin_g_per_l="Albumin (g/l)",
)

# Run the unit conversions on df.
# NOTE(review): presumably this adds the converted columns to df in place (a
# later cell reads "uPCR (mg/g)") — confirm in python_scripts/kfre.py
predictor = RiskPredictor(data=df, columns=conversion_columns)
predictor.perform_conversions()

## uPCR to uACR

In [13]:
# Column names and the female marker that uPCR_to_uACR needs for each row
uacr_settings = {
    "sex_col": "SEX",
    "diabetes_col": "Diabetes (1=yes; 0=no)",
    "hypertension_col": "Hypertension (1=yes; 0=no)",
    "uPCR_col": "uPCR (mg/g)",
    "female_str": "Female",
}

# Derive uACR row by row from uPCR using the project's conversion helper
df["uACR"] = df.apply(
    lambda row: uPCR_to_uACR(row=row, **uacr_settings),
    axis=1,
)

## Binning and Creating Labels

In [14]:
# Age-bin edges: one "under 18" bin, one bin for 18 up to 30, then one bin per
# decade up to 100, and an open-ended final bin
bin_ages = [0, 18, *range(30, 101, 10), float("inf")]

# One label per bin, in the same order as the edges above
label_ages = (
    ["< 18", "18-29"]
    + [f"{start}-{start + 9}" for start in range(30, 100, 10)]
    + ["100 +"]
)

In [15]:
# Assign each patient to an age group.
# right=False makes every bin left-closed / right-open ([0, 18), [18, 30), ...),
# so boundary ages land in the bin their label describes: 18 -> "18-29",
# 30 -> "30-39", 100 -> "100 +". With pandas' default (right=True) an age of
# exactly 18 would be labelled "< 18" and an age of 0 would fall outside all bins.
df["age_group"] = pd.cut(df["Age"], bins=bin_ages, labels=label_ages, right=False)

In [16]:
# Print the column index of df to review which columns are present at this point
pprint.pp(df.columns)

Index(['Attendance date', 'Age', 'SEX', 'ETHNICITY', 'Diabetes (1=yes; 0=no)',
       'Hypertension (1=yes; 0=no)',
       'Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)', 'eGFR-EPI',
       'uPCR', 'Calcium (mmol/L)', 'Phosphate (mmol/L)',
       'Bicarbonate (mmol/L)', 'Albumin (g/l)', 'Follow-up YEARS', 'RIP',
       'ESRD', 'Standardized_Date', 'Att_date', 'Renal_Disease', 'SEX_Female',
       'SEX_Male', 'ETHNICITY_AFRICAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_ANY OTHER ASIAN BACKGROUND',
       'ETHNICITY_ANY OTHER BLACK BACKGROUND',
       'ETHNICITY_ANY OTHER ETHNIC GROUP',
       'ETHNICITY_ANY OTHER MIXED BACKGROUND',
       'ETHNICITY_ANY OTHER WHITE BACKGROUND',
       'ETHNICITY_BANGLADESHI (ASIAN OR ASIAN BRITISH)',
       'ETHNICITY_CARIBBEAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_CHINESE (OTHER ETHNIC GROUPS)',
       'ETHNICITY_INDIAN (ASIAN OR ASIAN BRITISH)', 'ETHNICITY_NOT STATED',
       'ETHNICITY_PAKISTANI (ASIAN OR ASIAN BRITISH)',
       'ETH

## Prepare Dataframes for Output

In [17]:
# Keep an independent copy of the full working frame (all dtypes) for further EDA
df_eda = df.copy()
# Narrow df to its numeric columns only; this is the finalized preprocessed
# frame intended for ML
df = df.select_dtypes(include=np.number)

## Save Dataframes to Path

In [18]:
# Destination files inside the data directory
eda_parquet_path = os.path.join(data_path, "df_eda.parquet")
numeric_parquet_path = os.path.join(data_path, "df.parquet")

df_eda.to_parquet(eda_parquet_path)  # save the EDA frame
df.to_parquet(numeric_parquet_path)  # save the preprocessed numeric frame