# Preprocessing

## Import Libraries

In [1]:
########################### Import Requisite Libraries #########################
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys
import pprint

################################################################################
# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))
from python_scripts.functions import *  # import generalized custom function
import kfre

################################################################################

# Record the interpreter and core library versions used for this run
print("Python version:")
print(sys.version)
print()
for library_label, library in (("Pandas", pd), ("Numpy", np), ("KFRE", kfre)):
    print(f"{library_label} Version = {library.__version__}")

Python version:
3.8.19 (default, Mar 20 2024, 19:55:45) [MSC v.1916 64 bit (AMD64)]

Pandas Version = 2.0.3
Numpy Version = 1.24.3
KFRE Version = 0.1.6


## Read in the Data

In [2]:
# Every project folder sits one level above the 'notebooks' directory
base_path = os.path.join(os.pardir)

data_path = os.path.join(base_path, "data")
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run the directory check on the data folder and both image folders
for required_dir in (data_path, image_path_png, image_path_svg):
    ensure_directory(required_dir)

# Load the raw CSV file into a DataFrame
raw_csv_path = os.path.join(data_path, "12882_2021_2402_MOESM8_ESM.csv")
df = pd.read_csv(raw_csv_path)

Directory exists: ..\data
Directory exists: ..\images\png_images
Directory exists: ..\images\svg_images


In [3]:
# Attach patient identifiers via the project helper. A fixed seed is passed,
# presumably so the generated IDs are reproducible between runs -- TODO confirm
# against add_patient_ids in python_scripts/functions.py.
df = add_patient_ids(df=df, seed=33)  # call the function on the df to add patient_ids

In [4]:
# Drop the final two rows by position and rebind df to the trimmed frame
df = df.iloc[:-2]

In [5]:
# Preview the first rows of the trimmed frame
df.head()

Unnamed: 0_level_0,Attendance date,Age,SEX,ETHNICITY,Diabetes (1=yes; 0=no),Hypertension (1=yes; 0=no),"Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)",eGFR-EPI,uPCR,Calcium (mmol/L),Phosphate (mmol/L),Bicarbonate (mmol/L),Albumin (g/l),Follow-up YEARS,RIP,ESRD
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
568268916,11/02/2015,87.24,Male,WHITE BRITISH,1.0,1.0,1.0,19.0,33.0,2.78,0.88,27.2,37.0,5.7,,
659549633,06/05/2015,56.88,Female,WHITE BRITISH,0.0,1.0,5.0,15.0,395.0,2.43,1.02,21.3,30.0,1.5,,1.0
406748956,04/05/2016,66.53,Female,WHITE BRITISH,0.0,1.0,3.0,17.0,163.0,2.33,1.24,27.8,36.0,0.6,1.0,
109228713,16/11/2011,69.92,Male,WHITE IRISH,0.0,1.0,3.0,12.0,250.0,2.29,1.8,20.7,39.0,1.1,,1.0
220533110,02/05/2018,81.14,Female,WHITE BRITISH,1.0,1.0,1.0,15.0,217.0,2.45,1.39,26.2,43.0,2.5,,


## Standardize Attendance Date Format

In [6]:
# Parse each raw attendance date with the shared rule and store the result in a
# new column; the original "Attendance date" strings are left untouched.
# (The intermediate list of parsed dates was never used, so the column is now
# built in a single pass instead of parsing every date twice.)
df["Standardized_Date"] = df["Attendance date"].apply(parse_date_with_rule)

## Inspect Data Types and Missing Data

In [7]:
# Summarize each column, then show only the columns that contain missing values
df_inspect = data_types(df)
df_inspect.loc[df_inspect["# of Nulls"].gt(0)]

Unnamed: 0,Column/Variable,Data Type,# of Nulls,Percent Null
14,RIP,float64,523,70.0
15,ESRD,float64,312,42.0


## Convert and Standardize Attendance Date to DateTime for EDA

In [8]:
# Parse the raw day/month/year strings into a datetime column for EDA
df["Att_date"] = pd.to_datetime(df["Attendance date"], format="%d/%m/%Y")

In [9]:
# Make sure the standardized dates are true datetimes before comparing them
df["Standardized_Date"] = pd.to_datetime(df["Standardized_Date"])

# Keep only visits dated 2011-09-01 through 2015-10-31 (both ends inclusive)
filtered_df = df[df["Standardized_Date"].between("2011-09-01", "2015-10-31")]

## Outcome Definition

In [10]:
# ESRD is the outcome variable. Only an explicit 1 counts as an event; a missing
# value means the patient does not have ESRD and is coded 0.
df["ESRD"] = df["ESRD"].eq(1).astype("int64")

## Reclassify Renal Diseases by Category

In [11]:
# Numeric aetiology code -> short text label.
# NOTE(review): code 4 is labelled "APKD" here while the source column header
# spells it "ADPKD"; the label is kept as-is because it becomes part of the
# one-hot column names created later.
renal_disease_labels = {1: "DM", 2: "HTN", 3: "GN", 4: "APKD", 5: "Other"}
renal_disease_codes = df["Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)"]
df["Renal_Disease"] = renal_disease_codes.map(renal_disease_labels)

## One Hot Encoding

In [12]:
# One-hot encode sex, ethnicity, and renal disease category as 0/1 integer
# columns and append them to df
categorical_cols = ["SEX", "ETHNICITY", "Renal_Disease"]
indicator_cols = pd.get_dummies(df[categorical_cols]).astype(int)
df = df.assign(**indicator_cols)

## Convert Specific Variables from mmol/L to mg/g and/or g/dL

The conversion of uPCR from mg/mmol to mg/g involves understanding that both mg/mmol and mg/g are ratios that can be related through their units.

* mg/mmol is a ratio of mass (in milligrams) to amount of substance (in millimoles), while
* mg/g is a ratio of mass (in milligrams) to mass (in grams).

To convert mg/mmol to mg/g, we need to know the molar mass of creatinine, because uPCR is the ratio of the mass of protein to the mass of creatinine. The molar mass of creatinine is approximately 113.12 g/mol. Therefore, 1 mmol of creatinine is 113.12 mg.

Here's the conversion:

1 mg/mmol means that you have 1 mg of protein for every 1 mmol of creatinine. Since 1 mmol of creatinine weighs 113.12 mg (0.11312 g), the same ratio can be written with a mass in the denominator:

$$\frac{\text{1 mg protein}}{\text{1 mmol creatinine}} = \frac{\text{1 mg protein}}{\text{113.12 mg creatinine}} = \frac{\text{1 mg protein}}{\text{0.11312 g creatinine}}$$

Because mg/g expresses the ratio per 1 g (1000 mg) of creatinine rather than per 0.11312 g, we divide through by 0.11312. In other words, multiplying a uPCR value in mg/mmol by 1/0.11312 gives the value in mg/g:

$$\frac{\text{1 mg protein}}{\text{0.11312 g creatinine}} \approx 8.84  {\text{ mg/g}}$$

In [13]:
from kfre import perform_conversions  # use kfre library to perform conversions

In [14]:
# Convert lab values to the units used downstream via kfre's perform_conversions.
# NOTE(review): "uPCR_mmol", "calcium_mmol" and "albumin_g_per_l" are not column
# names in df (see the column index printed later in this notebook). The log
# printed by this call shows 'uPCR', 'Calcium (mmol/L)', 'Phosphate (mmol/L)'
# and 'Albumin (g/l)' being converted, so convert_all=True presumably locates
# the columns itself and the *_col arguments have no effect here -- confirm
# against the kfre documentation before relying on them.
df = perform_conversions(
    df=df,
    reverse=False,
    upcr_col="uPCR_mmol",
    calcium_col="calcium_mmol",
    albumin_col="albumin_g_per_l",
    convert_all=True,
)

Converted 'uPCR' to new column 'uPCR_mg_g' with factor 8.84016973125884
Converted 'Calcium (mmol/L)' to new column 'Calcium_mg_dl' with factor 4
Converted 'Phosphate (mmol/L)' to new column 'Phosphate_mg_dl' with factor 3.1
Converted 'Albumin (g/l)' to new column 'Albumin_g_dl' with factor 0.1


## Validate `uPCR` to `uACR` Conversion

The `kfre` library has a built-in `upcr_uacr` function for converting `uPCR` to `uACR`.

In [15]:
from kfre import upcr_uacr  # use kfre library to perform conversions

In [16]:
# Derive uACR from the converted uPCR (mg/g) column with kfre's upcr_uacr,
# passing the sex, diabetes, and hypertension columns alongside it.
# "Female" is the value in the SEX column that identifies female patients.
df["uACR"] = upcr_uacr(
    df=df,
    sex_col="SEX",
    diabetes_col="Diabetes (1=yes; 0=no)",
    hypertension_col="Hypertension (1=yes; 0=no)",
    upcr_col="uPCR_mg_g",
    female_str="Female",
)

In [17]:
# Build a small, readable frame for spot-checking the uPCR -> uACR conversion:
# select the relevant columns, shorten the status column names, and replace
# the 0/1 flags with descriptive labels.
status_renames = {
    "Hypertension (1=yes; 0=no)": "Hypertension Status",
    "Diabetes (1=yes; 0=no)": "Diabetes Status",
}
validation_cols = ["SEX", "uPCR_mg_g", "uACR", *status_renames]

upcr_uacr_validation = (
    df[validation_cols]
    .rename(columns=status_renames)
    .assign(
        **{
            "Diabetes Status": lambda frame: frame["Diabetes Status"].map(
                {1: "Diabetic", 0: "Not Diabetic"}
            ),
            "Hypertension Status": lambda frame: frame["Hypertension Status"].map(
                {1: "Hypertensive", 0: "Not Hypertensive"}
            ),
        }
    )
)

In [18]:
# Preview the first rows of the validation frame for manual spot checks
upcr_uacr_validation.head()

Unnamed: 0_level_0,SEX,uPCR_mg_g,uACR,Hypertension Status,Diabetes Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
568268916,Male,291.725601,102.438624,Hypertensive,Diabetic
659549633,Female,3491.867044,1762.039423,Hypertensive,Not Diabetic
406748956,Female,1440.947666,659.136129,Hypertensive,Not Diabetic
109228713,Male,2210.042433,1145.245058,Hypertensive,Not Diabetic
220533110,Female,1918.316832,980.939665,Hypertensive,Diabetic


We can validate the results of this method by plugging each individual `uPCR (mg/g)` value into the online calculator below to confirm that the results match up. 

**`uPCR` to `uACR` Calculator:**

https://ckdpcrisk.org/pcr2acr_adj/

In [19]:
# Median of the derived uACR column, for comparison with the published median
df["uACR"].median()

404.65034813307517

Ali et al. reported a median `uACR` of 409. The small difference of 4 units between that value and our median of 405 can be attributed to rounding differences. Each individual conversion of `uPCR` to `uACR` in the above dataframe matches the result from the online calculator referenced above.

## Validate Bicarbonate

In [20]:
# Median of the bicarbonate column as supplied in the source data (it is not
# among the columns converted above), for comparison with the published value
med_bicarb = df["Bicarbonate (mmol/L)"].median()
print(f"Median Bicarbonate = {med_bicarb}")

Median Bicarbonate = 21.8


Ali et al. reported a median `Bicarbonate` of `21.8 mEq/L`. This is exactly what we see here.

## Binning and Creating Labels

In [21]:
# Bin edges: under 18, 18 up to 30, then one bin per decade up to 100, then 100+
bin_ages = [0, 18, *range(30, 101, 10), float("inf")]

# One label per bin; the decade labels are generated from their lower edge
label_ages = (
    ["< 18", "18-29"]
    + [f"{decade}-{decade + 9}" for decade in range(30, 100, 10)]
    + ["100 +"]
)

In [22]:
# Assign each patient to an age band. right=False makes every bin left-closed,
# e.g. [18, 30), so the bins agree with the labels: an age of exactly 30 falls
# in "30-39". With the default right=True it would be labelled "18-29", and an
# age of exactly 18 would be labelled "< 18".
df["age_group"] = pd.cut(df["Age"], bins=bin_ages, labels=label_ages, right=False)

In [23]:
# List every column now in df, including the derived and one-hot encoded ones
pprint.pp(df.columns)

Index(['Attendance date', 'Age', 'SEX', 'ETHNICITY', 'Diabetes (1=yes; 0=no)',
       'Hypertension (1=yes; 0=no)',
       'Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)', 'eGFR-EPI',
       'uPCR', 'Calcium (mmol/L)', 'Phosphate (mmol/L)',
       'Bicarbonate (mmol/L)', 'Albumin (g/l)', 'Follow-up YEARS', 'RIP',
       'ESRD', 'Standardized_Date', 'Att_date', 'Renal_Disease', 'SEX_Female',
       'SEX_Male', 'ETHNICITY_AFRICAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_ANY OTHER ASIAN BACKGROUND',
       'ETHNICITY_ANY OTHER BLACK BACKGROUND',
       'ETHNICITY_ANY OTHER ETHNIC GROUP',
       'ETHNICITY_ANY OTHER MIXED BACKGROUND',
       'ETHNICITY_ANY OTHER WHITE BACKGROUND',
       'ETHNICITY_BANGLADESHI (ASIAN OR ASIAN BRITISH)',
       'ETHNICITY_CARIBBEAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_CHINESE (OTHER ETHNIC GROUPS)',
       'ETHNICITY_INDIAN (ASIAN OR ASIAN BRITISH)', 'ETHNICITY_NOT STATED',
       'ETHNICITY_PAKISTANI (ASIAN OR ASIAN BRITISH)',
       'ETH

## Prepare Dataframes for Output

In [24]:
# Keep a deep copy of the fully worked frame (all dtypes) for further EDA
df_eda = df.copy(deep=True)

# Reduce df to its numeric columns only: the finalized preprocessed frame
# that can be used for ML
df = df.select_dtypes(include=np.number)

## Save Dataframes to Path

In [25]:
# Write both frames to the data folder as parquet files
eda_parquet_path = os.path.join(data_path, "df_eda.parquet")
ml_parquet_path = os.path.join(data_path, "df.parquet")

df_eda.to_parquet(eda_parquet_path)  # full frame for EDA
df.to_parquet(ml_parquet_path)  # numeric-only preprocessed frame

## References

Ali, I., Donne, R. L., & Kalra, P. A. (2021). A validation study of the kidney failure risk equation in advanced chronic kidney disease according to disease aetiology with evaluation of discrimination, calibration and clinical utility. *BMC Nephrology, 22(1),* 194.  doi: 10.1186/s12882-021-02402-1

Sumida K, Nadkarni GN, Grams ME, Sang Y, Ballew SH, Coresh J, Matsushita K, Surapaneni A, Brunskill N, Chadban SJ, Chang AR, Cirillo M, Daratha KB, Gansevoort RT, Garg AX, Iacoviello L, Kayama T, Konta T, Kovesdy CP, Lash J, Lee BJ, Major RW, Metzger M, Miura K, Naimark DMJ, Nelson RG, Sawhney S, Stempniewicz N, Tang M, Townsend RR, Traynor JP, Valdivielso JM, Wetzels J, Polkinghorne KR, Heerspink HJL, for the Chronic Kidney Disease Prognosis Consortium. (2020). Conversion of urine protein-creatinine ratio or urine dipstick protein to urine albumin-creatinine ratio for use in chronic kidney disease screening and prognosis. *Ann Intern Med,* *173*(6), 426-435. doi: 10.7326/M20-0529.

Tangri, N., Stevens, L. A., Griffith, J., Tighiouart, H., Djurdjev, O., Naimark, D., Levin, A., & Levey, A. S. (2011). A predictive model for progression of chronic kidney disease to kidney failure. *JAMA,* *305*(15), 1553-1559. doi: 10.1001/jama.2011.451.  

Tangri N, Grams ME, Levey AS, Coresh J, Appel LJ, Astor BC, Chodick G, Collins AJ, Djurdjev O, Elley CR, Evans M, Garg AX, Hallan SI, Inker LA, Ito S, Jee SH, Kovesdy CP, Kronenberg F, Heerspink HJL, Marks A, Nadkarni GN, Navaneethan SD, Nelson RG, Titze S, Sarnak MJ, Stengel B, Woodward M, Iseki K, for the CKD Prognosis Consortium. (2016). Multinational assessment of accuracy of equations for predicting risk of kidney failure: A meta-analysis. *JAMA,* *315*(2), 164–174. doi: 10.1001/jama.2015.18202.

