# Exploratory Data Analysis

## Import Requisite Libraries

In [1]:
########################### Import Requisite Libraries #########################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import pprint
import warnings
import textwrap
import kfre
from kfre import *

################################################################################
# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))
from py_scripts.functions import *

################################################################################
# Report the interpreter and key library versions for reproducibility
version_report = [
    "Python version:",
    sys.version,
    "",
    f"Pandas Version = {pd.__version__}",
    f"Seaborn Version = {sns.__version__}",
    f"Numpy Version = {np.__version__}",
    f"KFRE Version = {kfre.__version__}",
]
print("\n".join(version_report))

Python version:
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]

Pandas Version = 2.2.2
Seaborn Version = 0.13.2
Numpy Version = 1.26.4
KFRE Version = 0.1.8


## Set Paths

In [2]:
# Project root: one level up from the 'notebooks' folder
base_path = os.pardir

# Input data and derived-output folders sit directly under the project root
data_path = os.path.join(base_path, "data")
data_output = os.path.join(base_path, "data_output")

# Figures are written per format under 'images'
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run the directory check on every folder this notebook uses
for directory in (data_path, data_output, image_path_png, image_path_svg):
    ensure_directory(directory)

# Load the source table and index it by patient identifier
kfre_csv_path = os.path.join(data_path, "KFRE.csv")
df = pd.read_csv(kfre_csv_path).set_index("Patient_ID")

Directory exists: ..\data
Directory exists: ..\data_output
Directory exists: ..\images\png_images
Directory exists: ..\images\svg_images


In [3]:
df.head(n=5)  # preview the first five rows of the loaded data

Unnamed: 0_level_0,Age,SEX,HTN,DM,GFR,ACR,Ca,P,Alb,TCO2,ESRD,ESRD_dur,sex_cat,Age_Group
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
723027400,61,1,1,0,9.148234,10.0,6.0,4.0,2.6,16.0,0,2086,Male,60-69
904532344,30,0,1,1,153.974871,29.0,9.4,4.5,4.0,24.0,0,2357,Female,30-39
890205650,55,1,0,0,66.810414,25.774259,9.2,1.6,4.3,18.0,0,265,Male,50-59
317865962,35,1,0,0,89.34706,28.451303,10.5,2.9,5.0,26.0,0,700,Male,30-39
968356501,66,1,1,0,7.316171,0.0,7.7,6.5,3.5,14.0,1,3,Male,60-69


In [4]:
# Append KFRE risk predictions for the 4-, 6-, and 8-variable models at the
# 2- and 5-year horizons (columns named like 'kfre_4var_2year').
df = add_kfre_risk_col(
    df=df,
    age_col="Age",
    sex_col="sex_cat",
    eGFR_col="GFR",
    uACR_col="ACR",
    dm_col="DM",
    htn_col="HTN",
    albumin_col="Alb",
    phosphorous_col="P",
    bicarbonate_col="TCO2",
    calcium_col="Ca",
    num_vars=[4, 6, 8],
    years=(2, 5),
    is_north_american=False,
    # NOTE(review): copy=True is expected to make kfre work on a copy rather
    # than mutate the input frame; the result is rebound to `df` here.
    # Confirm against the kfre documentation for the pinned version.
    copy=True,
)
# The resulting DataFrame 'df' now includes new columns with risk
# predictions for each model and time frame.
#
# Suppress scientific notation at display time only. The values themselves
# stay numeric so this cell can be re-run (the input columns are not turned
# into strings) and the predictions remain usable in later calculations and
# are exported as numbers.
pd.set_option("display.float_format", "{:.6f}".format)

In [5]:
# Select every column whose label contains "kfre" (the risk predictions).
# NOTE(review): this passes a DataFrame, not a list of labels, as `columns`;
# confirm that highlight_columns only needs the column labels.
columns_to_highlight = df.filter(like="kfre")

# Style the first rows, shading the prediction columns
preview_rows = df.head()
df_styled = highlight_columns(preview_rows, columns=columns_to_highlight, color="brown")

# Display the styled DataFrame
df_styled

Unnamed: 0_level_0,Age,SEX,HTN,DM,GFR,ACR,Ca,P,Alb,TCO2,ESRD,ESRD_dur,sex_cat,Age_Group,kfre_4var_2year,kfre_4var_5year,kfre_6var_2year,kfre_6var_5year,kfre_8var_2year,kfre_8var_5year
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
723027400,61,1,1,0,9.148234,10.0,6.0,4.0,2.6,16.0,0,2086,Male,60-69,0.12184,0.395347,0.131929,0.415467,0.581066,0.980034
904532344,30,0,1,1,153.974871,29.0,9.4,4.5,4.0,24.0,0,2357,Female,30-39,0.0,0.0,0.0,0.0,0.0,1e-06
890205650,55,1,0,0,66.810414,25.774259,9.2,1.6,4.3,18.0,0,265,Male,50-59,0.00037,0.001432,0.000362,0.001374,0.000585,0.002628
317865962,35,1,0,0,89.34706,28.451303,10.5,2.9,5.0,26.0,0,700,Male,30-39,4.9e-05,0.000189,4.9e-05,0.000184,4.5e-05,0.000202
968356501,66,1,1,0,7.316171,0.0,7.7,6.5,3.5,14.0,1,3,Male,60-69,9.9e-05,0.000385,9.9e-05,0.000377,0.004638,0.020696


## Save Out The Results

In [6]:
# Write the predictions table, index included, to CSV in the data folder
kfre_preds_csv = os.path.join(data_path, "kfre_preds.csv")
df.to_csv(kfre_preds_csv)

In [7]:
# Write the predictions table to Excel. The index (Patient_ID) is kept so
# every row stays identifiable, consistent with the CSV export of this table.
df.to_excel(os.path.join(data_path, "kfre_preds.xlsx"))

## Challenges in Predicting KFRE with GFR Less Than 10

Predicting kidney failure risk with the Kidney Failure Risk Equation (KFRE) for patients with an estimated glomerular filtration rate (eGFR) less than 10 ml/min/1.73 m² can be challenging due to several factors:

1. **Data Sparsity**: Patients with eGFR less than 10 ml/min/1.73 m² are relatively rare in the general population compared to those with higher eGFR values. This means there is less data available to develop and validate predictive models for this group.

2. **Rapid Disease Progression**: At such low eGFR levels, patients are typically very close to requiring dialysis or a kidney transplant. The disease progression in these patients can be more rapid and variable, making it harder to predict outcomes accurately.

3. **Clinical Interventions**: Patients with eGFR less than 10 ml/min/1.73 m² are often under close medical supervision and may receive interventions that can alter the natural course of the disease, such as the initiation of dialysis or changes in medication. These interventions can add variability to the outcomes, complicating predictions.

4. **Physiological Changes**: At very low eGFR levels, the body's physiology can undergo significant changes, which may not be fully captured by the variables included in the KFRE. This can reduce the model's predictive accuracy.

Despite these challenges, the KFRE can still provide valuable information for patients with very low eGFR, but it is important to interpret the results with caution and in the context of other clinical information. It is always recommended to consult with healthcare professionals for personalized medical advice and decision-making.


## Performance Assessment

1. Define Truth for `2-Year` and `5-Year` Outcomes    
2. Extract the true labels for the `2-year` and `5-year` outcomes from the DataFrame `df`.

3. Assign the true labels for the `2-year` outcome to `y_true_2_yr` and for the `5-year` outcome to `y_true_5_yr`.
4. Combine these true labels into a list `y_true`.