# Preprocessing

## Import Libraries

In [1]:
########################### Import Requisite Libraries #########################
import pandas as pd
import numpy as np
import os
import sys
import pprint

################################################################################
# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))
from python_scripts.functions import *

## Read in the Data

In [2]:
# Project root: one level up from the 'notebooks' folder
base_path = os.pardir

# Data and image locations, all expressed relative to the project root
data_path = os.path.join(base_path, "data")
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run the project helper on each folder (it prints one status line per path)
ensure_directory(data_path)
ensure_directory(image_path_png)
ensure_directory(image_path_svg)

# Load the raw CSV file into a DataFrame
df = pd.read_csv(os.path.join(data_path, "12882_2021_2402_MOESM8_ESM.csv"))

Directory exists: ..\data
Directory exists: ..\images\png_images
Directory exists: ..\images\svg_images


In [3]:
# Attach patient identifiers using the project helper with a fixed seed (33).
# NOTE(review): the df.head() output below shows a Patient_ID index afterwards; the
# helper's exact behaviour lives in python_scripts/functions.py — confirm there,
# including whether it is safe to re-run this cell on an already-processed df.
df = add_patient_ids(df=df, seed=33)  # call the function on the df to add patient_ids

In [4]:
# Preview the first rows to sanity-check the loaded data
df.head()

Unnamed: 0_level_0,Attendance date,Age,SEX,ETHNICITY,Diabetes (1=yes; 0=no),Hypertension (1=yes; 0=no),"Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)",eGFR-EPI,uPCR,Calcium (mmol/L),Phosphate (mmol/L),Bicarbonate (mmol/L),Albumin (g/l),Follow-up YEARS,RIP,ESRD
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
568268916,11/02/2015,87.24,Male,WHITE BRITISH,1.0,1.0,1.0,19.0,33.0,2.78,0.88,27.2,37.0,5.7,,
659549633,06/05/2015,56.88,Female,WHITE BRITISH,0.0,1.0,5.0,15.0,395.0,2.43,1.02,21.3,30.0,1.5,,1.0
406748956,04/05/2016,66.53,Female,WHITE BRITISH,0.0,1.0,3.0,17.0,163.0,2.33,1.24,27.8,36.0,0.6,1.0,
109228713,16/11/2011,69.92,Male,WHITE IRISH,0.0,1.0,3.0,12.0,250.0,2.29,1.8,20.7,39.0,1.1,,1.0
220533110,02/05/2018,81.14,Female,WHITE BRITISH,1.0,1.0,1.0,15.0,217.0,2.45,1.39,26.2,43.0,2.5,,


## Inspect Data Types and Missing Data

In [5]:
# Summarize the columns with the project helper `data_types`; the displayed result
# has "Column/Variable", "Data Type", "# of Nulls" and "Percent Null" columns.
df_inspect = data_types(df)
# Show only the columns that contain at least one missing value
df_inspect[df_inspect["# of Nulls"] > 0]  # inspect data for any missing values

Unnamed: 0,Column/Variable,Data Type,# of Nulls,Percent Null
0,Attendance date,object,2,0.0
1,Age,float64,2,0.0
2,SEX,object,2,0.0
3,ETHNICITY,object,2,0.0
4,Diabetes (1=yes; 0=no),float64,2,0.0
5,Hypertension (1=yes; 0=no),float64,2,0.0
6,"Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Oth...",float64,2,0.0
7,eGFR-EPI,float64,2,0.0
8,uPCR,float64,2,0.0
9,Calcium (mmol/L),float64,2,0.0


## Outcome Definition

In [6]:
# ESRD is the outcome variable. Only an explicit 1 counts as an event; every other
# value, including a missing one, is coded 0 (does not have ESRD).
df["ESRD"] = df["ESRD"].eq(1).astype("int64")

## One Hot Encoding

In [7]:
# Build integer 0/1 indicator columns for SEX and ETHNICITY and attach them to df;
# the original SEX and ETHNICITY columns are kept alongside the indicators.
df = df.assign(**pd.get_dummies(df[["SEX", "ETHNICITY"]], dtype=int))

## uPCR to uACR

In [8]:
# Estimate uACR from uPCR with the covariate-adjusted PCR-to-ACR conversion equation
# (formula referenced from Ali et al.).
#
# The equation is log-linear: the piecewise log(PCR) terms AND the sex / diabetes /
# hypertension adjustments all belong to the exponent, i.e.
#   uACR = exp(intercept + PCR terms - 0.0773*female + 0.0797*diabetic + 0.1265*hypertensive)
# Adding the adjustments outside exp() would shift uACR by ~0.1 in absolute terms
# instead of scaling it, so they are kept inside the exponent here.
#
# NOTE(review): the coefficients appear to match the published adjusted equation,
# which takes PCR in mg/g — confirm the units of the uPCR column.
# NOTE(review): a uPCR of exactly 0 gives log(0) = -inf and therefore uACR = 0.
df["uACR"] = np.exp(
    5.2659
    # piecewise-linear terms in log(PCR) with knots at 50 and 500
    + 0.2934 * np.log(np.minimum(df["uPCR"] / 50, 1))
    + 1.5643 * np.log(np.maximum(np.minimum(df["uPCR"] / 500, 1), 0.1))
    + 1.1109 * np.log(np.maximum(df["uPCR"] / 500, 1))
    # covariate adjustments (boolean masks act as 0/1 multipliers)
    - 0.0773 * (df["SEX"] == "Female")
    + 0.0797 * (df["Diabetes (1=yes; 0=no)"] == 1)
    + 0.1265 * (df["Hypertension (1=yes; 0=no)"] == 1)
)
# Rows with a missing uPCR get a missing uACR; rows with a missing SEX, diabetes or
# hypertension flag are treated as not-female / not-diabetic / not-hypertensive.

In [9]:
# Natural-log transform of the estimated uACR, stored as its own column
df["log_uACR"] = np.log(df["uACR"])

## Binning and Creating Labels

In [10]:
# Age bin edges in years; the last edge is infinite so the top bin is open-ended
bin_ages = [0, 18, 30, 40, 50, 60, 70, 80, 90, 100, float("inf")]

# One label per bin: "Under 18", then "<lower>-<upper - 1>" for each interior bin
# (derived from the edges so labels and bins cannot drift apart), then "100 +"
label_ages = (
    ["Under 18"]
    + [f"{lower}-{upper - 1}" for lower, upper in zip(bin_ages[1:-2], bin_ages[2:-1])]
    + ["100 +"]
)

In [11]:
# Assign each patient to an age band.
# right=False makes every bin left-closed / right-open ([0, 18), [18, 30), ...,
# [100, inf)), which is what the labels describe. With pandas' default right=True the
# bins are (18, 30], (30, 40], ..., so an age of exactly 30 would be labelled "18-29",
# an age of exactly 18 would be labelled "Under 18", and an age of 0 would be NaN.
df["age_group"] = pd.cut(df["Age"], bins=bin_ages, labels=label_ages, right=False)

In [12]:
# Pretty-print the column index to check which engineered columns are now present
pprint.pp(df.columns)

Index(['Attendance date', 'Age', 'SEX', 'ETHNICITY', 'Diabetes (1=yes; 0=no)',
       'Hypertension (1=yes; 0=no)',
       'Renal disease (DM=1, HTN=2, GN=3, ADPKD=4, Other=5)', 'eGFR-EPI',
       'uPCR', 'Calcium (mmol/L)', 'Phosphate (mmol/L)',
       'Bicarbonate (mmol/L)', 'Albumin (g/l)', 'Follow-up YEARS', 'RIP',
       'ESRD', 'SEX_Female', 'SEX_Male',
       'ETHNICITY_AFRICAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_ANY OTHER ASIAN BACKGROUND',
       'ETHNICITY_ANY OTHER BLACK BACKGROUND',
       'ETHNICITY_ANY OTHER ETHNIC GROUP',
       'ETHNICITY_ANY OTHER MIXED BACKGROUND',
       'ETHNICITY_ANY OTHER WHITE BACKGROUND',
       'ETHNICITY_BANGLADESHI (ASIAN OR ASIAN BRITISH)',
       'ETHNICITY_CARIBBEAN (BLACK OR BLACK BRITISH)',
       'ETHNICITY_CHINESE (OTHER ETHNIC GROUPS)',
       'ETHNICITY_INDIAN (ASIAN OR ASIAN BRITISH)', 'ETHNICITY_NOT STATED',
       'ETHNICITY_PAKISTANI (ASIAN OR ASIAN BRITISH)',
       'ETHNICITY_WHITE AND BLACK AFRICAN (MIXED)', 'ETHNICIT

## Prepare Dataframes for Output

In [13]:
# Keep an independent copy with every column (all dtypes) for further EDA;
# DataFrame.copy() is deep by default
df_eda = df.copy()
# Narrow df to its numeric columns only: this is the preprocessed frame intended for ML
df = df.select_dtypes(include=np.number)

## Save Dataframes to Path

In [14]:
# Persist both frames as Parquet files inside data_path
# (DataFrame.to_parquet requires a Parquet engine such as pyarrow or fastparquet)
df_eda.to_parquet(os.path.join(data_path, "df_eda.parquet"))  # save EDA df
df.to_parquet(os.path.join(data_path, "df.parquet"))  # save preprocessed numeric df