In [1]:
from google.colab import drive
drive.mount("/content/drive") # Mount Google Drive

import sys # Add virtual env's site-packages to sys.path
sys.path.append('/content/drive/MyDrive/ckd_env/lib/python3.10/site-packages')

# Change working directory
%cd '/content/drive/MyDrive/kidney_uae/'

# Prepare activation script for virtual env
!echo "source /content/drive/MyDrive/ckd_env/bin/activate" > activate.sh

# Make scripts and binaries executable
!chmod +x activate.sh
!chmod +x /content/drive/MyDrive/ckd_env/bin/python
!chmod +x /content/drive/MyDrive/ckd_env/bin/pip

%env BASH_ENV=activate.sh # Set BASH_ENV to activate virtual env

print() # New line for clarity
!python --version # Check Python version


Mounted at /content/drive
/content/drive/MyDrive/kidney_uae
env: BASH_ENV=activate.sh # Set BASH_ENV to activate virtual env

Python 3.10.12


# Preprocessing: Kidney_UAE

## Load Requisite Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split

from functions import *  # import custom functions

## Read File From Path and Explore Basic Structure

In [3]:
# Folder on the mounted Google Drive that holds the project's data files; the
# Excel source is read from here and the exported datasets are written beneath it
data_path = "/content/drive/MyDrive/kidney_uae/data"

In [4]:
# Load the raw workbook and promote its "id" column to the row index
excel_file = os.path.join(data_path, "kidney_uae.xlsx")
df = pd.read_excel(excel_file).set_index("id")

In [5]:
# Report the dataset's dimensions: df.shape is (number of rows, number of columns)
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in this dataset.")

There are 1186 rows and 11 columns in this dataset.


In [6]:
df.head()  # display the first 5 rows of the dataframe

Unnamed: 0_level_0,sex,cancer,smoking,obesity,hypertension,dyslipidemia,diabetes,cardiovascular_disease,creatnine,outcome,time(months)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,0,0,1,1,1,0,0,59.0,0,116
2,0,0,0,1,1,1,0,0,52.0,0,116
3,0,0,0,1,1,1,0,0,57.0,0,101
4,0,0,0,1,0,1,0,0,65.0,0,117
5,0,0,0,1,1,1,1,0,70.0,0,119


## Reorder and Rename Columns

In [7]:
# Move 'time(months)' from the last position to directly before 'sex', which
# makes it the first column (see the printed column order below).
# move_column_before is a custom helper brought in by `from functions import *`.
df = move_column_before(
    df=df,
    target_column="time(months)",
    before_column="sex",
)

In [8]:
# Confirm the reordering by listing the column names in their current order
print(f"New order of columns: {df.columns.to_list()}")  # list new order of cols

New order of columns: ['time(months)', 'sex', 'cancer', 'smoking', 'obesity', 'hypertension', 'dyslipidemia', 'diabetes', 'cardiovascular_disease', 'creatnine', 'outcome']


In [9]:
# Normalise two column names: "time(months)" loses its parentheses and the
# misspelled "creatnine" becomes "creatinine".
# Reassign the result instead of using inplace=True: the frame is not mutated
# behind any other reference to it, and the step stays chainable.
df = df.rename(
    columns={"time(months)": "time_months", "creatnine": "creatinine"},
)

## Create EDA Dataset

In [10]:
# Independent copy of the data for EDA, with follow-up time also expressed in
# years (months / 12, rounded to one decimal place)
df_eda = df.copy(deep=True).assign(
    time_years=lambda frame: round(frame["time_months"] / 12, 1)
)

In [11]:
# Bin edges: one-year steps from 0 to 10, then an open-ended final bin.
# With right=True every interval is closed on the right, so a value of exactly
# 10.0 lands in "9-10_years" and only values above 10 reach "10_years_plus".
year_bins = [*range(11), float("inf")]

# One label per interval: "0-1_years" ... "9-10_years", then the catch-all bin
year_labels = [f"{start}-{start + 1}_years" for start in range(10)]
year_labels.append("10_years_plus")

# Assign each follow-up time to its bin; include_lowest=True keeps a value of
# exactly 0 inside the first interval
df_eda["year_bins"] = pd.cut(
    df_eda["time_years"],
    bins=year_bins,
    labels=year_labels,
    right=True,
    include_lowest=True,
)

In [12]:
# Modeling frame with the year bins one-hot encoded: one indicator column per
# bin label is appended, then the raw time columns and the categorical bin
# column are removed
year_dummies = pd.get_dummies(df_eda["year_bins"])
time_columns = ["time_months", "time_years", "year_bins"]

df_years = df_eda.copy(deep=True).assign(**year_dummies).drop(columns=time_columns)

## Split the Data and Export Datasets to Path

In [13]:
# Frames to export, keyed by the name used for their output sub-folder and file
model_frames = {"df_original": df, "df_years": df_years, "df_eda": df_eda}
base_output_dir = data_path  # each frame gets its own folder beneath this one

########################### Stratification parameters ##########################
stratify_years = [col for col in df_years.columns if "_years" in col]
stratify_regular = ["sex"]
################################################################################

# Columns to stratify on for each frame; a frame not listed here is split
# without stratification
stratify_columns = {"df_original": stratify_regular, "df_years": stratify_years}

for frame_name, frame_data in model_frames.items():
    # Features are every column except the target; the target is 'outcome'
    X = frame_data.drop(columns="outcome")
    y = frame_data["outcome"]

    strat_cols = stratify_columns.get(frame_name)
    stratify_by = None if strat_cols is None else frame_data[strat_cols]

    # 80/20 split with a fixed seed so the partitions are reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=stratify_by,
        random_state=222,
    )

    # Write the complete, unsplit frame into a folder named after it
    output_dir = os.path.join(base_output_dir, frame_name)
    os.makedirs(output_dir, exist_ok=True)
    frame_data.to_parquet(os.path.join(output_dir, f"{frame_name}.parquet"))

    # The EDA frame is exported whole only; no split files are written for it
    if frame_name == "df_eda":
        continue

    # Persist each piece of the split under its own file name
    split_parts = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    for part_name, part in split_parts.items():
        part_path = os.path.join(output_dir, f"{part_name}.parquet")
        if isinstance(part, pd.DataFrame):
            part.to_parquet(part_path)
        else:
            # Targets are Series; wrap them in a one-column frame before saving
            part.to_frame(name="outcome").to_parquet(part_path)

    # Also store features and target together, re-attached on the 'id' index
    train_data = X_train.join(y_train, on="id", how="inner")
    test_data = X_test.join(y_test, on="id", how="inner")
    train_data.to_parquet(os.path.join(output_dir, "df_train.parquet"))
    test_data.to_parquet(os.path.join(output_dir, "df_test.parquet"))

In [14]:
# Report the size of the train/test partitions, in rows and as a share of the
# total. X_train / X_test are whatever the export loop above left behind, i.e.
# the split from its final iteration (the 'df_eda' frame).
print(f"Training Size = {X_train.shape[0]}")
print(f"Test Size = {X_test.shape[0]}")
print()
print(
    f"Training Percentage = {X_train.shape[0] / (X_train.shape[0] + X_test.shape[0])*100:.0f}%"
)
print(
    f"Test Percentage = {X_test.shape[0] / (X_train.shape[0] + X_test.shape[0])*100:.0f}%"
)

Training Size = 948
Test Size = 238

Training Percentage = 80%
Test Percentage = 20%


## References

Al-Shamsi, S., Govender, R. D., & King, J. (2021). Predictive value of creatinine-based equations of kidney function in the long-term prognosis of United Arab Emirates patients with vascular risk. *Oman Medical Journal, 36*(1), e217. https://doi.org/10.5001/omj.2021.07


Al-Shamsi, S., Govender, R. D., & King, J. (2019). Predictive value of creatinine-based equations of kidney function in the long-term prognosis of United Arab Emirates patients with vascular risk [Dataset]. Mendeley Data, V1. https://data.mendeley.com/datasets/ppfwfpprbc/1



