In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import MinMaxScaler
from numpy import NaN
import numpy as np
import os


In [2]:
# Debugging aid: call os.getcwd() to see which directory the kernel is currently in.

# Mount Google Drive and move into the folder that holds the UNOS raw files.
from google.colab import drive
drive.mount('/content/drive')
# Use the absolute path under the mount point: a relative './drive/...' path only
# resolves when the kernel is still in /content, so re-running this cell after the
# first chdir would raise FileNotFoundError.
os.chdir('/content/drive/MyDrive/kidney')

Mounted at /content/drive


In [3]:
# Load the two UNOS raw files: the .htm file supplies the column names,
# the .DAT file supplies the records.

# Column names: the LABEL column of the first table found in the .htm file
kidpan_htm = pd.read_html('KIDPAN_DATA.htm')
label = kidpan_htm[0].loc[:, 'LABEL']

# Records: tab-separated text without a header row
kidpan_data_org = pd.read_csv(
    'KIDPAN_DATA.DAT',
    sep='\t',
    header=None,
    low_memory=False,
)

In [4]:
# Name the columns of kidpan_data_org after the labels parsed from the .htm file
kidpan_data_org.columns = label

# Cells holding the placeholder '.' become NaN.
# np.nan is used instead of the `NaN` alias, which was removed in NumPy 2.0.
kidpan_data_org = kidpan_data_org.replace('.', np.nan)

# Switch to a 1-based row index. The range is assigned outright (instead of
# `index += 1`) so that re-running this cell does not shift the index again.
kidpan_data_org.index = pd.RangeIndex(start=1, stop=len(kidpan_data_org) + 1)

In [5]:
# Sanity check: display (rows, columns) of the raw frame.
# The recorded run shows (1097058, 491).
kidpan_data_org.shape

(1097058, 491)

In [6]:
# Preview the first rows of the dataframe (labelled columns, 1-based index)
kidpan_data_org.head()

LABEL,WL_ORG,COD_WL,COD_OSTXT_WL,NUM_PREV_TX,CURRENT_PRA,PEAK_PRA,USE_WHICH_PRA,CREAT_CLEAR,GFR,DONATION,...,INO_PROCURE_OSTXT_1,INO_PROCURE_OSTXT_2,INO_PROCURE_OSTXT_3,DATA_TRANSPLANT,DATA_WAITLIST,CTR_CODE,OPO_CTR_CODE,INIT_OPO_CTR_CODE,END_OPO_CTR_CODE,LISTING_CTR_CODE
1,PA,,,0,,,,,,,...,,,,N,Y,Unknown,Unknown,14353,14353,24800
2,KI,,,0,0.0,0.0,C,,,N,...,,,,N,Y,Unknown,Unknown,3658,3658,18259
3,KI,,,0,0.0,0.0,C,,,N,...,,,,Y,Y,18259,03658,3658,3658,18259
4,KI,,,1,3.0,3.0,C,,,N,...,,,,N,Y,Unknown,Unknown,3658,3658,18259
5,KI,,,1,5.0,8.0,C,,,N,...,,,,Y,Y,18259,03658,3658,3658,18259


In [7]:
# The data combines KI and PA (pancreas) records; continue with the rows
# whose WL_ORG is 'KI' only.
is_ki_row = kidpan_data_org['WL_ORG'].eq('KI')
kidpan_data_org = kidpan_data_org.loc[is_ki_row]

In [8]:
# Sanity check: display (rows, columns) after keeping only the 'KI' rows.
# The recorded run shows (982456, 491).
kidpan_data_org.shape

(982456, 491)

In [12]:
# Keep only the variables used in the analysis (the reasoning behind this
# selection is explained further in the thesis).
kidpan_data_org_selected = kidpan_data_org.loc[:, [
    'ABO_MAT', 'AGE', 'AGE_DON', 'AMIS',
    'BMI_DON_CALC', 'BMI_TCR', 'BMIS',
    'CMV_STATUS', 'COLD_ISCH_KI', 'CURRENT_PRA',
    'DAYSWAIT_ALLOC', 'DAYSWAIT_CHRON', 'DAYSWAIT_CHRON_KI',
    'DIAB', 'DIAL_TRR', 'DON_TY', 'DONATION', 'DRMIS', 'DRUGTRT_COPD', 'DWFG_KI',
    'EDUCATION', 'END_CPRA', 'ETHCAT', 'ETHCAT_DON',
    'EXH_PERIT_ACCESS', 'EXH_VASC_ACCESS',
    'FIRST_WK_DIAL',
    'GENDER', 'GENDER_DON', 'GSTATUS_KI', 'GTIME_KI',
    'HIST_CANCER_DON', 'HIST_HYPERTENS_DON', 'HIV_SEROSTATUS', 'HLAMIS',
    'KDPI', 'KDRI_MED',
    'LOS',
    'MALIG_TCR_KI',
    'NON_HRT_DON', 'NUM_PREV_TX',
    'ON_DIALYSIS',
    'PERIP_VASC', 'PT_STEROIDS_DON', 'PUMP_KI',
    'RT_KI_BIOPSY',
    'SERUM_CREAT', 'SHARE_TY',
    'TATTOOS', 'TBILI_DON', 'TOT_SERUM_ALBUM',
    'TRTREJ1Y_KI', 'TRTREJ6M_KI', 'TX_PROCEDUR_TY_KI', 'TXKID',
    'URINE_INF_DON', 'USE_WHICH_PRA',
    'VASODIL_DON',
    'WORK_INCOME_TCR',
]]

In [14]:
# Complete-case analysis: discard every row that has a missing value in any
# selected column. This relies on the "missing at random" assumption, see
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7145010/#R12
kidpan_data_org_selected = kidpan_data_org_selected.dropna(axis=0, how='any')

In [15]:
# Display (rows, columns) remaining after dropping incomplete rows.
# The recorded run shows (8566, 59).
kidpan_data_org_selected.shape

(8566, 59)

In [37]:
# Save the selected, complete-case frame as CSV.
# Create the target folder first: to_csv raises OSError when the parent
# directory does not exist.
# NOTE(review): the earlier os.chdir already moves into a 'kidney' folder, so
# this relative path resolves to a nested kidney/kidney directory on a fresh
# run -- confirm that this is the intended output location.
os.makedirs('./kidney', exist_ok=True)
kidpan_data_org_selected.to_csv('./kidney/kidpan_data_org_selected.csv')