In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load the raw dataset (path is relative to the working directory) into the
# notebook-wide DataFrame `df` that the later cells read and modify.
df = pd.read_csv('Dataset.csv')
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df.shape

(240379, 455)

In [13]:
# Keep only the columns whose fraction of missing values is below 0.20;
# `df` is rebound to the filtered frame.
df = df.loc[:, df.isnull().mean() <.20]
# Display the shape after the column filter.
df.shape

(240379, 144)

In [14]:
def status(feature):
    """Print a one-line confirmation that a processing step has finished.

    Parameters
    ----------
    feature : object
        Label of the step that completed. It is printed between the fixed
        prefix and suffix, separated from them by single spaces.
    """
    pieces = ('Processing', feature, ': OK')
    print(*pieces)

In [15]:
def order_cols():
    """Rebind the global ``df`` with its columns reordered.

    ``trr_id_code`` (the primary key) is moved to the front, ``gstatus_ki``
    and ``death`` are moved to the end, and every other column keeps its
    relative position. A ``ValueError`` is raised if one of these three
    columns is absent.
    """
    global df
    leading = ['trr_id_code']
    trailing = ['gstatus_ki', 'death']
    remaining = df.columns.tolist()
    for name in leading + trailing:
        # Take the pinned columns out of the middle section.
        remaining.remove(name)
    df = df[leading + remaining + trailing]
    status('order_cols')
    
# Reorder the columns of the global `df` (the function rebinds `df` itself).
order_cols()

Processing order_cols : OK


In [16]:
def column_index(df, query_cols):
    """Return the positional indices of ``query_cols`` within ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column positions are looked up.
    query_cols : label or sequence of labels
        Column label(s) to locate.

    Returns
    -------
    numpy.ndarray of int (or a single integer for a scalar query)
        Zero-based positions, in the same order as ``query_cols``.

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df``. Without this check
        ``searchsorted`` would return an insertion point, so a missing label
        would silently resolve to the position of some other column (or fail
        with an unrelated IndexError).
    """
    missing = [name for name in np.atleast_1d(query_cols).tolist()
               if name not in df.columns]
    if missing:
        raise KeyError('columns not found in DataFrame: {}'.format(missing))

    # Plain object array so the sort/search below does not depend on the
    # index's storage type.
    cols = np.asarray(df.columns, dtype=object)
    # searchsorted needs sorted input: sort indirectly, then translate the
    # positions in sorted order back to positions in the original order.
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

# Look up the positions of the two columns that order_cols moved to the end;
# the resulting array is displayed as the cell output.
column_index(df, ['gstatus_ki', 'death'])

array([142, 143], dtype=int64)

In [17]:
def drop_cols(df):
    """Drop a fixed list of columns from ``df`` in place and return its shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the caller's object is mutated.

    Returns
    -------
    tuple
        ``df.shape`` after the columns have been removed.

    Raises
    ------
    KeyError
        If any of the listed columns is not present (for example when the
        function is run a second time on the same frame).
    """
    # Named differently from the function so the list does not shadow it.
    unwanted = ['wl_org', 'rem_cd', 'dayswait_chron', 'end_date', 'init_date', 'wt_qual_date',
                'init_bmi_calc', 'dayswait_alloc', 'region', 'pri_payment_tcr_ki', 'pri_payment_trr_ki',
                'citizenship_don', 'cancer_site_don', 'diag_ki', 'organ', 'med_cond_trr', 'payback',
                'age_group', 'lt_one_week_don', 'data_transplant', 'opo_ctr_code', 'end_opo_ctr_code',
                'listing_ctr_code', 'citizenship', 'init_opo_ctr_code', 'death_fail_ki', 'ctr_code', 'px_stat_date']
    df.drop(unwanted, inplace=True, axis=1)
    # Report completion before returning; a statement placed after the
    # return would never execute.
    status('drop_cols')
    return df.shape

# Remove the unused columns from `df` in place; the returned shape is shown
# as the cell output.
drop_cols(df)

(240379, 116)

In [18]:
def drop_rows():
    """Remove every row of the global ``df`` that contains a missing value.

    The frame is modified in place.

    Returns
    -------
    tuple
        ``df.shape`` after the incomplete rows have been removed.
    """
    global df
    df.dropna(how='any', inplace=True, axis=0)
    # Report completion before returning; a statement placed after the
    # return would never execute.
    status('drop_rows')
    return df.shape

# Drop rows with any missing value from the global `df`; the returned shape
# is shown as the cell output.
drop_rows()

(143615, 116)

In [19]:
def format_funcstat_tcr(df):
    """Recode the ``func_stat_tcr`` column of ``df`` to integers 0-10, in place.

    Each listed code is replaced by the level of the group it belongs to.
    Values that are not listed in any group become NaN, because ``Series.map``
    yields NaN for keys missing from the mapping.

    NOTE(review): 4040-4100 have entries but 4030, 4020 and 4010 do not, so
    those codes (if they occur) end up as NaN - confirm this is intended.
    """
    level_groups = (
        (0, (2100, 4100, 1, 998, 996)),
        (1, (2090, 4090)),
        (2, (2080, 4080)),
        (3, (2070, 4070)),
        (4, (2060, 4060)),
        (5, (2050, 4050, 2)),
        (6, (2040, 4040)),
        (7, (2030,)),
        (8, (2020,)),
        (9, (2010,)),
        (10, (3,)),
    )
    # Flatten the groups into a code -> level lookup table.
    lookup = {code: level for level, codes in level_groups for code in codes}
    df['func_stat_tcr'] = df['func_stat_tcr'].map(lookup)
    status('format_funcstat_tcr')
    
# Recode the func_stat_tcr column of `df` in place.
format_funcstat_tcr(df)

Processing format_funcstat_tcr : OK


In [20]:
def format_funcstat_trr(df):
    """Recode the ``func_stat_trr`` column of ``df`` to integers 0-10, in place.

    Each listed code is replaced by the level of the group it belongs to.
    Values that are not listed in any group become NaN, because ``Series.map``
    yields NaN for keys missing from the mapping.
    """
    level_groups = (
        (0, (2100, 1, 996, 998)),
        (1, (2090,)),
        (2, (2080,)),
        (3, (2070,)),
        (4, (2060,)),
        (5, (2050, 2)),
        (6, (2040,)),
        (7, (2030,)),
        (8, (2020,)),
        (9, (2010,)),
        (10, (3,)),
    )
    # Flatten the groups into a code -> level lookup table.
    lookup = {code: level for level, codes in level_groups for code in codes}
    df['func_stat_trr'] = df['func_stat_trr'].map(lookup)
    status('format_funcstat_trr')
    
# Recode the func_stat_trr column of `df` in place.
format_funcstat_trr(df)

Processing format_funcstat_trr : OK


In [21]:
def format_dates(df):
    """Convert the admission and discharge columns of ``df`` to datetimes.

    ``admission_date`` and ``discharge_date`` are each passed through
    ``pd.to_datetime`` and written back to ``df``, so the caller's frame is
    modified.
    """
    for column in ('admission_date', 'discharge_date'):
        parsed = pd.to_datetime(df[column])
        df[column] = parsed
    status('format_dates')

# Parse admission_date and discharge_date in `df` to datetimes, in place.
format_dates(df)

Processing format_dates : OK


In [22]:
def create_days(df):
    """Add a ``days`` column (discharge minus admission) and drop both dates.

    ``df`` is modified in place: ``days`` is appended, then
    ``discharge_date`` and ``admission_date`` are removed.

    NOTE(review): the result is the raw difference of the two columns. If they
    are datetimes (as format_dates produces), ``days`` holds timedeltas rather
    than integer day counts - confirm that is what downstream code expects.
    """
    admitted = df['admission_date']
    discharged = df['discharge_date']
    df['days'] = discharged - admitted
    df.drop(columns=['discharge_date', 'admission_date'], inplace=True)
    status('create_days')
    
# Add the `days` column to `df` and drop the two date columns, in place.
create_days(df)

Processing create_days : OK


In [23]:
def format_yn(df):
    """Encode the yes/no(/unknown) flag columns of ``df`` as 1/0, in place.

    ``'Y'`` becomes 1, while ``'N'`` and ``'U'`` both become 0. Any other value
    becomes NaN, because ``Series.map`` yields NaN for keys missing from the
    mapping.
    """
    # Not converted here: dwfg_ki, grf_stat_ki.
    two_valued = ('data_waitlist', 'don_retyp', 'donation', 'first_wk_dial',
                  'on_dialysis', 'prev_ki_tx', 'prev_tx', 'prev_tx_any')
    three_valued = ('diabetes_don', 'dial_trr', 'drugtrt_copd', 'exh_perit_access',
                    'exh_vasc_access', 'hist_cancer_don', 'hist_cig_don',
                    'hist_hypertens_don', 'malig', 'malig_tcr_ki', 'malig_trr',
                    'perip_vasc', 'pre_tx_txfus')
    encoding = {'Y': 1, 'N': 0, 'U': 0}
    for column in two_valued + three_valued:
        df[column] = df[column].map(encoding)
    status('format_yn')

# Encode the Y/N/U flag columns of `df` as 1/0, in place.
format_yn(df)

Processing format_yn : OK


In [24]:
def format_gender(df):
    """Encode the ``gender`` and ``gender_don`` columns of ``df``, in place.

    ``'M'`` becomes 1 and ``'F'`` becomes 0. Any other value becomes NaN,
    because ``Series.map`` yields NaN for keys missing from the mapping.
    """
    encoding = {'M': 1, 'F': 0}
    for column in ('gender', 'gender_don'):
        df[column] = df[column].map(encoding)
    status('format_gender')

# Encode gender and gender_don in `df` as 1/0, in place.
format_gender(df)

Processing format_gender : OK


In [25]:
# Write the cleaned DataFrame to CSV as UTF-8. index=False is not passed, so
# the row index is written as an extra leading column.
# NOTE(review): the destination is an absolute, user-specific Windows path;
# running the notebook on another machine requires a different location.
df.to_csv(r'C:\Users\agi\Dropbox\dfclean.csv' ,encoding='utf-8')