In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Dataset.csv')
df.shape

(240379, 455)

In [3]:
df = df.loc[:, df.isnull().mean() <.20]
df.shape

(240379, 144)

In [4]:
def status(feature):
    print('Processing', feature, ': OK')

In [5]:
def order_cols():
    global df
    cols = list(df.columns.values) #Make a list of all of the columns in the df
    cols.pop(cols.index('trr_id_code')) # Remove trr_id_code (primary key) from list
    #cols.pop(cols.index('gstatus_ki')) #Remove gstatus_ki from list
    cols.pop(cols.index('death')) #Remove death from list
    df = df[['trr_id_code']+cols+['death']] #Create new dataframe with columns in the order #'gstatus_ki'
    status('order_cols')
    
order_cols()

Processing order_cols : OK


In [6]:
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

column_index(df, ['gstatus_ki', 'death'])

array([101, 143], dtype=int64)

In [7]:
def drop_cols(df):
    drop_cols = ['wl_org', 'rem_cd', 'dayswait_chron', 'end_date', 'init_date', 'wt_qual_date',
                  'init_bmi_calc', 'dayswait_alloc', 'region', 'pri_payment_tcr_ki', 'pri_payment_trr_ki',
                  'citizenship_don', 'cancer_site_don', 'diag_ki', 'organ', 'med_cond_trr', 'payback',
                  'age_group', 'lt_one_week_don', 'data_transplant', 'opo_ctr_code', 'end_opo_ctr_code',
                  'listing_ctr_code', 'citizenship', 'init_opo_ctr_code','death_fail_ki','ctr_code','px_stat_date', 'tx_date',
                'admission_date', 'discharge_date', 'trr_id_code', 'grf_stat_ki', 'px_stat', 'gstatus_ki', 'pstatus', 'ptime', 'dwfg_ki']
    df.drop(drop_cols, inplace=True, axis=1)
    return df.shape
    status('drop_cols')

drop_cols(df)

(240379, 106)

In [8]:
def drop_rows():
    global df
    df.dropna(how='any', inplace=True, axis=0)
    return df.shape
    status('drop_rows')

drop_rows()

(143647, 106)

In [9]:
def format_funcstat_tcr(df):
    funcstat_cols = ['func_stat_tcr']
    for i in funcstat_cols:
        df[i] = df[i].map({2100:0, 4100:0, 1:0, 998:0, 996:0,
                            2090:1, 4090:1,
                            2080:2, 4080:2,
                            2070:3, 4070:3,
                            2060:4, 4060:4,
                            2050:5, 4050:5, 2:5,
                            2040:6, 4040:6,
                            2030:7,
                            2020:8,
                            2010:9, 
                            3:10})
    status('format_funcstat_tcr')
    
format_funcstat_tcr(df)

Processing format_funcstat_tcr : OK


In [10]:
def format_funcstat_trr(df):
    funcstat_cols = ['func_stat_trr']
    for i in funcstat_cols:
        df[i] = df[i].map({2100:0, 1:0, 996:0, 998:0,
                            2090:1,
                            2080:2,
                            2070:3,
                            2060:4,
                            2050:5, 2:5,
                            2040:6,
                            2030:7,
                            2020:8,
                            2010:9,
                            3:10})
    status('format_funcstat_trr')
    
format_funcstat_trr(df)

Processing format_funcstat_trr : OK


In [11]:
# def format_dates(df):
#     date_cols = ['admission_date', 'discharge_date']
#     for i in date_cols:
#         df[i] = pd.to_datetime(df[i])
#     status('format_dates')

# format_dates(df)

In [12]:
# def create_days(df):
#     df['days'] = df['discharge_date'] - df['admission_date']
#     drop_date_cols = ['discharge_date', 'admission_date']
#     df.drop(drop_date_cols, inplace=True, axis=1)
#     status('create_days')
    
# create_days(df)

In [13]:
# def format_days(df):
#     df['days_total'] = df['days'].str.split('')
#     status('format_days')
    
# format_days(df)

In [14]:
def format_yn(df):
    #dwfg_ki
    yn_cols = ['data_waitlist', 'don_retyp', 'donation','first_wk_dial', 'on_dialysis', 'prev_ki_tx', 'prev_tx', 'prev_tx_any']
    ynu_cols = ['diabetes_don', 'dial_trr','drugtrt_copd', 'exh_perit_access', 'exh_vasc_access',
                'hist_cancer_don', 'hist_cig_don', 'hist_hypertens_don', 'malig', 'malig_tcr_ki',
                'malig_trr', 'perip_vasc', 'pre_tx_txfus']
    #grf_stat_ki
    yes_no_u_cols = yn_cols + ynu_cols
    for i in yes_no_u_cols:
        df[i] = df[i].map({'Y':1,'N':0, 'U':0})
    status('format_yn')

format_yn(df)

Processing format_yn : OK


In [15]:
def format_gender(df):
    gender_cols = ['gender', 'gender_don']
    for i in gender_cols:
        df[i] = df[i].map({'M':1, 'F':0})
    status('format_gender')

format_gender(df)

Processing format_gender : OK


In [16]:
# columns_to_encode = ['abo', 'abo_don', 'cmv_igg', 'cmv_igm', 'cmv_status',
#                      'ebv_serostatus', 'hbv_core', 'hbv_sur_antigen',
#                      'hcv_serostatus', 'hbv_core_don', 'hbv_sur_antigen_don',
#                      'abo_mat', 'diab', 'don_ty', 'education', 'end_stat',
#                      'end_stat_ki', 'ethcat', 'ethcat_don', 'ethnicity', 'px_stat',
#                      'share_ty', 'tx_procedur_ty_ki', 'init_stat', 'txkid']
# encoded_cols = pd.get_dummies(df, columns = columns_to_encode, drop_first=True)
# df.drop(columns_to_encode, inplace=True, axis=1)
# newdf = pd.concat([df, encoded_cols], axis=1)
# newdf.shape

In [17]:
# def encode_columns(df):
#     columns_to_encode = ['abo', 'abo_don', 'cmv_igg', 'cmv_igm', 'cmv_status',
#                      'ebv_serostatus', 'hbv_core', 'hbv_sur_antigen',
#                      'hcv_serostatus', 'hbv_core_don', 'hbv_sur_antigen_don',
#                      'abo_mat', 'diab', 'don_ty', 'education', 'end_stat',
#                      'end_stat_ki', 'ethcat', 'ethcat_don', 'ethnicity', 'px_stat',
#                      'share_ty', 'tx_procedur_ty_ki', 'init_stat', 'txkid']


#     for i in columns_to_encode:
#         encoded_cols = pd.get_dummies(df, columns = columns_to_encode, drop_first =True)
#         df.drop(columns_to_encode, inplace=True, axis=1)
#         newdf = pd.concat([df, encoded_cols], axis = 1)
#         newdf.shape
#         status('encode_columns')
#         return df.shape
# encode_columns(df)

In [18]:
df.shape
pd.Categorical(df.abo).codes

array([7, 6, 7, ..., 7, 6, 0], dtype=int8)

In [19]:
drop_rows()

(143647, 106)

In [20]:
# df.to_csv(r'C:\Users\Agi\Desktop\DropBox\Dropbox\Classes\ITSS 4354\Final Project\chech.csv' , encoding='utf-8')


In [21]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from lightgbm import Dataset

In [22]:
df.head()
categorical_vars = ['abo', 'perm_state', 'txkid', 'hbv_core_don', 'hbv_sur_antigen_don', 'abo_don', 'don_ty',
                    'home_state_don', 'cmv_igg', 'cmv_igm', 'ebv_serostatus', 'hbv_core',
                    'hbv_sur_antigen', 'hcv_serostatus', 'hiv_serostatus', 'cmv_status']
#px_stat

for c in categorical_vars:
    df[c] = pd.Categorical(df[c]).codes

In [23]:
lgb_params = {
    'num_leaves': 7,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbose': 0
}
boost_rounds_form = 80
model_form = LGBMClassifier(**lgb_params, num_boost_round=boost_rounds_form)

In [24]:
df.shape

(143647, 106)

In [25]:
X = df.iloc[:, :-1]
y = df.iloc[:, 105]

In [26]:
#df["death"].value_counts().plot(kind='bar')
#plt.show()

In [27]:
lgb_train = Dataset(data = X, label = y,categorical_feature=['abo', 'perm_state', 'txkid', 'hbv_core_don', 'hbv_sur_antigen_don', 'abo_don', 'don_ty',
                    'home_state_don', 'cmv_igg', 'cmv_igm', 'ebv_serostatus', 'hbv_core',
                    'hbv_sur_antigen', 'hcv_serostatus', 'hiv_serostatus', 'cmv_status'])
cv = lgb.cv(lgb_params, 
              lgb_train, 
              nfold = 10,
              num_boost_round=100, 
              early_stopping_rounds=15,
              stratified=False, 
              verbose_eval=50) 

[50]	cv_agg's binary_logloss: 0.342716 + 0.00304262
[100]	cv_agg's binary_logloss: 0.326651 + 0.00349081


In [28]:
#1 - (sum(cv['l2-mean']) / float(len(cv['l2-mean'])))
cv

{'binary_logloss-mean': [0.6486082711650738,
  0.61102692244361478,
  0.57955728920350358,
  0.55351989683836877,
  0.52977731040322085,
  0.51008693702823193,
  0.49340403192481352,
  0.4779659801030644,
  0.4647134203075064,
  0.45323108550621266,
  0.44313577576493374,
  0.43382418068122808,
  0.42588008034475777,
  0.41839313013558083,
  0.4119145508027649,
  0.40615753553610656,
  0.40100242694285732,
  0.39629471941253341,
  0.3919391063060898,
  0.38790427104633235,
  0.38426025225898269,
  0.38110348102951119,
  0.37835360257895445,
  0.37554099996661172,
  0.37309571343717446,
  0.37091789606334002,
  0.36857918662159384,
  0.3664794901880285,
  0.36449941923032986,
  0.36294987893277092,
  0.36118412857926235,
  0.35953412900140774,
  0.35821942199994994,
  0.35678045855359186,
  0.35553689860701454,
  0.35434990814539025,
  0.35334203619131582,
  0.35226691228083928,
  0.35120239227625716,
  0.35030742575333162,
  0.34933636448558347,
  0.34849291171472829,
  0.3475421878356

In [29]:
#(sum(cv['l2-stdv']) / float(len(cv['l2-stdv'])))

In [30]:
# params = {
#     'boosting_type': 'dart', #gbdt
#     'objective': 'binary',
#     'metric': 'binary_logloss',
#     'num_leaves': 11,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 0
# }
params = { 'boosting_type': 'gbdt', 'objective': 'binary','metric': 'binary_logloss',
'bagging_fraction': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 1.0, 'max_depth': 11, 
          'min_child_weight': 0.001, 'min_split_gain': 0.10000000000000001, 'n_estimators': 259, 'subsample': 1.0}

boost_rounds_form = 1000
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

lgb_train_form = lgb.Dataset(X_train, y_train)
gbm_form = lgb.train(
                params,
                lgb_train_form,
                valid_sets=lgb_train_form,
                num_boost_round=boost_rounds_form)

[1]	training's binary_logloss: 0.637882
[2]	training's binary_logloss: 0.592695
[3]	training's binary_logloss: 0.554653
[4]	training's binary_logloss: 0.522649
[5]	training's binary_logloss: 0.495198
[6]	training's binary_logloss: 0.471861
[7]	training's binary_logloss: 0.451286
[8]	training's binary_logloss: 0.433512
[9]	training's binary_logloss: 0.418133
[10]	training's binary_logloss: 0.40481
[11]	training's binary_logloss: 0.392995
[12]	training's binary_logloss: 0.382701
[13]	training's binary_logloss: 0.373687
[14]	training's binary_logloss: 0.365748
[15]	training's binary_logloss: 0.358512
[16]	training's binary_logloss: 0.352288
[17]	training's binary_logloss: 0.346777
[18]	training's binary_logloss: 0.341774
[19]	training's binary_logloss: 0.337071
[20]	training's binary_logloss: 0.332913
[21]	training's binary_logloss: 0.329211
[22]	training's binary_logloss: 0.326193
[23]	training's binary_logloss: 0.323062
[24]	training's binary_logloss: 0.32046
[25]	training's binary_logl

In [31]:
preds = gbm_form.predict(data=X_test, raw_score=True)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

for i in range(0,len(preds)):
    if preds[i]>=.5:       # setting threshold to .5
        preds[i]=1
    else:  
        preds[i]=0
preds

confusion_matrix =confusion_matrix(y_test, preds)
confusion_matrix
list1 = ["Actual Survived", "Actual Died"]
list2 = ["Predicted Survived", "Predicted Died"]

In [32]:
pd.DataFrame(confusion_matrix, list1, list2)

Unnamed: 0,Predicted Survived,Predicted Died
Actual Survived,34811,1458
Actual Died,4876,6259


In [33]:
accuracy_score(y_test,preds)

0.86638258374820687

In [34]:
[e for e in zip((list(gbm_form.feature_importance("split"))), list(df.iloc[:, :-1]))]

[(50, 'num_prev_tx'),
 (1, 'donation'),
 (77, 'on_dialysis'),
 (227, 'a1'),
 (417, 'a2'),
 (371, 'b1'),
 (470, 'b2'),
 (303, 'dr1'),
 (348, 'dr2'),
 (45, 'gender'),
 (130, 'abo'),
 (498, 'wgt_kg_tcr'),
 (318, 'hgt_cm_tcr'),
 (561, 'bmi_tcr'),
 (787, 'perm_state'),
 (278, 'education'),
 (209, 'func_stat_tcr'),
 (551, 'dgn_tcr'),
 (217, 'diab'),
 (27, 'drugtrt_copd'),
 (11, 'init_stat'),
 (308, 'init_wgt_kg'),
 (204, 'init_hgt_cm'),
 (25, 'end_stat'),
 (654, 'init_age'),
 (13, 'ethnicity'),
 (224, 'ethcat'),
 (476, 'end_bmi_calc'),
 (63, 'perip_vasc'),
 (3, 'exh_perit_access'),
 (2, 'exh_vasc_access'),
 (19, 'malig_tcr_ki'),
 (5, 'prev_tx'),
 (0, 'prev_ki_tx'),
 (279, 'func_stat_trr'),
 (1, 'malig_trr'),
 (985, 'creat_trr'),
 (36, 'first_wk_dial'),
 (964, 'serum_creat'),
 (45, 'pre_tx_txfus'),
 (39, 'txkid'),
 (18, 'don_retyp'),
 (334, 'da1'),
 (514, 'da2'),
 (544, 'db1'),
 (664, 'db2'),
 (414, 'ddr1'),
 (440, 'ddr2'),
 (118, 'ra1'),
 (383, 'ra2'),
 (260, 'rb1'),
 (525, 'rb2'),
 (222, 'r

In [35]:
for c in categorical_vars:
    df[c] = pd.Categorical(df[c]).codes

In [36]:
df['death'].value_counts()
# 1 there was no graft complication
# 2 there was a graft complication

0    109514
1     34133
Name: death, dtype: int64

In [37]:
# df['px_stat'].head()

In [38]:
from skopt import BayesSearchCV

In [39]:
X_full = X
y_form_full = y

In [40]:
params = {
    'max_depth': (3, 15),
    'min_child_weight': (1e-3, 1e+3),
    'n_estimators': (1, 300),
    'colsample_bytree': (1e-1, 1e+0),
    'subsample': (0.4, 1),
    'bagging_fraction': (0.5, 1),
    'feature_fraction': (0.5, 1),
    'min_split_gain': (0.1, 10),
}

In [41]:
opt_form = BayesSearchCV(
    lgb.LGBMRegressor(boosting_type='gbdt', objective='binary', metric='binary_loglos', categorical_feature=0),
    params,
    n_iter=100,
    n_jobs=4
)
opt_form.fit(X_full, y_form_full)

print("val. score: %s" % opt_form.best_score_)
print(opt_form.best_params_)

val. score: 0.39904213386
{'bagging_fraction': 1.0, 'colsample_bytree': 0.10000000000000001, 'feature_fraction': 1.0, 'max_depth': 7, 'min_child_weight': 0.001, 'min_split_gain': 0.10000000000000001, 'n_estimators': 300, 'subsample': 1.0}


In [42]:
# column_index(df, ['abo', 'perm_state', 'txkid', 'hbv_core_don', 'hbv_sur_antigen_don', 'abo_don', 'don_ty',
#                     'home_state_don', 'cmv_igg', 'cmv_igm', 'ebv_serostatus', 'hbv_core',
#                     'hbv_sur_antigen', 'hcv_serostatus', 'hiv_serostatus', 'cmv_status'])