In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn import metrics
import lightgbm as lgb

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [6]:
data_dictionary = pd.read_csv("DataDictionaryWiDS2021.csv",index_col=0)
unlabeled = pd.read_csv("UnlabeledWiDS2021.csv",index_col=0)
training = pd.read_csv("TrainingWiDS2021.csv",index_col=0)

column_datatype_mapping = dict(zip(data_dictionary['Variable Name'], data_dictionary['Data Type']))

In [7]:
all_data = training.drop(['encounter_id', 
                          'hospital_id', 
                          'diabetes_mellitus'], axis=1).append(unlabeled)

In [9]:
cat_cols = []
cont_cols = []
for col in all_data.columns:
    if all_data.dtypes[col] == "object":
        cat_cols.append(col)
        all_data[col] = all_data[col].fillna("NA")
        all_data[col] = LabelEncoder().fit_transform(all_data[col])
        all_data[col]= all_data[col].astype('category')
    elif column_datatype_mapping[col] == "binary":
        all_data[col] = all_data[col].fillna(-1)
    elif column_datatype_mapping[col] == "numeric":
        all_data[col] = all_data[col].fillna(0)
        cont_cols.append(col)
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())
        cont_cols.append(col)

In [55]:
X = all_data[:len(training)][0:120000]
y = training['diabetes_mellitus'][0:120000]

X_holdout = all_data[:len(training)][120000:]
y_holdout =  training['diabetes_mellitus'][120000:]

X_all = all_data[:len(training)]
y_all = training['diabetes_mellitus']
X_pred = all_data[len(training):].reset_index(drop=True)

In [42]:
# Splitting 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                      test_size=0.20, random_state=42,shuffle=True) 

In [59]:
import wandb
from wandb.lightgbm import wandb_callback

wandb.init(project="wids_2021", sync_tensorboard=True)
config = wandb.config

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
val_auc,0.86796
_step,310.0
_runtime,20.0
_timestamp,1610824985.0


0,1
val_auc,▁▂▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇███████████████████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█
_timestamp,▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█


[34m[1mwandb[0m: wandb version 0.10.14 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [60]:
params = {'learning_rate':0.05,
          'num_leaves':30,
          'n_estimators':2000,
          'metric': 'auc',
          'objective': 'binary',
          'scale_pos_weight': 2}

watchlist = [(X_train, 'train'), (X_test, 'test')]
num_round = 5

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [61]:
# train

wandb.config.update(params)

# add lightgbm callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                valid_names=('val'),
                callbacks=[wandb_callback()],
                early_stopping_rounds=10)

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)



[LightGBM] [Info] Number of positive: 20755, number of negative: 75245
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25346
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 176




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216198 -> initscore=-1.287962
[LightGBM] [Info] Start training from score -1.287962
[1]	val's auc: 0.80543
Training until validation scores don't improve for 10 rounds
[2]	val's auc: 0.810281
[3]	val's auc: 0.8163
[4]	val's auc: 0.817855
[5]	val's auc: 0.819504
[6]	val's auc: 0.821013
[7]	val's auc: 0.822553
[8]	val's auc: 0.823887
[9]	val's auc: 0.824975
[10]	val's auc: 0.825797
[11]	val's auc: 0.826827
[12]	val's auc: 0.827457
[13]	val's auc: 0.827934
[14]	val's auc: 0.828601
[15]	val's auc: 0.829
[16]	val's auc: 0.829645
[17]	val's auc: 0.830169
[18]	val's auc: 0.830595
[19]	val's auc: 0.831164
[20]	val's auc: 0.831771
[21]	val's auc: 0.832588
[22]	val's auc: 0.833339
[23]	val's auc: 0.833997
[24]	val's auc: 0.834489
[25]	val's auc: 0.834955
[26]	val's auc: 0.835285
[27]	val's auc: 0.835653
[28]	val's auc: 0.83634
[29]	val's auc: 0.836849
[30]	val's auc: 0.83722
[31]	val's auc: 0.837564
[32]	val's auc: 0.83795
[33]	val's auc: 0.83838

In [62]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, y_pred))

0.8679669913339565


In [63]:
y_pred_holdout = gbm.predict(X_holdout, num_iteration=gbm.best_iteration)

print(roc_auc_score(y_holdout, y_pred_holdout))

0.8620574119227021


In [84]:
lgbmc = LGBMClassifier(n_estimators=500,learning_rate=0.05)
lgbmc.fit(X_train, y_train)

print('test: ',lgbmc.score(X_test, y_test))
print('holdout: ',lgbmc.score(X_holdout, y_holdout))

test:  0.8422083333333333
holdout:  0.8271143054051393


In [None]:
'''default setting:
    test:  0.840125
    holdout:  0.8280988480850645
    all train set: 0.8768157958832125
'''

In [85]:
lgbmc.fit(X_all, y_all)

AUC_FINAL=metrics.roc_auc_score(y.values, lgbmc.predict(X))
AUC_FINAL
#lgbmc.predict_proba(X_pred)[:,1]

0.761173517800555

In [72]:
submittion = pd.DataFrame([unlabeled.encounter_id,lgbmc.predict_proba(X_pred)[:,1]]).T#.set_index('encounter_id')
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.columns = ['diabetes_mellitus']
submittion.fillna(0.5).to_csv('SolutionWiDS2021_LightGBM.csv')

In [73]:
submittion[submittion['diabetes_mellitus'].isna()]

Unnamed: 0_level_0,diabetes_mellitus
encounter_id,Unnamed: 1_level_1
136852,
