In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import xgboost as xgb

import wandb
from wandb.lightgbm import wandb_callback

# Notebook display settings: allow up to 181 columns to be shown, and keep
# 200 rows visible when a frame is truncated.
pd.options.display.max_columns = 181
pd.options.display.min_rows = 200

In [22]:
#!pip install xgboost

In [2]:
# Read the three competition CSVs from the working directory, in the same
# order as before: data dictionary, unlabeled rows, labelled training rows.
data_dictionary, unlabeled, training = (
    pd.read_csv(csv_name)
    for csv_name in (
        "DataDictionaryWiDS2021.csv",
        "UnlabeledWiDS2021.csv",
        "TrainingWiDS2021.csv",
    )
)

In [3]:
# Lookup table: variable name -> data type label, taken row by row from the
# data dictionary. Later rows win if a variable name is repeated.
column_datatype_mapping = {
    variable_name: declared_type
    for variable_name, declared_type in zip(
        data_dictionary['Variable Name'], data_dictionary['Data Type']
    )
}

In [4]:
# Remove the 'Unnamed: 0' column from both frames (presumably a row index
# that was written into the CSVs -- confirm against the raw files).
# `drop(..., errors='ignore')` is used instead of `del` so this cell can be
# re-run, or run against files without that column, without a KeyError.
training = training.drop(columns=['Unnamed: 0'], errors='ignore')
unlabeled = unlabeled.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Stack the labelled and unlabeled rows into a single feature frame so the
# imputation / label-encoding step sees every value of every column.
#
# * `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
# * The identifier columns are dropped from BOTH frames. Previously they were
#   only dropped from `training`, so appending `unlabeled` re-introduced them
#   as feature columns that were NaN for every training row.
# * `errors='ignore'` on the unlabeled side because it has no target column.
# * `ignore_index=True` gives a clean 0..n-1 index; downstream code slices
#   `all_data` positionally by `len(training)`.
_non_feature_cols = ['encounter_id',
                     'hospital_id',
                     'diabetes_mellitus']
all_data = pd.concat(
    [
        training.drop(_non_feature_cols, axis=1),
        unlabeled.drop(columns=_non_feature_cols, errors='ignore'),
    ],
    ignore_index=True,
    sort=False,
)

In [6]:
# Impute missing values column by column and integer-encode text columns.
#   cat_cols  -- columns whose dtype was object (label-encoded categories)
#   cont_cols -- non-object columns whose dictionary type is not "binary"
cat_cols, cont_cols = [], []
for col in all_data.columns:
    if all_data[col].dtype == "object":
        # Text column: missing -> "NA", then map each distinct value to an
        # integer code and store the codes as a pandas categorical.
        cat_cols.append(col)
        non_null_text = all_data[col].fillna("NA")
        codes = LabelEncoder().fit_transform(non_null_text)
        all_data[col] = pd.Series(codes, index=all_data.index).astype('category')
        continue

    # Non-text column: the fill value depends on the data-dictionary type.
    declared_type = column_datatype_mapping[col]
    if declared_type == "binary":
        fill_value = -1
    elif declared_type == "numeric":
        fill_value = 0
    else:
        fill_value = all_data[col].median()
    all_data[col] = all_data[col].fillna(fill_value)

    # Binary flags are tracked in neither list; everything else is continuous.
    if declared_type != "binary":
        cont_cols.append(col)

In [7]:
# Undo the stacking: the first len(training) rows are the labelled rows,
# the remainder are the unlabeled rows (re-indexed from 0).
df_train = all_data.iloc[:training.shape[0]]
df_pred = all_data.iloc[training.shape[0]:].reset_index(drop=True)
# Target vector, taken from the original training frame.
Y = training['diabetes_mellitus']

In [25]:
# Hold out 20% of the labelled rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    Y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [26]:
# Wrap the train / validation splits in XGBoost's DMatrix container.
# Only the raw values are passed, so column names are not carried over.
xg_train = xgb.DMatrix(data=X_train.values, label=y_train)
xg_test = xgb.DMatrix(data=X_val.values, label=y_val)

In [43]:
# Start a Weights & Biases run; per-round metrics are sent by the callback
# passed to `xgb.train` below.
wandb.init(project="wids_2021", sync_tensorboard=True)
config = wandb.config

# Number of boosting rounds. With the native `xgb.train` API this is an
# argument of `train`, not an entry of the parameter dict.
num_round = 200

# Booster parameters. `num_leaves` and `n_estimators` were removed: XGBoost
# reported them as "might not be used", so they had no effect on the model.
param = {'learning_rate': 0.01,
         'eval_metric': 'auc',
         'objective': 'reg:logistic'}

# Log the settings actually used for this run. The original referenced an
# undefined name `params` here, which fails on a fresh kernel.
wandb.config.update({**param, 'num_boost_round': num_round})

# Evaluate on both splits after every round so train/test AUC are logged.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
bst = xgb.train(
    param,
    xg_train,
    num_boost_round=num_round,
    evals=watchlist,
    callbacks=[wandb.xgboost.wandb_callback()],
)

# get prediction
pred = bst.predict(xg_test)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train-auc,0.90066
test-auc,0.86116
_step,199.0
_runtime,82.0
_timestamp,1610843719.0


0,1
train-auc,▁▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇██████
test-auc,▁▂▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


[34m[1mwandb[0m: wandb version 0.10.14 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Parameters: { n_estimators, num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.81772	test-auc:0.80625
[1]	train-auc:0.81822	test-auc:0.80646
[2]	train-auc:0.82011	test-auc:0.80911
[3]	train-auc:0.82024	test-auc:0.80889
[4]	train-auc:0.82134	test-auc:0.80990
[5]	train-auc:0.82194	test-auc:0.81027
[6]	train-auc:0.82286	test-auc:0.81097
[7]	train-auc:0.82342	test-auc:0.81137
[8]	train-auc:0.82356	test-auc:0.81131
[9]	train-auc:0.82361	test-auc:0.81138
[10]	train-auc:0.82452	test-auc:0.81233
[11]	train-auc:0.82468	test-auc:0.81232
[12]	train-auc:0.82551	test-auc:0.81312
[13]	train-auc:0.82598	test-auc:0.81346
[14]	train-auc:0.82625	test-auc:0.81391
[15]	train-auc:0.82653	test-auc:0.81413
[16]	train-auc:0.82684	test-auc:0.81453
[17]	train-auc:0.82720	test-au

In [44]:
# ROC AUC of the trained booster over every labelled row in `df_train`.
# `df_train` contains the rows the booster was fitted on (X_train is split
# from it), so this is not a hold-out score.
metrics.roc_auc_score(
    y_true=Y.values,
    y_score=bst.predict(xgb.DMatrix(data=df_train.values)),
)

0.8486230924711842

In [None]:
#Basic model final auc: 0.7917882139874278
#0.8263166791535951
#0.8928322559066909


In [45]:
# Score the unlabeled rows with the trained booster; one prediction per row
# of `df_pred`, in row order.
pred_proba = bst.predict(
    xgb.DMatrix(data=df_pred.values)
)

In [46]:
# Build the submission table: one row per unlabeled encounter, indexed by
# encounter_id, with a single 'diabetes_mellitus' prediction column.
submittion = pd.DataFrame([unlabeled.encounter_id, pred_proba]).T
# The transpose leaves the ids as a non-int32 dtype; cast them to int32.
submittion['encounter_id'] = submittion['encounter_id'].astype('int32')
submittion = submittion.set_index('encounter_id')
# Exactly one column remains after the ids moved into the index.
submittion.columns = ['diabetes_mellitus']
# Any missing prediction is written as 0.5.
submittion.fillna(0.5).to_csv('SolutionWiDS2021_XGB_200_epochs_01_lr.csv')