In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn import metrics
import xgboost as xgb

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [22]:
#!pip install xgboost

In [2]:
data_dictionary = pd.read_csv("DataDictionaryWiDS2021.csv")
unlabeled = pd.read_csv("UnlabeledWiDS2021.csv")
training = pd.read_csv("TrainingWiDS2021.csv")

In [3]:
column_datatype_mapping = dict(zip(data_dictionary['Variable Name'], data_dictionary['Data Type']))

In [4]:
del training['Unnamed: 0']
del unlabeled['Unnamed: 0']

In [5]:
all_data = training.drop(['encounter_id', 
                          'hospital_id', 
                          'diabetes_mellitus'], axis=1).append(unlabeled)

In [6]:
cat_cols = []
cont_cols = []
for col in all_data.columns:
    if all_data.dtypes[col] == "object":
        cat_cols.append(col)
        all_data[col] = all_data[col].fillna("NA")
        all_data[col] = LabelEncoder().fit_transform(all_data[col])
        all_data[col]= all_data[col].astype('category')
    elif column_datatype_mapping[col] == "binary":
        all_data[col] = all_data[col].fillna(-1)
    elif column_datatype_mapping[col] == "numeric":
        all_data[col] = all_data[col].fillna(0)
        cont_cols.append(col)
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())
        cont_cols.append(col)

In [7]:
df_train = all_data[:len(training)]
df_pred = all_data[len(training):].reset_index(drop=True)
Y = training['diabetes_mellitus']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(df_train, Y, test_size=0.20, random_state=42,shuffle=True )

In [18]:
lgbmc = LGBMClassifier()
lgbmc.fit(X_train, y_train)
lgbmc.score(X_val, y_val)

0.8400814382298709

In [19]:
lgbmc.fit(df_train, Y)
AUC_FINAL=metrics.roc_auc_score(Y.values, lgbmc.predict(df_train))
AUC_FINAL

0.727008908307974

In [30]:
xgbm = xgb.XGBClassifier()
xgbm.fit(X_train.values, y_train.values, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [35]:
xgbm.fit(df_train.values, Y.values)
metrics.roc_auc_score(Y.values, xgbm.predict(df_train.values))





0.7917882139874278

In [38]:
#submittion = pd.DataFrame([unlabeled.encounter_id,lgbmc.predict_proba(df_pred)[:,1]]).T

submittion = pd.DataFrame([unlabeled.encounter_id,xgbm.predict_proba(df_pred.values)[:,1]]).T
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.columns = ['diabetes_mellitus']
submittion.fillna(0.5).to_csv('SolutionWiDS2021_without_XGB.csv')