In [None]:
from pathlib import Path

import cudf
import numpy as np
import pandas as pd
import xgboost as xgb

from preprocessing_tools import drop_missing_train_test

In [None]:
# Read the training CSV onto the GPU; its first column is used as the index.
train = cudf.read_csv(
    "../data/TrainingWiDS2021.csv",
    index_col=0,
)
print(train.shape)  # (rows, columns) as a quick sanity check
train.head()

In [None]:
# Read the unlabeled CSV the same way as the training file (first column -> index).
test = cudf.read_csv(
    "../data/UnlabeledWiDS2021.csv",
    index_col=0,
)
print(test.shape)  # (rows, columns) as a quick sanity check
test.head()

## Split x and y data, excluding identifier and response columns

In [None]:
# Features: every column except the row identifier and the response.
# `drop` (without `inplace`) already returns a new frame, so an explicit
# `.copy()` beforehand would only duplicate the data a second time.
x_train = train.drop(["encounter_id", "diabetes_mellitus"], axis=1)
# Response: the `diabetes_mellitus` column of the training frame.
y_train = train.loc[:, "diabetes_mellitus"]

# Only the identifier is removed from the test frame.
x_test = test.drop(["encounter_id"], axis=1)

In [None]:
%%time
# Hand both feature frames to the project helper and rebind them to the frames
# it returns; `%%time` (which must stay on the first line) reports the cost.
# NOTE(review): the helper lives in preprocessing_tools, outside this notebook —
# presumably it removes columns/rows with missing values consistently across
# train and test; confirm, and confirm it never drops test rows (the submission
# cell pairs predictions with the original `test` rows).
x_train, x_test = drop_missing_train_test(x_train, x_test)

## Encode categorical columns

In [None]:
# Columns stored with dtype "object" are the ones treated as categorical.
categorical_cols = x_train.columns[x_train.dtypes == "object"].values

# Selecting columns yields new frames that are only read below, so the
# full-frame `.copy()` that used to precede the selection is unnecessary.
x_train_categorical = x_train[categorical_cols]
x_test_categorical = x_test[categorical_cols]

# Row counts are remembered so the stacked frame can be split again afterwards.
train_size = x_train.shape[0]
test_size = x_test.shape[0]

# Encode train and test together: stacking first means both parts end up with
# the same set of dummy columns, even for categories seen in only one of them.
categorical = cudf.concat([x_train_categorical, x_test_categorical], axis=0)
categorical = cudf.get_dummies(categorical)

# Split back by position: train rows were stacked first, test rows last.
x_train_categorical = categorical.head(train_size)
x_test_categorical = categorical.tail(test_size)

In [None]:
# Replace the raw categorical columns with their one-hot encoded counterparts.
# `drop` returns a new frame, so no explicit `.copy()` is needed first.
# NOTE: `x_train` / `x_test` are rebound here, so this cell is meant to run
# once per pass; re-run from the split cell onwards to start over.
x_train_numerical = x_train.drop(categorical_cols, axis=1)
x_train = cudf.concat([x_train_numerical, x_train_categorical], axis=1)

x_test_numerical = x_test.drop(categorical_cols, axis=1)
x_test = cudf.concat([x_test_numerical, x_test_categorical], axis=1)

## Create DMatrix objects for XGBoost

In [None]:
# Wrap the feature frames in XGBoost's DMatrix container.
# Only the training matrix carries labels; the test matrix is features only.
train_dmatrix = xgb.DMatrix(data=x_train, label=y_train)
test_dmatrix = xgb.DMatrix(data=x_test)

## XGBoost model

In [None]:
# Booster settings: binary logistic objective, GPU histogram tree method.
# NOTE(review): `max_depth` and `max_leaves` both bound tree size; whether
# `max_leaves` takes effect without `grow_policy="lossguide"` depends on the
# installed XGBoost version — confirm which limit is actually in force.
# NOTE(review): newer XGBoost releases deprecate "gpu_hist" in favour of
# `tree_method="hist"` plus `device="cuda"` — confirm against the pinned version.
params = {
    "objective": "binary:logistic",
    "max_depth": 20,
    "max_leaves": 15,
    "tree_method": "gpu_hist",
}

# Train for 120 boosting rounds on the training matrix (no evaluation set is passed).
model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=120,
)

In [None]:
# Score every row of the test matrix with the trained booster.
pred = model.predict(data=test_dmatrix)

In [None]:
# Gain-based importance reported by the trained booster, as a two-column
# table ordered from highest to lowest score.
# The (feature, score) pairs are materialised into a list before being handed
# to the DataFrame constructor, and the sort is chained instead of done
# in place so re-running the cell always rebuilds the table from scratch.
feature_importance = pd.DataFrame(
    list(model.get_score(importance_type="gain").items()),
    columns=["variable", "score"],
).sort_values("score", ascending=False)
feature_importance.head(10)

In [None]:
# Pair each test identifier with its predicted score.
submission = test.loc[:, ["encounter_id"]]
submission["diabetes_mellitus"] = pred

# Create the output folder first so the write cannot fail on a missing
# directory after all the training work above has already been done.
submission_path = "../submissions/baseline_without_missing.csv"
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)