In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from tools import make_submission

In [None]:
# --- Inputs -----------------------------------------------------------------
# Pickle that the loading cell opens and unpacks into the feature/label splits.
DATA_PATH = "../data/data_remove_redundant.pkl"
# CSV handed to make_submission together with the test predictions.
TEST_PATH = "../data/UnlabeledWiDS2021.csv"

# --- Outputs ----------------------------------------------------------------
# Directory and file name that are joined to form the submission path.
SUBMISSION_PATH = "../submissions"
SUB_FILE = "xgb_remove_redundant_weighted.csv"

## Loading data

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point DATA_PATH at a pickle produced by this project.
with open(DATA_PATH, "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Train/validation split, used for the learning curves further down.
x_train, y_train = data["x_train"], data["y_train"]
x_val, y_val = data["x_val"], data["y_val"]

# Set used for the final fit (presumably train + val combined -- confirm
# against the script that wrote the pickle) and the features to score.
x_train_val, y_train_val = data["x_train_val"], data["y_train_val"]
x_test = data["x_test"]

In [None]:
# Unweighted DMatrix objects.
# The train/val pair feeds the evaluation run.
train_dmatrix = xgb.DMatrix(data=x_train, label=y_train)
val_dmatrix = xgb.DMatrix(data=x_val, label=y_val)

# The train_val/test pair is for the final fit and for scoring; the test
# features carry no label.
train_val_dmatrix = xgb.DMatrix(data=x_train_val, label=y_train_val)
test_dmatrix = xgb.DMatrix(data=x_test)

## Training on the train/validation split

In [None]:
# Number of boosting rounds for the baseline (unweighted) run.
N_TREES = 100

# Baseline booster configuration. Both metrics are recorded per round so the
# next cell can plot them.
# NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build -- confirm the
# target environment has one.
params = dict(
    objective="binary:logistic",
    max_depth=20,
    max_leaves=15,
    eval_metric=["auc", "logloss"],
    tree_method="gpu_hist",
)

# xgb.train fills eval_history with one entry per name in `evals`.
eval_history = {}
evals = [(train_dmatrix, 'train'), (val_dmatrix, 'val')]

model = xgb.train(
    params,
    train_dmatrix,
    num_boost_round=N_TREES,
    evals=evals,
    evals_result=eval_history,
    verbose_eval=False,
)

In [None]:
# Score the validation split with the model trained in the previous cell.
val_pred = model.predict(val_dmatrix)
print("AUC:", roc_auc_score(y_val, val_pred))

# One x-axis point per boosting round recorded in eval_history.
n_epochs = len(eval_history["train"]["auc"])
epochs = range(n_epochs)

# One figure per recorded metric, train and validation curves overlaid.
for metric in ("auc", "logloss"):
    fig, ax = plt.subplots()
    for split, curve_label in (("train", "Train"), ("val", "Val")):
        ax.plot(epochs, eval_history[split][metric], label=curve_label)
    ax.legend()
    plt.title(metric)
    plt.show()

## Class Weights

In [None]:
# "Balanced" class weights computed from the labels of the full labelled set.
# `classes` and `y` are keyword-only in current scikit-learn releases, so they
# are passed by name; the positional form raises TypeError there.
classes = np.unique(y_train_val)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_val)

# Lookup table: one row per class label with its weight. The label column is
# taken from `classes` (not a hard-coded [0, 1]) so each weight is guaranteed
# to sit next to the label it was computed for.
df_weights = pd.DataFrame({"weight": class_weights, "diabetes_mellitus": classes})
df_weights

In [None]:
def _row_weights(labels):
    """Return the class weight of every row in `labels`, in the original row order.

    `labels` is anything `pd.DataFrame(...)` turns into a frame with a
    "diabetes_mellitus" column (the same requirement the previous merge had).
    The result is a Series named "weight" with a fresh 0..n-1 index.

    A label lookup via `.map` is used instead of `DataFrame.merge`: an inner
    merge does not guarantee the left row order on pandas releases before 2.2
    (rows come back grouped by key), which would attach weights to the wrong
    samples once they are passed to xgb.DMatrix positionally.
    """
    weight_by_class = df_weights.set_index("diabetes_mellitus")["weight"]
    label_column = pd.DataFrame(labels)["diabetes_mellitus"]
    return label_column.map(weight_by_class).reset_index(drop=True).rename("weight")


w_train_val = _row_weights(y_train_val)
w_train = _row_weights(y_train)
w_val = _row_weights(y_val)

In [None]:
# Class-weighted counterparts of the earlier DMatrix objects: same features
# and labels, plus a per-row weight.
w_train_dmatrix = xgb.DMatrix(data=x_train, label=y_train, weight=w_train)
w_val_dmatrix = xgb.DMatrix(data=x_val, label=y_val, weight=w_val)

w_train_val_dmatrix = xgb.DMatrix(data=x_train_val, label=y_train_val, weight=w_train_val)
# The test features have neither labels nor weights.
w_test_dmatrix = xgb.DMatrix(data=x_test)

In [None]:
# Number of boosting rounds for the class-weighted run.
N_TREES = 100

# Configuration for the weighted run: the baseline settings plus
# min_child_weight and gamma. These rebind `params`, `evals`, `eval_history`
# and `model` from the baseline cells.
params = dict(
    objective="binary:logistic",
    max_depth=20,
    max_leaves=15,
    min_child_weight=5,
    gamma=5,
    eval_metric=["auc", "logloss"],
    tree_method="gpu_hist",
)

# xgb.train fills eval_history with one entry per name in `evals`.
eval_history = {}
evals = [(w_train_dmatrix, 'train'), (w_val_dmatrix, 'val')]

model = xgb.train(
    params,
    w_train_dmatrix,
    num_boost_round=N_TREES,
    evals=evals,
    evals_result=eval_history,
    verbose_eval=False,
)

In [None]:
# Score the validation split with the weighted model. val_dmatrix wraps the
# same x_val features as w_val_dmatrix, and roc_auc_score is called without
# sample weights, so this AUC is the plain (unweighted) one.
val_pred = model.predict(val_dmatrix)
print("AUC:", roc_auc_score(y_val, val_pred))

# One x-axis point per boosting round recorded in eval_history.
n_epochs = len(eval_history["train"]["auc"])
epochs = range(n_epochs)

# One figure per recorded metric, train and validation curves overlaid.
for metric in ("auc", "logloss"):
    fig, ax = plt.subplots()
    for split, curve_label in (("train", "Train"), ("val", "Val")):
        ax.plot(epochs, eval_history[split][metric], label=curve_label)
    ax.legend()
    plt.title(metric)
    plt.show()

## Feature Importance

In [None]:
# Gain-based importance as returned by the booster: {feature name: score}.
feature_importance = model.get_score(importance_type='gain')

# Stored under its own name: the previous code rebound `data`, which is the
# dict loaded from the pickle, so re-running the unpacking cell afterwards
# failed until the pickle was reloaded.
feature_importance_df = pd.DataFrame(
    data=list(feature_importance.values()),
    index=list(feature_importance.keys()),
    columns=["score"],
).sort_values(by="score", ascending=False)

# Show the 15 highest-scoring features.
feature_importance_df.head(15)

## Training with all data

In [None]:
# Final fit on the full labelled set with the most recent `params`/`N_TREES`.
# Train on the class-weighted DMatrix: the configuration validated above and
# the submission file name are both the weighted variant, whereas the
# unweighted train_val_dmatrix used here before dropped the class weights.
model = xgb.train(params, w_train_val_dmatrix, num_boost_round=N_TREES)

# Predictions for the unlabeled test features.
test_pred = model.predict(test_dmatrix)

In [None]:
# Full path of the submission file: <SUBMISSION_PATH>/<SUB_FILE>.
sub_name = os.path.join(
    SUBMISSION_PATH,
    SUB_FILE,
)

# Hand the predictions, the unlabeled CSV path and the output path to the
# project helper (its behaviour is defined in tools.py, not shown here).
make_submission(
    test_pred,
    TEST_PATH,
    sub_name,
)