In [None]:
from utils import spark_session

# Obtain the session object from the project's `spark_session` helper.
# NOTE(review): the string is presumably the Spark application name — confirm
# against utils.spark_session.
session = spark_session("purchase-suppression-tmp")

# Train/eval model

In [None]:
# Read the train and validation feature tables through the session and
# materialise each one as a pandas DataFrame via toPandas().
# NOTE(review): toPandas() presumably collects the full table into local
# memory — confirm both tables are small enough for that.
train_df = session.table("bx_ps_features_train").toPandas()
eval_df = session.table("bx_ps_features_valid").toPandas()

In [None]:
import xgboost as xgb
def train_eval_model(hyperp, feature_list, label, train_df, eval_df, verbose=True):
    """Fit an XGBoost classifier on ``train_df`` and score ``eval_df``.

    Args:
        hyperp: dict of keyword arguments forwarded to ``xgb.XGBClassifier``.
        feature_list: list of column names used as model inputs.
        label: name of the target column in ``train_df``.
        train_df: DataFrame holding the training rows.
        eval_df: DataFrame holding the rows to score; it is NOT modified.
        verbose: forwarded to ``XGBClassifier.fit``.

    Returns:
        ``(model, scored_df)`` where ``scored_df`` is a copy of ``eval_df``
        with a ``'preds'`` column (hard class labels from ``predict``) and,
        for two-class models, a ``'proba'`` column holding the second column
        of ``predict_proba`` (the positive-class probability). Use ``'proba'``
        for ranking metrics such as ROC AUC.
    """
    model = xgb.XGBClassifier(**hyperp)
    model.fit(train_df[feature_list], train_df[label], verbose=verbose)

    # Score a copy so the caller's frame is not mutated as a side effect;
    # re-running a cell that calls this function stays idempotent.
    scored_df = eval_df.copy()
    eval_features = scored_df[feature_list]
    scored_df['preds'] = model.predict(eval_features)

    # Thresholded labels throw away the ranking information; keep the
    # continuous score next to them. Only well-defined as a single column
    # in the binary case.
    class_proba = model.predict_proba(eval_features)
    if class_proba.shape[1] == 2:
        scored_df['proba'] = class_proba[:, 1]
    return model, scored_df

# Model inputs are every training column carrying the "f_" prefix.
feature_list = [col for col in train_df.columns if col.startswith("f_")]

# Keyword arguments forwarded verbatim to xgb.XGBClassifier.
# NOTE(review): this dict previously set BOTH 'learning_rate': 0.01 and
# 'eta': 0.05. They are two names for the same XGBoost parameter, so the
# effective step size was ambiguous. Only 'learning_rate' is kept here —
# confirm 0.01 (not 0.05) is the intended value.
hyperp = {
    'tree_method': 'hist',
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'random_state': 42,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'n_estimators': 500,
    'subsample': 0.75,
    'lambda': 100,  # native-API alias of the L2 regularisation term (reg_lambda)
}

model, eval_df = train_eval_model(hyperp, feature_list, "label", train_df, eval_df)

# Metrics

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, classification_report

# Hard class predictions drive the threshold-dependent metrics below.
y_test, y_pred = eval_df['label'], eval_df['preds']

# ROC AUC is a ranking metric: it must be computed from continuous scores.
# Feeding it thresholded 0/1 predictions collapses the curve to a single
# operating point and understates the model's AUC.
y_score = model.predict_proba(eval_df[feature_list])[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

# For single-label classification, micro-averaged precision and recall are
# both equal to accuracy; the macro averages weight the two classes equally.
precision_micro = precision_score(y_test, y_pred, average='micro')
recall_micro = recall_score(y_test, y_pred, average='micro')
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')

label_counts = eval_df['label'].value_counts()
print("Label counts:")
print(label_counts)

# Separate name so `label_counts` keeps meaning "counts of the label column".
label_type_counts = eval_df['label_type'].value_counts()
print("Label type counts:")
print(label_type_counts)

print("Metrics:")
print(f'ROC AUC: {roc_auc}')
print(f'Precision (micro): {precision_micro}')
print(f'Recall (micro): {recall_micro}')
print(f'Precision (macro): {precision_macro}')
print(f'Recall (macro): {recall_macro}')
report = classification_report(y_test, y_pred, target_names=['0', '1'])
print(report)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Compute ROC curve and ROC area.
# roc_curve sweeps a decision threshold over the scores, so it needs the
# positive-class probability. With hard 0/1 predictions there is only one
# usable threshold and the "curve" degenerates to three points.
roc_scores = model.predict_proba(eval_df[feature_list])[:, 1]
fpr, tpr, _ = roc_curve(eval_df['label'], roc_scores)
roc_auc = auc(fpr, tpr)

# Plot the model's curve against the chance diagonal.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Feature analysis

In [None]:
# feature importance
import numpy as np

print("Feature importance of ranker:")
importances = model.feature_importances_
# Positions into feature_list, most important feature first.
indices = np.argsort(importances)[::-1]

# Iterate the sorted positions directly. `importances` has one entry per
# fitted feature, so no bound check against the training frame's column
# count is needed.
feature_order = [feature_list[idx] for idx in indices]
for rank, (name, score) in enumerate(zip(feature_order, importances[indices]), start=1):
    print("%2d) %-*s %f" % (rank, 30, name, score))

In [None]:
# leakage analysis
# correlations to detect features with perfect correlation to labels
top_30_features = feature_order[0:30]
correlations = train_df[top_30_features + ['label']].corr()
label_correlations = correlations['label']

print(label_correlations)

# The label distributions of the evaluation set do not depend on which
# feature is being probed, so report them once instead of once per feature.
print("Label counts:")
print(eval_df['label'].value_counts())
print("Label type counts:")
print(eval_df['label_type'].value_counts())

# ROC analysis - fitting a model on a single feature at a time.
# A single feature that reaches a near-perfect AUC on its own is a leakage
# suspect.
print("\nROC Analysis")
for feature_name in top_30_features:
    single_feature = [feature_name]

    # Work on a narrow copy and on loop-local names so the full model's
    # `eval_df`, `y_test`, `y_pred` and `roc_auc` are not overwritten by the
    # last single-feature model (hidden state for any cell re-run later).
    single_eval_df = eval_df[[feature_name, 'label']].copy()
    single_feature_model, single_eval_df = train_eval_model(
        hyperp, single_feature, "label", train_df, single_eval_df
    )

    # Look the correlation up by name rather than by loop position.
    print(f"Feature: {feature_name} ({label_correlations.loc[feature_name]})")

    sf_true, sf_pred = single_eval_df['label'], single_eval_df['preds']
    # AUC needs continuous scores, not thresholded class labels.
    sf_score = single_feature_model.predict_proba(single_eval_df[single_feature])[:, 1]
    sf_roc_auc = roc_auc_score(sf_true, sf_score)
    sf_precision_micro = precision_score(sf_true, sf_pred, average='micro')
    sf_recall_micro = recall_score(sf_true, sf_pred, average='micro')
    sf_precision_macro = precision_score(sf_true, sf_pred, average='macro')
    sf_recall_macro = recall_score(sf_true, sf_pred, average='macro')

    print("Metrics:")
    print(f'ROC AUC: {sf_roc_auc}')
    print(f'Precision (micro): {sf_precision_micro}')
    print(f'Recall (micro): {sf_recall_micro}')
    print(f'Precision (macro): {sf_precision_macro}')
    print(f'Recall (macro): {sf_recall_macro}')
    print(classification_report(sf_true, sf_pred, target_names=['0', '1']))
    print("****************************************************************\n")
    

In [None]:
# feature distributions
import matplotlib.pyplot as plt

# One 50-bin histogram of the training values per top-ranked feature,
# each rendered as its own figure.
top_30_features = feature_order[:30]
for feature_name in top_30_features:
    ax = train_df[feature_name].hist(bins=50)
    ax.set_title(feature_name)
    ax.set_xlabel('Feature Values')
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Stop the session once the analysis is finished. The truthiness guard skips
# the call when `session` is falsy (e.g. None).
if session:
    session.stop()