# HDS (Historical Data Set) Analysis with XGBoost

This notebook demonstrates how to work with a sample HDS dataset, explore its structure, and build a simple XGBoost model to predict outcomes.

In [None]:
# Import necessary libraries
import polars as pl
import polars.selectors as cs
import plotly.express as px
import plotly.graph_objects as go
import zipfile
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import great_tables as gt
from pdstools.utils import cdh_utils

## Data Read

Read the HDS data. Depending on how the data was extracted and stored, you may need different reading methods using
zipfile and/or Polars.

We're also casting the data to the appropriate types. 

In [None]:
# Read every NDJSON member of the zipped HDS export. The context manager
# guarantees the zip handle is closed, also when parsing a member fails.
with zipfile.ZipFile("../../data/hds.zip", "r") as archive:
    hds_raw = pl.concat(
        [
            pl.read_ndjson(archive.open(member))
            for member in archive.namelist()
            if not member.endswith("/")  # directory entries carry no data
        ]
    )

# Rename the customer key and cast the fields to the types used for modeling.
hds_data = hds_raw.rename({"Customer_C_CIFNBR": "Customer_ID"}).with_columns(
    # Parse the two decision timestamps with the pdstools helper.
    cdh_utils.parse_pega_date_time_formats("Decision_DecisionTime"),
    cdh_utils.parse_pega_date_time_formats("Decision_OutcomeTime"),
    # Every "..._DaysSince" / "..._pyHistoricalOutcomeCount" field becomes a float.
    cs.ends_with("_DaysSince", "_pyHistoricalOutcomeCount").cast(pl.Float64),
    # NOTE(review): the two date fields are cast to Float64 like the other
    # numerics - confirm this matches how the export encodes them.
    pl.col(
        [
            "Customer_NetWealth",
            "Customer_CreditScore",
            "Customer_CLV_VALUE",
            "Customer_RelationshipStartDate",
            "Customer_Date_of_Birth",
            # "Customer_TotalLiabilities",
        ]
    ).cast(pl.Float64),
    cs.starts_with("Param_ExtGroup").cast(pl.Float64),
    pl.col(["Customer_NoOfDependents"]).cast(pl.Float64),
)
hds_data.describe()

## Available fields

See the Pega documentation for an overview of the fields available in the HDS.

In [None]:
# One row per HDS field: its name, whether its dtype is numeric, and a category.
field_names = hds_data.schema.names()
field_is_numeric = [dtype.is_numeric() for dtype in hds_data.schema.dtypes()]

# The category is the part of the field name before the first underscore
# (e.g. "Customer_NetWealth" -> "Customer"); names without an underscore are
# labelled "Internal".
has_prefix = pl.col("Field").str.contains("_", literal=True)
field_prefix = pl.col("Field").str.replace(r"([^_]+)_.*", "${1}")

hds_data_dictionary = (
    pl.DataFrame({"Field": field_names, "Numeric": field_is_numeric})
    .with_columns(
        Category=pl.when(has_prefix).then(field_prefix).otherwise(pl.lit("Internal"))
    )
    .sort("Category")
)
hds_data_dictionary.to_pandas().style.hide()


In [None]:
# Count the fields per (category, numeric / non-numeric) combination.
category_counts = (
    hds_data_dictionary.group_by("Category", "Numeric")
    .agg(Count=pl.len())
    .sort("Category")
)

# Horizontal bars: one bar per category, colored by numeric vs. non-numeric.
fig = px.bar(
    category_counts,
    x="Count",
    y="Category",
    color="Numeric",
    text="Count",
    orientation="h",
    title="Number of Fields by Category",
)
fig.update_layout(xaxis_title="Number of Fields", yaxis_title="Field Category")
fig.show()

## Create XGBoost model 

In [None]:
def _predictor_fields(data_dictionary, numeric):
    """List the predictor fields that are numeric (or non-numeric).

    Fields in the "Internal" and "Decision" categories are never returned.
    """
    kind = pl.col("Numeric") if numeric else ~pl.col("Numeric")
    return (
        data_dictionary.filter(kind)
        .filter((pl.col("Category") != "Internal") & (pl.col("Category") != "Decision"))
        .select("Field")
        .to_series()
        .to_list()
    )


def data_prep(data, data_dictionary):
    """Encode the data and split it into train and test sets.

    Parameters
    ----------
    data : pl.DataFrame
        Frame holding the predictor fields and a "Decision_Outcome" column.
    data_dictionary : pl.DataFrame
        Frame with the columns "Field", "Numeric" and "Category" describing
        the fields of ``data``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, le_target, feature_cols)`` where
        ``le_target`` is the fitted LabelEncoder of the outcome and
        ``feature_cols`` lists the model feature columns: the label-encoded
        categorical fields (suffixed "_encoded") followed by the numeric fields.
    """
    categorical_fields = _predictor_fields(data_dictionary, numeric=False)
    numerical_fields = _predictor_fields(data_dictionary, numeric=True)

    print(f"Categorical fields: {categorical_fields}")
    print(f"Numerical fields: {numerical_fields}")

    # Convert to pandas for easier encoding with sklearn. to_pandas() already
    # returns a new frame, so no extra copy is needed before adding columns.
    model_data = data.to_pandas()

    # Simple label encoding for the categorical features
    for col in categorical_fields:
        if col != "Decision_Outcome":
            # Fill missing values, then force everything to str: a non-string
            # column (e.g. booleans with nulls) would otherwise hold mixed
            # types after the fill, which LabelEncoder refuses to encode.
            model_data[col] = model_data[col].fillna("missing").astype(str)
            le = LabelEncoder()
            model_data[col + "_encoded"] = le.fit_transform(model_data[col])

    # Encode target variable
    le_target = LabelEncoder()
    model_data["target"] = le_target.fit_transform(model_data["Decision_Outcome"])

    # Show target encoding
    target_mapping = dict(zip(le_target.classes_, range(len(le_target.classes_))))
    print(f"\nTarget encoding: {target_mapping}")

    # Select features and target
    feature_cols = [
        col for col in model_data.columns if col.endswith("_encoded")
    ] + numerical_fields
    X = model_data[feature_cols]
    y = model_data["target"]

    # Split the data (fixed seed so the split is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {X_train.shape[0]} samples")
    print(f"Test set size: {X_test.shape[0]} samples")

    return X_train, X_test, y_train, y_test, le_target, feature_cols


X_train, X_test, y_train, y_test, target_encoder, feature_cols = data_prep(hds_data, hds_data_dictionary)

In [None]:
def create_classifier(X_train, X_test, y_train, y_test, target_names=None):
    """Train an XGBoost classifier and print its test-set AUC and report.

    Parameters
    ----------
    X_train, X_test : feature frames for training and evaluation.
    y_train, y_test : encoded target values matching the feature frames.
    target_names : sequence of str, optional
        Class names for the classification report, ordered by encoded label.
        When omitted, the classes of the notebook-level ``target_encoder``
        are used.

    Returns
    -------
    XGBClassifier
        The fitted model.
    """
    if target_names is None:
        # Backward-compatible fallback on the notebook-level encoder.
        target_names = target_encoder.classes_

    # Create and train the model
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)

    # AUC is a ranking metric, so it must be computed from the predicted
    # probabilities; hard 0/1 predictions collapse the ROC curve to one point.
    y_score = xgb_model.predict_proba(X_test)
    if y_score.shape[1] == 2:
        auc = roc_auc_score(y_test, y_score[:, 1])
    else:
        # More than two outcomes: one-vs-rest AUC over all classes.
        auc = roc_auc_score(
            y_test, y_score, multi_class="ovr", labels=xgb_model.classes_
        )
    print(f"Model AUC: {round(auc, 5)}")

    # Classification report (based on the hard class predictions)
    y_pred = xgb_model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))

    return xgb_model

classifier = create_classifier(
    X_train, X_test, y_train, y_test, target_names=target_encoder.classes_
)

In [None]:
def show_feature_imp(classifier, feature_names=None, top_n=20):
    """Plot the most important features of a fitted classifier.

    Parameters
    ----------
    classifier : fitted model exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Names of the model features, in the order the model was trained on.
        When omitted, the notebook-level ``feature_cols`` is used.
    top_n : int, default 20
        Number of features to show.
    """
    if feature_names is None:
        # Backward-compatible fallback on the notebook-level feature list.
        feature_names = feature_cols

    importances = classifier.feature_importances_

    # One row per feature, with the "_encoded" suffix stripped and the
    # category taken from the part of the name before the first underscore.
    feature_importance_df = (
        pl.DataFrame(
            {"Feature": list(feature_names), "Importance": importances.tolist()}
        )
        .with_columns(Feature=pl.col("Feature").str.replace(r"_encoded$", ""))
        .with_columns(
            Category=pl.when(pl.col("Feature").str.contains("_", literal=True))
            .then(pl.col("Feature").str.replace(r"([^_]+)_.*", "${1}"))
            .otherwise(pl.lit("Internal"))
        )
        .sort("Importance", descending=True)
    )
    top_features = feature_importance_df.head(top_n)

    # Plot feature importances
    fig = px.bar(
        top_features,
        x="Importance",
        y="Feature",
        orientation="h",
        title="Feature Importance",
        color="Category",
    )

    fig.update_layout(
        xaxis_title="Importance",
        yaxis_title="Feature",
        yaxis=dict(
            autorange="reversed",
            dtick=1,
            type="category",
            # Coloring by category draws one trace per category, which would
            # group the bars by category; pin the axis to the importance order.
            categoryorder="array",
            categoryarray=top_features["Feature"].to_list(),
        ),
    )
    fig.show()

show_feature_imp(classifier, feature_names=feature_cols)


# Finding new Features

Now, suppose you have some external data from your data lake that you want to consider adding to Pega to improve the performance of your models.

If you have such data, you can merge it with the HDS data and run the model again to see how these features fare against what ADM already uses.

Such data is typically time-stamped, and we need to be careful to only pull in data from before the decisions were made. 

## External data example



We're simulating some features from an external data lake here. One is a "bad" feature that contains only random values; the other is artificially constructed from the outcome to be a very good feature.

We redo the data prep and model building, and expect the good feature to appear at the top of the feature importance list and the bad feature near the bottom.

In [None]:
import random

# Seed the generator so the simulated features (and everything trained on
# them) are identical on every run, like the rest of the notebook.
random.seed(42)

datalake_fake_data = (
    hds_data.with_columns(
        # Pure noise: unrelated to the outcome.
        DataLake_BadFeature=pl.Series(
            [random.random() for _ in range(hds_data.height)]
        ),
        # 0.9 for accepted decisions plus a little noise: almost the outcome itself.
        DataLake_GoodFeature=(pl.col("Decision_Outcome") == "Accepted") * 0.9
        + pl.Series([random.random() for _ in range(hds_data.height)]) * 0.1,
    )
    .select(
        [
            pl.col("Customer_ID"),
            # Keep one time stamp per day
            pl.col("Decision_DecisionTime").dt.truncate("1d"),
            pl.col("DataLake_BadFeature"),
            pl.col("DataLake_GoodFeature"),
        ]
    )
    # One row per customer per day, averaging the simulated features
    .group_by(["Customer_ID", "Decision_DecisionTime"])
    .agg(cs.all().mean())
    .sort(["Customer_ID", "Decision_DecisionTime"])
)

datalake_fake_data

In [None]:
# join_asof matches each decision with the latest data lake row at or before
# the decision time for the same customer. It requires both frames to be
# sorted on the "on" key within each "by" group; the data lake frame already
# is, so sort the HDS data the same way before joining.
augmented_data = hds_data.sort(["Customer_ID", "Decision_DecisionTime"]).join_asof(
    datalake_fake_data, on="Decision_DecisionTime", by="Customer_ID"
)

# Extend the data dictionary with the two new numeric data lake fields.
augmented_data_dictionary = pl.concat(
    [
        hds_data_dictionary,
        pl.DataFrame(
            {
                "Field": ["DataLake_BadFeature", "DataLake_GoodFeature"],
                "Numeric": [True, True],
                "Category": ["DataLake", "DataLake"],
            }
        ),
    ]
)

In [None]:
# Repeat prep, training and the importance plot on the augmented data.
# The notebook-level names are reassigned on purpose: create_classifier and
# show_feature_imp look up target_encoder and feature_cols at module level.
augmented_split = data_prep(augmented_data, augmented_data_dictionary)
X_train, X_test, y_train, y_test, target_encoder, feature_cols = augmented_split

classifier = create_classifier(X_train, X_test, y_train, y_test)
show_feature_imp(classifier)