In [74]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow import keras
from tensorflow.keras import layers

In [75]:
TARGET_COLUMN_NAME = "label"
WEIGHT_COLUMN_NAME = None
NUMERIC_FEATURE_NAMES = [
    'releaseYear',
    'movieRatingCount',
    'movieAvgRating',
    'movieRatingStddev',
    'userRatingCount',
    'userAvgRating',
    'userRatingStddev'
]
CATEGORICAL_FEATURE_NAMES = [
    'userGenre1',
    'userGenre2',
    'userGenre3',
    'userGenre4',
    'userGenre5',
    'movieGenre1',
    'movieGenre2',
    'movieGenre3',
]
# Maximum number of decision trees. The effective number of trained trees can be smaller if early stopping is enabled.
NUM_TREES = 100
# Minimum number of examples in a node.
MIN_EXAMPLES = 6
# Maximum depth of the tree. max_depth=1 means that all trees will be roots.
MAX_DEPTH = 5
# Ratio of the dataset (sampling without replacement) used to train individual trees for the random sampling method.
SUBSAMPLE = 0.65
# Control the sampling of the datasets used to train individual trees.
SAMPLING_METHOD = "RANDOM"
# Ratio of the training dataset used to monitor the training. Require to be >0 if early stopping is enabled.
VALIDATION_RATIO = 0.1


def prepare_dataframe(dataframe):
    # Cast the categorical features to string.
    for feature_name in CATEGORICAL_FEATURE_NAMES:
        dataframe[feature_name] = dataframe[feature_name].astype(str)


def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        train_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
    )    
    test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
    )
    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy, roc, pr = gbt_model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 3)}%,\n"
          f"ROC AUC: {round(roc, 3)},\n"
          f"PR AUC: {round(pr, 3)}")


def specify_feature_usages():
    feature_usages = []

    for feature_name in NUMERIC_FEATURE_NAMES:
        feature_usage = tfdf.keras.FeatureUsage(
            name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL
        )
        feature_usages.append(feature_usage)

    for feature_name in CATEGORICAL_FEATURE_NAMES:
        feature_usage = tfdf.keras.FeatureUsage(
            name=feature_name, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL
        )
        feature_usages.append(feature_usage)

    return feature_usages


def create_gbt_model(preprocessor=None, postprocessor=None):
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        preprocessing=preprocessor,
        postprocessing=postprocessor,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
    )

    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy"),
                               keras.metrics.AUC(curve="ROC"),
                               keras.metrics.AUC(curve="PR")])
    return gbt_model


def create_model_inputs():
    inputs = {}

    for feature_name in NUMERIC_FEATURE_NAMES:
        inputs[feature_name] = layers.Input(
            name=feature_name, shape=(), dtype=tf.float32
        )

    for feature_name in CATEGORICAL_FEATURE_NAMES:
        inputs[feature_name] = layers.Input(
            name=feature_name, shape=(), dtype=tf.string
        )

    return inputs


def create_embedding_encoder(size=None):
    inputs = create_model_inputs()
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Get the vocabulary of the categorical feature.
            vocabulary = sorted(
                [str(value) for value in list(train_data[feature_name].unique())]
            )
            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = layers.StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            # Convert the string input values into integer indices.
            value_index = lookup(inputs[feature_name])
            # Create an embedding layer with the specified dimensions
            vocabulary_size = len(vocabulary)
            embedding_size = int(math.sqrt(vocabulary_size))
            feature_encoder = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_size
            )
            # Convert the index values to embedding representations.
            encoded_feature = feature_encoder(value_index)
        else:
            # Expand the dimensions of the numerical input feature and use it as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)
        # Add the encoded feature to the list.
        encoded_features.append(encoded_feature)
    # Concatenate all the encoded features.
    encoded_features = layers.concatenate(encoded_features, axis=1)
    # Apply dropout.
    encoded_features = layers.Dropout(rate=0.25)(encoded_features)
    # Perform non-linearity projection.
    encoded_features = layers.Dense(
        units=size if size else encoded_features.shape[-1], activation="gelu"
    )(encoded_features)
    # Create and return a Keras model with encoded features as outputs.
    return keras.Model(inputs=inputs, outputs=encoded_features)


def create_nn_model(encoder):
    inputs = create_model_inputs()
    embeddings = encoder(inputs)
    output = layers.Dense(units=1, activation="sigmoid")(embeddings)

    nn_model = keras.Model(inputs=inputs, outputs=output)
    nn_model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy("accuracy")],
    )
    return nn_model

In [76]:
def leaves_index_to_onehot(prediction):
    sample_size, num_trees = prediction.shape
    num_leaves = np.max(prediction) + 1
    transformed_training_matrix = np.zeros([sample_size, num_trees * num_leaves],
                                       dtype=np.int64)
    for i in range(0, sample_size):
        temp = np.arange(num_trees) * num_leaf + np.array(prediction[i])
        transformed_training_matrix[i][temp] += 1
    return transformed_training_matrix

In [77]:
def create_logistic_model(num_trees=NUM_TREES, num_leaves=2**(MAX_DEPTH-1)):
    model = keras.models.Sequential()
    model.add(layers.Dense(1, activation = 'sigmoid', input_dim=num_trees*num_leaves))
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.BinaryCrossentropy(),
                  metrics=[keras.metrics.BinaryAccuracy(name="accuracy"),
                          keras.metrics.AUC(curve="ROC"),
                          keras.metrics.AUC(curve="PR")])
    return model

In [78]:
training_samples_file_path = "../data/trainingSamples.csv"
test_samples_file_path = "../data/testSamples.csv"
train_data = pd.read_csv(training_samples_file_path)
test_data = pd.read_csv(test_samples_file_path)
prepare_dataframe(train_data)
prepare_dataframe(test_data)

In [79]:
# GBDT classifier with feature embedding
embedding_encoder = create_embedding_encoder(size=64)
run_experiment(
    create_nn_model(embedding_encoder),
    train_data,
    test_data,
    num_epochs=5,
    batch_size=256,
)

  return bool(asarray(a1 == a2).all())


Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 65.08%,
ROC AUC: 0.709,
PR AUC: 0.751


In [80]:
gbt_model = create_gbt_model(embedding_encoder)
run_experiment(gbt_model, train_data, test_data)

Use /var/folders/yn/k_cr94fd387chb6sg2ttcnzc0000gn/T/tmpen1s0x77 as temporary training directory




Reading training dataset...


  inputs = self._flatten_to_reference_inputs(inputs)


Training dataset read in 0:00:00.892908. Found 88827 examples.
Training model...
Model trained in 0:00:06.795492
Compiling model...
Model compiled.


[INFO 24-01-05 11:10:08.8341 CST kernel.cc:1233] Loading model from path /var/folders/yn/k_cr94fd387chb6sg2ttcnzc0000gn/T/tmpen1s0x77/model/ with prefix 3a0e7bd7b6834ee1
[INFO 24-01-05 11:10:08.8376 CST abstract_model.cc:1344] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 24-01-05 11:10:08.8376 CST kernel.cc:1061] Use fast generic engine


Test accuracy: 64.354%,
ROC AUC: 0.7,
PR AUC: 0.746


In [81]:
train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
)
test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
)

In [82]:
train_lr = leaves_index_to_onehot(gbt_model.predict_get_leaves(train_dataset))
test_lr = leaves_index_to_onehot(gbt_model.predict_get_leaves(test_dataset))

[INFO 24-01-05 11:10:09.4878 CST kernel.cc:1233] Loading model from path /var/folders/yn/k_cr94fd387chb6sg2ttcnzc0000gn/T/tmpen1s0x77/model/ with prefix 3a0e7bd7b6834ee1
[INFO 24-01-05 11:10:09.4914 CST kernel.cc:1079] Use slow generic engine
  inputs = self._flatten_to_reference_inputs(inputs)


In [83]:
lr_model = create_logistic_model()
lr_model.fit(train_lr, train_data.label, epochs=10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2906b7850>

In [84]:
_, accuracy, roc, pr = lr_model.evaluate(test_lr, test_data.label, verbose=0)
print(f"Test accuracy: {round(accuracy * 100, 2)}%,\n"
      f"ROC AUC: {round(roc, 3)},\n"
      f"PR AUC: {round(pr, 3)}")

Test accuracy: 65.28%,
ROC AUC: 0.71,
PR AUC: 0.749
