# Preamble

In [1]:
# Imports
## General
import numpy as np
import os
import sys

## In order to run calculations on AWS GPU, need to explicitly specify CUDA lib directory in the environment variables
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/home/sagemaker-user/.conda/envs/mlds_gpu"

## Data manipulation and preprocessing
import pandas as pd
import boto3
from tensorflow.keras.layers import StringLookup, Normalization

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Image

## Modelling
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
import tensorflow as tf

## Import DeepCTR code
## This is done by cloning the github repository instead of installing with pip. This is because of an incompatibility issue
## with TF 2.14 that I had to manually fix in the DeepCTR code
deepctr_path = '/home/sagemaker-user/drl-ad-personalization/DeepCTR'
if deepctr_path not in sys.path:
    sys.path.append(deepctr_path)

from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.models.dcn import DCN

2024-08-16 08:43:30.341490: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-16 08:43:30.341534: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-16 08:43:30.341545: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-16 08:43:30.617109: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Pretraining

## Data Extraction

In [2]:
# Retrieve the pretraining train and validation datasets

# Per-column dtypes handed to the CSV reader (one entry per column). Kept in a single
# place so the train and validation readers cannot drift apart.
KDD12_PRETRAINING_COLUMN_DEFAULTS = [
    'int32','int32','string','string','string','int32','int32',
    'string','string','string','string','string'
]


def _load_pretraining_split(file_pattern):
    """Create an unshuffled, single-epoch dataset from gzipped CSV files.

    Args:
        file_pattern: Glob pattern matching the CSV files of one data split.

    Returns:
        The dataset built by ``tf.data.experimental.make_csv_dataset`` with
        ``batch_size=1``, comma delimiter, a header row and GZIP compression.
    """
    return tf.data.experimental.make_csv_dataset(
        file_pattern,
        batch_size=1,
        field_delim=',',
        header=True,
        column_defaults=KDD12_PRETRAINING_COLUMN_DEFAULTS,
        num_epochs=1,
        shuffle=False,
        compression_type='GZIP'
    )


train_ds = _load_pretraining_split("data/kdd12/rl_data/pretraining/train/*")
val_ds = _load_pretraining_split("data/kdd12/rl_data/pretraining/test/*")

2024-08-16 08:43:38.397677: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-16 08:43:38.707276: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-16 08:43:38.709151: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [3]:
# Preprocess the datasets
## Define function to derive ctr and split this as the target
@tf.function
def kdd12_target(element):
    """Split one record into model features and a binary CTR label.

    The click-through rate is computed as ``click / impression``. The label is 1.0
    when that rate is greater than or equal to the threshold ``t`` and 0.0 otherwise.

    NOTE(review): ``t`` is read from the notebook's global scope but is not defined in
    any of the cells visible here. It must be assigned before this function is first
    mapped over a dataset, otherwise the lookup fails -- TODO confirm the intended
    threshold value and define it next to this function.

    Args:
        element: Mapping of column name -> tensor for one record. It must contain the
            ``click`` and ``impression`` columns.

    Returns:
        Tuple ``(features, ctr_label)``; ``features`` is a copy of ``element`` without
        the ``click`` and ``impression`` entries.
    """
    features = element.copy()
    # Keys are passed positionally: plain ``dict.pop`` does not accept keyword
    # arguments, so ``pop(key=...)`` breaks as soon as the element is a regular dict.
    click = features.pop('click')
    impression = features.pop("impression")
    ctr = tf.math.divide(click, impression)
    ctr_label = tf.where(tf.math.greater_equal(ctr, t), 1., 0.)
    return features, ctr_label

# Derive the CTR label for both splits
train_ds, val_ds = train_ds.map(kdd12_target), val_ds.map(kdd12_target)

## Categorical columns of the dataset; each name also identifies its vocabulary file
kdd12_categorical_columns = [
    'DisplayURL', 'AdID', 'AdvertiserID', 'QueryID',
    'KeywordID', 'TitleID', 'DescriptionID', 'UserID',
]

# Import the categorical feature vocabularies and build one StringLookup layer per column
kdd12_stringlookups = {}
kdd12_vocab_lengths = {}
for field in kdd12_categorical_columns:
    df = pd.read_csv(f'./data/kdd12/categorical_value_counts/{field}.csv')
    # The vocabulary terms are handed to the layer as byte strings
    vocab = list(map(str.encode, df['field'].astype(str)))
    lookup = StringLookup(vocabulary=vocab, mask_token=None)
    kdd12_stringlookups[field] = lookup
    # One more than the vocabulary size -- presumably to cover the layer's
    # out-of-vocabulary index; verify against the StringLookup defaults.
    kdd12_vocab_lengths[field] = len(vocab) + 1

# Define categorical encoding function
@tf.function
def kdd12_categorical_encoding(features,label):
    """Replace every categorical feature with the output of its StringLookup layer.

    Args:
        features: Mapping of lower-cased column name -> tensor.
        label: Target tensor; returned unchanged.

    Returns:
        Tuple ``(encoded, label)`` where ``encoded`` is a copy of ``features`` whose
        categorical entries have been passed through the matching lookup layer.
    """
    # Work on a copy, because modifying the inputs causes a ValueError
    encoded = features.copy()
    for column in kdd12_categorical_columns:
        # The lookup dict is keyed by the original column name, the features by its lower-case form
        key = column.lower()
        encoded[key] = kdd12_stringlookups[column](features[key])
    return encoded, label

# Apply the categorical encoding to both splits
train_ds, val_ds = (
    train_ds.map(kdd12_categorical_encoding),
    val_ds.map(kdd12_categorical_encoding),
)


# Numerical feature columns
kdd12_numerical_columns = ['Depth', 'Position']

# Build one Normalization layer per row of the precomputed mean/variance table
dist_stats = pd.read_csv('./data/kdd12/means_variances.csv')
kdd12_scalers = {}
for i in range(len(dist_stats)):
    field = dist_stats.at[i, 'field']
    mean = dist_stats.at[i, 'mean']
    variance = dist_stats.at[i, 'variance']
    scaler = Normalization(mean=mean, variance=variance)
    # Build eagerly with a one-element input shape so the layer is ready inside dataset.map
    scaler.build((1,))
    kdd12_scalers[field] = scaler

# Define scaler functions for all datasets

@tf.function
def kdd12_numerical_scaling(features,label):
    """Pass every numerical feature through its Normalization layer.

    Args:
        features: Mapping of lower-cased column name -> tensor.
        label: Target tensor; returned unchanged.

    Returns:
        Tuple ``(scaled, label)`` where ``scaled`` is a copy of ``features`` whose
        numerical entries have been replaced by the output of the matching scaler.
    """
    scaled = features.copy()
    for column in kdd12_numerical_columns:
        # Scalers are keyed by the original column name, the features by its lower-case form
        key = column.lower()
        scaled[key] = kdd12_scalers[column](features[key])
    return scaled, label

# Apply the numerical scaling to both splits
train_ds, val_ds = (
    train_ds.map(kdd12_numerical_scaling),
    val_ds.map(kdd12_numerical_scaling),
)

In [4]:
# Sizing of the pretraining input pipeline (named so the values are tunable in one place)
PRETRAIN_SHUFFLE_BUFFER = 100
PRETRAIN_BATCH_SIZE = 256
PRETRAIN_TRAIN_EXAMPLES = 157440
PRETRAIN_VAL_EXAMPLES = 39360

# Shuffle (training only), cap the number of examples, then batch. The final prefetch
# lets the input pipeline prepare upcoming batches while the current one is consumed;
# it does not change the elements produced.
train_ds = (
    train_ds
    .shuffle(PRETRAIN_SHUFFLE_BUFFER)
    .take(PRETRAIN_TRAIN_EXAMPLES)
    .batch(PRETRAIN_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    val_ds
    .take(PRETRAIN_VAL_EXAMPLES)
    .batch(PRETRAIN_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [5]:
## Define feature mappings: a 4-dimensional embedding for every categorical column,
## followed by a 1-dimensional dense feature for every numerical column
kdd12_fixlen_feature_columns = [
    *(
        SparseFeat(feat.lower(), vocabulary_size=kdd12_vocab_lengths[feat], embedding_dim=4)
        for feat in kdd12_categorical_columns
    ),
    *(DenseFeat(feat.lower(),1) for feat in kdd12_numerical_columns),
]

## The dnn and linear parts of the model share the same feature column list
kdd12_dnn_feature_columns = kdd12_linear_feature_columns = kdd12_fixlen_feature_columns

In [6]:
# Output locations of the training artefacts
csv_log_path = 'logs/final_rl_model.csv'
checkpoint_path = 'models/final_rl_model/rl_model.ckpt'

# Create the output directories up front so a fresh checkout can run this notebook.
# NOTE(review): CSVLogger does not appear to create a missing parent directory -- confirm.
for output_path in (csv_log_path, checkpoint_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Define the early stopping callback: stop after 5 epochs without val_loss improvement
# (not before epoch 5) and roll back to the best weights
earlystopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    start_from_epoch=5
)
# Define the precision, recall and auc metrics
precision = tf.keras.metrics.Precision(thresholds=0.5,name='precision')
recall = tf.keras.metrics.Recall(thresholds=0.5,name='recall')
auc = tf.keras.metrics.AUC(name='auc')

# Define the csvLogger callback (per-epoch metrics are written to csv_log_path)
csvLogger = CSVLogger(csv_log_path)

# Define the model checkpoint callback: keep only the weights of the best epoch.
# 'val_loss' is the callback's default monitor; it is spelled out to match EarlyStopping.
modelCheckpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True
)

In [7]:
# Construct the model with the hyperparameters chosen during hyperparameter tuning
dcn_hyperparameters = dict(
    task='binary',
    dnn_hidden_units=[400,400],
    dnn_dropout=0.6,
    l2_reg_dnn=0.005,
    l2_reg_linear=0.005,
    l2_reg_embedding=0.005,
    dnn_use_bn=True,
    cross_num=2,
)
model = DCN(
    kdd12_linear_feature_columns,
    kdd12_dnn_feature_columns,
    **dcn_hyperparameters
)

CrossNet parameterization: vector


In [8]:
# Compile the model: Adam optimizer, binary cross-entropy loss, and the tracked metrics
tracked_metrics = ['binary_crossentropy', 'binary_accuracy', precision, recall, auc]
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=tracked_metrics,
)

In [9]:
# Train the model. `train_ds` and `val_ds` are already batched, so no `batch_size` is
# passed here: Keras documents that it must not be specified for dataset inputs.
# The returned History is kept so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[
        csvLogger,
        modelCheckpoint,
        earlystopping
    ],
)

Epoch 1/15


2024-08-16 08:44:58.955429: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-08-16 08:45:01.556302: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f14693a4460 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-08-16 08:45:01.556348: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A10G, Compute Capability 8.6
2024-08-16 08:45:01.597108: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-16 08:45:01.682169: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2024-08-16 08:45:01.813374: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


    614/Unknown - 28s 30ms/step - loss: 0.5019 - binary_crossentropy: 0.2095 - binary_accuracy: 0.9427 - precision: 0.0499 - recall: 0.0112 - auc: 0.5567

2024-08-16 08:45:28.480425: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11202395199338273436


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7f1628484d90>