In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import keras
from keras import ops as K
import keras_tuner as kt
import numpy as np
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

import sys
sys.path.append('..')

from lib.WTTE import WTTE
from lib.utils import format_number

# Single seed reused by the data split and the tuning code further down.
SEED = 42
np.random.seed(SEED)
# NumPy alone is not enough for reproducible training: Keras draws seeds for
# weight initialisers and dropout from Python's `random` module and the backend
# RNG, so seed those as well.
keras.utils.set_random_seed(SEED)

2024-06-28 16:21:23.261287: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Model configuration: feature column names and parameter values
config = {
    'features': [
        'plan', 'interval',
        'country_es', 'country_mx', 'country_latam',
        'gateway_auto', 'failed',
        'usage', 'usage_groups', 'usage_payments',
        'momentum',
    ],
    'params': {
        'max_beta': 2.,  # Maximum beta value
    },
}

# Minimum time to event for binary classification (positive if `tte` <= `min_tte`)
min_tte = 1
# Maximum sequence length (0 = max length from data)
max_sl = 24
# Level to save the weights and bias (epoch or batch)
wlevel = 'batch'
# Percentage of the data to use for test/validation
test_size = 0.25

In [3]:
# Load the churn dataset
data = pd.read_csv('../files/churn-data-fit.csv')

# Parse the timestamp columns into datetimes
for col in ['tp', 'ts', 'te']:
    data[col] = pd.to_datetime(data[col])

# A customer counts as censored when the `tte` of its last row (ordered by
# `tfs`) is negative. The counts are reindexed on both boolean labels so each
# class is present even when it has no customers, and they are read by label
# (`True`/`False`) instead of by integer key, which on a boolean-labelled
# Series only works through positional fallback.
cs = (
    (data.sort_values(['id', 'tfs']).groupby('id')['tte'].last() < 0)
    .value_counts()
    .reindex([False, True], fill_value=0)
    .astype(float)
)
print('Total Customers: {} | Censored: {} | Non-censored: {} | Censored Rate {}%'.format(
    format_number(cs.sum()),
    format_number(cs[True]),
    format_number(cs[False]),
    format_number(100 * cs[True] / cs.sum(), 2)
))

data

Total Customers: 11.069 | Censored: 2.678 | Non-censored: 8.391 | Censored Rate 24,19%


Unnamed: 0,cid,id,tp,tfs,tte,ts,te,employees,mrr,value,...,usage_avg,usage_groups_avg,usage_payments_avg,paid_periods,failed_periods,active_periods,momentum,months,failed_ratio,usage_diff
0,51b46d18c4de615d0f000019,1,2017-07-01,4.0,-1.0,2017-03-01,NaT,1,26.58,159.50,...,2.00,0.0,0.0,0.0,0.0,1.0,0.00,4.0,,0.00
1,51b46d18c4de615d0f000019,1,2017-08-01,5.0,-1.0,2017-03-01,NaT,1,26.58,159.50,...,2.00,0.0,0.0,0.0,0.0,2.0,1.20,5.0,,0.00
2,51b46d18c4de615d0f000019,1,2017-09-01,6.0,-1.0,2017-03-01,NaT,1,26.58,239.25,...,2.00,0.0,0.0,1.0,0.0,3.0,-3.14,6.0,0.0,0.00
3,51b46d18c4de615d0f000019,1,2017-10-01,7.0,-1.0,2017-03-01,NaT,1,26.58,239.25,...,1.75,0.0,0.0,1.0,0.0,3.0,-13.60,7.0,0.0,-0.75
4,51b46d18c4de615d0f000019,1,2017-11-01,8.0,-1.0,2017-03-01,NaT,1,26.58,239.25,...,1.60,0.0,0.0,1.0,0.0,3.0,-18.57,8.0,0.0,-0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201230,66633bcee489710bb6afe3d9,11065,2024-06-01,0.0,-1.0,2024-06-01,NaT,1,101.97,101.97,...,0.00,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.00
201231,666346aee489710bb6afead3,11066,2024-06-01,0.0,-1.0,2024-06-01,NaT,1,40.00,40.00,...,0.00,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.00
201232,66634aea1e0bd65a73bf4f2f,11067,2024-06-01,0.0,-1.0,2024-06-01,NaT,1,46.62,46.62,...,0.00,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.00
201233,666388b3e489710bb6b00925,11068,2024-06-01,0.0,-1.0,2024-06-01,NaT,1,60.19,60.19,...,0.00,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.00


In [4]:
# One row per customer: the `tte` of its last row (ordered by `tp`) decides
# whether the customer is censored.
d_split = data.sort_values(['id', 'tp']).groupby('id')['tte'].last().reset_index()
d_split['censored'] = d_split['tte'] < 0

# Split at customer level, stratified on the censoring flag so both partitions
# keep the same censored rate.
d_train, d_test = train_test_split(
    d_split,
    test_size=test_size,
    shuffle=True,
    stratify=d_split['censored'].astype(int),
    random_state=SEED
)

# Censored / non-censored counts per partition. Both boolean labels are forced
# to exist and are read by label (`True`) rather than by integer key, which on
# a boolean-labelled Series only works through positional fallback.
cs_train = d_train['censored'].value_counts().reindex([False, True], fill_value=0).astype(float)
cs_test = d_test['censored'].value_counts().reindex([False, True], fill_value=0).astype(float)

print('Total Customers: {} ({}% censored) | Train: {} ({}%) | Test: {} ({}%)'.format(
    format_number(len(d_split)),
    format_number(100 * cs[True] / cs.sum(), 2),
    format_number(len(d_train)),
    format_number(100 * cs_train[True] / cs_train.sum(), 2),
    format_number(len(d_test)),
    format_number(100 * cs_test[True] / cs_test.sum(), 2)
))

Total Customers: 11.069 (24,19% censored) | Train: 8.301 (24,19%) | Test: 2.768 (24,21%)


In [5]:
# Instantiate the WTTE Time To Event model from the notebook-level settings.
# NOTE(review): `config['params']` is not passed to the constructor here;
# confirm whether WTTE is expected to receive it.
wtte_kwargs = {
    'features': config['features'],
    'min_tte': min_tte,
    'max_sl': max_sl,
    'wlevel': wlevel,
    'seed': SEED,
    'verbose': 1,
    'path': '../files/tune',
}
wtte = WTTE(**wtte_kwargs)

# Show the parameters the model ended up with
wtte.params

{'nn': 0,
 'hl': 2,
 'lr': 0.01,
 'epochs': 100,
 'batch': 64,
 'lr_decay': 0,
 'stop': 0,
 'dropout': 0.1,
 'lnorm': True,
 'weight_l1': 0,
 'weight_l2': 1e-05,
 'init_alpha': None,
 'max_beta': 2.0,
 'epsilon': 1e-08}

In [6]:
# Set train data: all rows of the customers in the train split, ordered by
# `id` and `tfs`. The explicit `.copy()` detaches the selection from `data`,
# so the column assignment below writes to an independent frame instead of a
# chained selection.
d_wtte_train = data[data['id'].isin(d_train['id'])].sort_values(['id', 'tfs'])[
    ['id', 'tfs', 'tte'] + wtte.features
].copy()

# Scale/Normalize features (the scaler is fitted on the training rows only and
# stored on the model so the test data can reuse it)
wtte.scaler = StandardScaler().fit(d_wtte_train[wtte.features])
d_wtte_train[wtte.features] = wtte.scaler.transform(d_wtte_train[wtte.features])

# Build train tensor
x_wtte_train, y_wtte_train = wtte.build_seq(d_wtte_train, deep=False)
df_wtte_train = wtte.seq_to_df(x_wtte_train, y_wtte_train)

print(x_wtte_train.shape, y_wtte_train.shape)
df_wtte_train

(8301, 25, 14) (8301, 25, 5)


Unnamed: 0,id,seq,tfs,wa,wb,plan,interval,country_es,country_mx,country_latam,gateway_auto,failed,usage,usage_groups,usage_payments,momentum
0,1.0,63.0,0.0,24.0,0.0,-0.574489,0.171768,1.198058,-0.542737,-0.57828,0.183716,2.009340,-1.3876,-0.206605,-0.093025,-0.814738
1,1.0,63.0,1.0,23.0,0.0,-0.574489,0.171768,1.198058,-0.542737,-0.57828,0.183716,2.009340,-1.3876,-0.206605,-0.093025,-0.654770
2,1.0,63.0,2.0,22.0,0.0,-0.574489,0.171768,1.198058,-0.542737,-0.57828,0.183716,-0.497676,-1.3876,-0.206605,-0.093025,-0.441886
3,1.0,63.0,3.0,21.0,0.0,-0.574489,0.171768,1.198058,-0.542737,-0.57828,0.183716,-0.497676,-1.3876,-0.206605,-0.093025,0.714988
4,1.0,63.0,4.0,20.0,0.0,-0.574489,0.171768,1.198058,-0.542737,-0.57828,0.183716,-0.497676,-1.3876,-0.206605,-0.093025,-0.173044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207520,11069.0,0.0,20.0,,,,,,,,,,,,,
207521,11069.0,0.0,21.0,,,,,,,,,,,,,
207522,11069.0,0.0,22.0,,,,,,,,,,,,,
207523,11069.0,0.0,23.0,,,,,,,,,,,,,


In [7]:
# Set test data: all rows of the customers in the test split, ordered by `id`
# and `tfs`. The explicit `.copy()` detaches the selection from `data`, so the
# column assignment below writes to an independent frame instead of a chained
# selection.
d_wtte_test = data[data['id'].isin(d_test['id'])].sort_values(['id', 'tfs'])[
    ['id', 'tfs', 'tte'] + wtte.features
].copy()

# Scale/Normalize features (using the scaler from the training data)
d_wtte_test[wtte.features] = wtte.scaler.transform(d_wtte_test[wtte.features])

# Build test tensor
x_wtte_test, y_wtte_test = wtte.build_seq(d_wtte_test, deep=False)
df_wtte_test = wtte.seq_to_df(x_wtte_test, y_wtte_test)

print(x_wtte_test.shape, y_wtte_test.shape)
df_wtte_test

(2768, 25, 14) (2768, 25, 5)


Unnamed: 0,id,seq,tfs,wa,wb,plan,interval,country_es,country_mx,country_latam,gateway_auto,failed,usage,usage_groups,usage_payments,momentum
0,9.0,4.0,0.0,4.0,1.0,-0.574489,-0.400906,1.198058,-0.542737,-0.57828,0.183716,-0.497676,-0.142468,-0.206605,-0.093025,-0.097014
1,9.0,4.0,1.0,3.0,1.0,-0.574489,-0.400906,1.198058,-0.542737,-0.57828,0.183716,2.009340,-0.142468,-0.206605,-0.093025,-0.073900
2,9.0,4.0,2.0,2.0,1.0,-0.574489,-0.400906,1.198058,-0.542737,-0.57828,0.183716,-0.497676,-0.142468,-0.206605,-0.093025,-0.077550
3,9.0,4.0,3.0,1.0,1.0,-0.574489,-0.400906,1.198058,-0.542737,-0.57828,0.183716,2.009340,-0.142468,-0.206605,-0.093025,-0.139590
4,9.0,4.0,4.0,0.0,1.0,-0.574489,-0.400906,1.198058,-0.542737,-0.57828,0.183716,2.009340,-0.765034,-0.206605,-0.093025,-0.507576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69195,11066.0,0.0,20.0,,,,,,,,,,,,,
69196,11066.0,0.0,21.0,,,,,,,,,,,,,
69197,11066.0,0.0,22.0,,,,,,,,,,,,,
69198,11066.0,0.0,23.0,,,,,,,,,,,,,


In [8]:
# Initialise the model from the full training tensors (must run before the
# tensors are converted below).
wtte.init_model(x_wtte_train, y_wtte_train)

# Convert both splits to the model's input format.
# NOTE: the arrays are overwritten under the same names, so this cell is not
# idempotent; re-run the cells that build the tensors before re-running it.
x_wtte_train, y_wtte_train, _ = wtte.input_seq(x_wtte_train, y_wtte_train)
x_wtte_test, y_wtte_test, _ = wtte.input_seq(x_wtte_test, y_wtte_test)

# Display the resulting shapes: train x/y, then test x/y
tuple(
    tensor.shape
    for tensor in (x_wtte_train, y_wtte_train, x_wtte_test, y_wtte_test)
)

discrete -> Max Length: 24 | Mask: -6.44
Alpha Mean: 8.75 | Beta Mean: 0.63
Init Alpha: 9.24 | Max Beta: 2.00


((8301, 25, 11), (8301, 25, 2), (2768, 25, 11), (2768, 25, 2))

In [9]:
class HM_WTTE(kt.HyperModel):
    """Keras Tuner hypermodel for the WTTE network.

    Builds a `Sequential` model made of a masking layer, a stack of LSTM layers
    (each optionally followed by layer or batch normalisation) and a two-unit
    dense head whose outputs are mapped to (alpha, beta) by `activation`. The
    model is compiled with Adam and the custom `loss` defined on this class.

    Every setting taken from the WTTE wrapper (`max_sl`, `mask`, `features`,
    `init_alpha`, `max_beta`) is read from the instance handed to the
    constructor, never from a notebook-level global.
    """

    # Value installed as the global Keras epsilon before building and fitting.
    EPSILON = 1e-8

    # Class-level defaults; both are replaced per instance.
    params = {}
    wtte = None

    def __init__(self, wtte, **kwargs):
        """Store the WTTE wrapper the model settings are read from.

        Args:
            wtte: Object exposing `max_sl`, `mask`, `features`, `init_alpha`
                and `max_beta`.
            **kwargs: Forwarded to `kt.HyperModel.__init__`.
        """
        super().__init__(**kwargs)
        self.wtte = wtte
        # Per-instance dict so instances never share the class-level one.
        self.params = {}

    def build_model(self) -> keras.Model:
        """Create and compile the network described by `self.params`.

        Returns:
            The compiled `keras.Model`.
        """
        # `nn == 0` means "derive the LSTM width from the maximum sequence
        # length" (falling back to 1 when that length is not positive).
        if self.params['nn'] == 0:
            self.params['nn'] = self.wtte.max_sl if self.wtte.max_sl > 0 else 1

        # Kernel regulariser for the dense head: L1, L2, both or none.
        regularizer = None
        if self.params['weight_l1'] > 0 and self.params['weight_l2'] > 0:
            regularizer = keras.regularizers.l1_l2(l1=self.params['weight_l1'], l2=self.params['weight_l2'])
        elif self.params['weight_l1'] > 0:
            regularizer = keras.regularizers.l1(self.params['weight_l1'])
        elif self.params['weight_l2'] > 0:
            regularizer = keras.regularizers.l2(self.params['weight_l2'])

        model = keras.models.Sequential()

        # Variable-length sequences of `len(features)` values per step. An
        # explicit Input replaces the `input_shape=` layer argument.
        model.add(
            keras.Input(
                shape=(None, len(self.wtte.features))
            )
        )
        # Steps whose features all equal `mask` are skipped by later layers.
        model.add(
            keras.layers.Masking(
                mask_value=self.wtte.mask
            )
        )

        for _ in range(self.params['hl']):
            model.add(
                keras.layers.LSTM(
                    units=self.params['nn'],
                    activation='tanh',
                    dropout=self.params['dropout'],
                    return_sequences=True
                )
            )

            # Optional normalisation after each recurrent layer; any other
            # value of `norm` adds nothing.
            if self.params['norm'] == 'layer':
                model.add(
                    keras.layers.LayerNormalization(
                        epsilon=keras.config.epsilon()
                    )
                )
            elif self.params['norm'] == 'batch':
                model.add(
                    keras.layers.BatchNormalization(
                        momentum=.95,
                        epsilon=keras.config.epsilon()
                    )
                )

        # Two raw outputs per step, turned into (alpha, beta) by `activation`.
        model.add(
            keras.layers.Dense(
                units=2,
                kernel_regularizer=regularizer
            )
        )
        model.add(
            keras.layers.Activation(
                self.activation
            )
        )

        model.compile(
            optimizer=keras.optimizers.Adam(
                learning_rate=self.params['lr'],
                clipnorm=1.
            ),
            loss=self.loss
        )

        return model

    def activation(self, ab):
        """Map the two raw head outputs to (alpha, beta).

        alpha = init_alpha * exp(a), so a raw output of 0 yields `init_alpha`.
        beta = max_beta * sigmoid(b - log(max_beta - 1)), with the sigmoid
        clipped to [epsilon, 1 - epsilon]; the shift makes a raw output of 0
        yield beta = 1.

        NOTE(review): `log(max_beta - 1)` is only defined for `max_beta > 1`.

        Args:
            ab: Tensor whose last axis holds the two raw outputs.

        Returns:
            Tensor with (alpha, beta) stacked on the last axis.
        """
        epsilon = keras.config.epsilon()

        a = ab[..., 0]
        b = ab[..., 1]

        a = self.params['init_alpha'] * K.exp(a)
        b = self.params['max_beta'] * K.clip(K.sigmoid(b - K.log(self.params['max_beta'] - 1.)), epsilon, 1. - epsilon)

        x = K.stack([a, b], axis=-1)

        return x

    def loss(self, y_true, y_pred):
        """Negative mean log-likelihood of the targets under (alpha, beta).

        With t = y_true[..., 0] and u = y_true[..., 1], computes
        u * log(exp(H(t + 1) - H(t)) - 1) - H(t + 1), where
        H(x) = (x / alpha) ** beta, clips it to
        [log(epsilon), log(1 - epsilon)] and returns minus its mean.

        NOTE(review): this has the form of a discrete Weibull log-likelihood
        with `t` as the time to event and `u` as the uncensored indicator;
        confirm against the target layout produced by `WTTE.input_seq`.

        Args:
            y_true: Tensor with two target values on the last axis.
            y_pred: Tensor with (alpha, beta) on the last axis.

        Returns:
            Scalar loss tensor.
        """
        epsilon = keras.config.epsilon()

        a_true = y_true[..., 0]
        b_true = y_true[..., 1]
        a_pred = y_pred[..., 0]
        b_pred = y_pred[..., 1]

        # epsilon keeps the base of the power strictly positive when t == 0.
        chf0 = K.power((a_true + epsilon) / a_pred, b_pred)
        chf1 = K.power((a_true + 1.) / a_pred, b_pred)

        loglik = b_true * K.log(K.exp(chf1 - chf0) - 1.) - chf1
        loglik = K.clip(loglik, K.log(epsilon), K.log(1. - epsilon))

        return -K.mean(loglik)

    def fit(self, hp, model, *args, **kwargs):
        """Train `model` for one trial with the configured batch size.

        Args:
            hp: Trial hyperparameters (not used here).
            model: The model returned by `build`.
            *args: Positional arguments forwarded to `model.fit`.
            **kwargs: Keyword arguments forwarded to `model.fit`.

        Returns:
            Whatever `model.fit` returns.
        """
        keras.backend.clear_session()
        keras.config.set_epsilon(self.EPSILON)
        tf.random.set_seed(SEED)

        return model.fit(
            *args,
            batch_size=self.params['batch'],
            **kwargs
        )

    def build(self, hp):
        """Set the trial parameters and return the compiled model.

        NOTE(review): `hp` is not consulted, so every trial builds the same
        fixed configuration; register the hyperparameters to search here.

        Args:
            hp: Keras Tuner hyperparameters container.

        Returns:
            The compiled `keras.Model`.
        """
        # Install the epsilon before any layer is created, so normalisation
        # layers of the first trial use the same value as later trials
        # (previously it was only set in `fit`, after the model was built).
        keras.config.set_epsilon(self.EPSILON)

        self.params = {
            'nn': 0,
            'hl': 2,
            'lr': 1e-4,
            'batch': 128,
            'dropout': 0,
            'norm': 'layer',
            'weight_l1': 0,
            'weight_l2': 0,
            'init_alpha': self.wtte.init_alpha,
            'max_beta': self.wtte.max_beta
        }

        return self.build_model()


# Start every search from an empty tuning directory. A missing directory
# (first run on a fresh checkout) is expected and must not abort the cell.
try:
    shutil.rmtree('../files/tune')
except FileNotFoundError:
    pass

callbacks = [
    # Stop a trial as soon as the loss becomes NaN
    keras.callbacks.TerminateOnNaN(),
    # Write TensorBoard logs (with per-epoch histograms) under the tuning directory
    keras.callbacks.TensorBoard(
        log_dir='../files/tune/logs',
        histogram_freq=1,
        write_images=True
    )
]

# Grid search over the hypermodel's search space, ranked by validation loss
tuner = kt.GridSearch(
    HM_WTTE(wtte=wtte),
    objective='val_loss',
    executions_per_trial=1,
    overwrite=True,
    directory='../files/tune',
    project_name='wtte'
)

# NOTE(review): the test split is used as the validation set, so the `val_loss`
# of the selected trial is an optimistic estimate of held-out performance.
tuner.search(
    x_wtte_train, y_wtte_train,
    epochs=60,
    shuffle=True,
    validation_data=(x_wtte_test, y_wtte_test),
    callbacks=callbacks
)

tuner.results_summary()

Trial 5 Complete [00h 02m 25s]
val_loss: 2.0061111450195312

Best val_loss So Far: 1.998154640197754
Total elapsed time: 00h 11m 47s
Results summary
Results in ../files/tune/wtte
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 0001 summary
Hyperparameters:
reg: 1
Score: 1.998154640197754

Trial 0004 summary
Hyperparameters:
reg: 4
Score: 2.0061111450195312

Trial 0000 summary
Hyperparameters:
reg: 0
Score: 2.0072109699249268

Trial 0003 summary
Hyperparameters:
reg: 3
Score: 2.009260416030884

Trial 0002 summary
Hyperparameters:
reg: 2
Score: 2.0102427005767822
