In [None]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import sys
sys.path.append('..')

from lib.WTTE import WTTE
from lib.ChurnEnsemble import ChurnEnsemble
from lib.utils import format_number, show_summary

# Single seed reused below as `random_state` for the train/test split and as `seed` for the models
SEED = 42
# Seed NumPy's global RNG so that code relying on it is repeatable on a fresh kernel run
np.random.seed(SEED)

In [None]:
# Model configuration: the input feature columns plus the keyword arguments
# that are unpacked into the WTTE constructor.
config = {
    'features': [
        'plan', 'interval',
        'country_es', 'country_mx', 'country_latam',
        'gateway_auto', 'failed',
        'usage', 'usage_groups', 'usage_payments',
        'momentum',
    ],
    'params': {
        'epochs': 10,    # Number of epochs
        'lr': 1e-4,      # Learning rate
        'batch': 256,    # Batch size
        'stop': 0,       # Early stopping patience
        'hl': 2,         # NOTE(review): presumably the number of hidden layers -- confirm in lib.WTTE
        'max_beta': 2.,  # NOTE(review): presumably an upper bound on the Weibull beta -- confirm in lib.WTTE
    },
}

max_sl = 24  # Maximum sequence length (0 = max length from data)
min_tte = 1  # Minimum time to event for binary classification (positive if `tte` <= `min_tte`)
test_size = 0.25  # Fraction of the customers held out for test/validation

In [None]:
# Load the dataset used to fit the model
data = pd.read_csv('../files/churn-data-fit.csv')

# Parse the date/time columns
for col in ['tp', 'ts', 'te']:
    data[col] = pd.to_datetime(data[col])

# A customer counts as censored when the `tte` of its last period (ordered by `tfs`) is negative
last_tte = data.sort_values(['id', 'tfs']).groupby('id')['tte'].last()

# Customer counts labelled [False, True] = [non-censored, censored].
# Reindexing guarantees both labels exist (a missing class becomes 0) and fixes their order,
# so the positional `.iloc` lookups below are unambiguous; integer keys such as `cs[1]`
# on a boolean-labelled Series rely on a deprecated positional fallback.
cs = (last_tte < 0).value_counts().reindex([False, True], fill_value=0).astype(float)
n_non_censored, n_censored = cs.iloc[0], cs.iloc[1]

print('Total Customers: {} | Censored: {} | Non-censored: {} | Censored Rate {}%'.format(
    format_number(cs.sum()),
    format_number(n_censored),
    format_number(n_non_censored),
    format_number(100 * n_censored / cs.sum(), 2)
))

data

In [None]:
# One row per customer: the `tte` of its last period decides whether the customer is censored
# NOTE(review): ordered by `tp` here while other cells order by `tfs` -- confirm both select the same last period
d_split = data.sort_values(['id', 'tp']).groupby('id')['tte'].last().reset_index()
d_split['censored'] = d_split['tte'] < 0

# Split at customer level, stratified on the censoring flag
d_train, d_test = train_test_split(
    d_split,
    test_size=test_size,
    shuffle=True,
    stratify=d_split['censored'].astype(int),
    random_state=SEED
)

# Counts labelled [False, True] = [non-censored, censored]; reindexing keeps both labels
# present (a missing class becomes 0) so the positional `.iloc[1]` below is always the censored count
cs_train = d_train['censored'].value_counts().reindex([False, True], fill_value=0).astype(float)
cs_test = d_test['censored'].value_counts().reindex([False, True], fill_value=0).astype(float)

# The overall censored rate is computed from `d_split` itself so this cell does not depend
# on a counter built in another cell
print('Total Customers: {} ({}% censored) | Train: {} ({}%) | Test: {} ({}%)'.format(
    format_number(len(d_split)),
    format_number(100 * d_split['censored'].mean(), 2),
    format_number(len(d_train)),
    format_number(100 * cs_train.iloc[1] / cs_train.sum(), 2),
    format_number(len(d_test)),
    format_number(100 * cs_test.iloc[1] / cs_test.sum(), 2)
))

In [None]:
# Build the WTTE Time To Event model from the notebook settings.
# The hyper-parameters in config['params'] are passed through as extra keyword arguments.
wtte_kwargs = dict(
    features=config['features'],
    max_sl=max_sl,
    min_tte=min_tte,
    seed=SEED,
    verbose=1,
    path='../files/wtte',
    **config['params']
)
wtte = WTTE(**wtte_kwargs)

# Display the model's `params` attribute
wtte.params

In [None]:
# Select train data: rows of the train customers, ordered by customer and `tfs`.
# The explicit `.copy()` makes this an independent frame, so the column assignment below
# is not a chained assignment on a selection of `data` (SettingWithCopy pattern).
d_wtte_train = data[data['id'].isin(d_train['id'])].sort_values(['id', 'tfs'])[
    ['id', 'tfs', 'tte'] + wtte.features
].copy()

# Scale/Normalize features: the scaler is fitted on the train rows only and kept on the model
wtte.scaler = StandardScaler().fit(d_wtte_train[wtte.features])
d_wtte_train[wtte.features] = wtte.scaler.transform(d_wtte_train[wtte.features])

# Build train tensor
x_wtte_train, y_wtte_train = wtte.build_seq(d_wtte_train, deep=False)
df_wtte_train = wtte.seq_to_df(x_wtte_train, y_wtte_train)

print(x_wtte_train.shape, y_wtte_train.shape)
df_wtte_train

In [None]:
# Select test data: rows of the test customers, ordered by customer and `tfs`.
# The explicit `.copy()` makes this an independent frame, so the column assignment below
# is not a chained assignment on a selection of `data` (SettingWithCopy pattern).
d_wtte_test = data[data['id'].isin(d_test['id'])].sort_values(['id', 'tfs'])[
    ['id', 'tfs', 'tte'] + wtte.features
].copy()

# Scale/Normalize features (using the scaler from the training data)
d_wtte_test[wtte.features] = wtte.scaler.transform(d_wtte_test[wtte.features])

# Build test tensor
x_wtte_test, y_wtte_test = wtte.build_seq(d_wtte_test, deep=False)
df_wtte_test = wtte.seq_to_df(x_wtte_test, y_wtte_test)

print(x_wtte_test.shape, y_wtte_test.shape)
df_wtte_test

In [None]:
# Fit WTTE model
# The train tensors come first; the test tensors are passed as the third and fourth arguments
# (presumably used as validation data during training -- confirm in lib.WTTE.fit)
wtte.fit(x_wtte_train, y_wtte_train, x_wtte_test, y_wtte_test)

In [None]:
# Save model
# NOTE(review): presumably written under the `path` given to the WTTE constructor -- confirm in lib.WTTE.save
wtte.save()

# Plot training history
wtte.plot_history_eval()

In [None]:
# Plot the output of the model's `weightwatcher` attribute
# NOTE(review): presumably a diagnostic of the trained weights -- confirm in lib.WTTE
wtte.weightwatcher.plot()

In [None]:
# Get sequence lengths
# Stored on the model as `wtte.sls`; a later cell reads its `id` and `length` columns
wtte.sls = wtte.get_seq_lengths(y_wtte_test)
# Predict
y_wtte_hat = wtte.predict(x_wtte_test)
# Set results
# NOTE(review): presumably populates `wtte.results` and may rely on `wtte.sls` set above -- keep this order
wtte.set_results(y_wtte_hat, y_wtte_test)

# Plot the distribution of the Weibull alpha and beta parameters for all customers in the given data.
# NOTE(review): `loc=-1` presumably selects the last period of each sequence -- confirm in lib.WTTE
wtte.plot_params_dist(wtte.results, loc=-1)

In [None]:
# Prediction results
# Display `wtte.results` (presumably populated by the earlier `wtte.set_results` call -- confirm)
wtte.results

In [None]:
# Select random customers from the results
n_samples = 6  # Number of customers
min_periods = 6  # Only customers with more than this many recorded periods are eligible

# Eligible customers: sequence length strictly greater than `min_periods`
# (the threshold now comes from the variable instead of a duplicated literal)
candidate_ids = wtte.sls.loc[wtte.sls['length'] > min_periods, 'id'].tolist()

# Seeded shuffle so the same customers are drawn on every run,
# independently of how much of the global RNG stream earlier cells consumed
ids = shuffle(candidate_ids, random_state=SEED)[:n_samples]

print(ids)

In [None]:
"""
Plot the distribution of the Weibull alpha and beta parameters
for a single customer over time, showing how the parameters change
from one period to the next as the customer info and usage change.

The alpha parameter represents the scale of the Weibull distribution,
which denotes the time it takes for the customer to churn,
while the beta parameter represents the shape of the Weibull distribution,
which is a measure of dispersion, meaning how sure we are about the result.
"""

# Size the grid from the number of selected customers instead of assuming exactly six:
# three panels per row, as many rows as needed (at least one), 4 inches of height per row.
n_cols = 3
n_rows = max(1, -(-len(ids) // n_cols))  # ceiling division

fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(16, 4 * n_rows),
    constrained_layout=True,
    squeeze=False  # always a 2-D array of axes, even for a single row
)
axs = axs.flatten()

# One panel per customer
for ax, uid in zip(axs, ids):
    wtte.plot_single_params(wtte.results, id=uid, ax=ax)

# Hide the panels left over when len(ids) is not a multiple of n_cols
for ax in axs[len(ids):]:
    ax.set_visible(False)

plt.suptitle('Single Weibull Alpha and Beta Evolution', y=1.03)

plt.show()

In [None]:
"""
Plot both the probability and cumulative functions for a single customer at a specific period,
showing how the survival of the customer is modeled by the Weibull distribution.

Basically, the probability function shows the probability of the customer
to churn at a given time (the peak of the distribution is the most probable churn period),
while the cumulative function shows the probability of the customer to churn before any given time.
"""

# Size the grid from the number of selected customers instead of assuming exactly six:
# three panels per row, as many rows as needed (at least one), 4 inches of height per row.
n_cols = 3
n_rows = max(1, -(-len(ids) // n_cols))  # ceiling division

fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(16, 4 * n_rows),
    constrained_layout=True,
    squeeze=False  # always a 2-D array of axes, even for a single row
)
axs = axs.flatten()

# One panel per customer; `loc=-1` is forwarded unchanged to the plotting helper
for ax, uid in zip(axs, ids):
    wtte.plot_weibull(wtte.results, id=uid, loc=-1, ax=ax)

# Hide the panels left over when len(ids) is not a multiple of n_cols
for ax in axs[len(ids):]:
    ax.set_visible(False)

plt.suptitle('Single WTTE Weibull PDF/CDF Distribution', y=1.03)

plt.show()

In [None]:
# Collapse the per-period predictions to one row per customer (the latest by `tfs`)
ordered_results = wtte.results.sort_values(['id', 'tfs'])
# NOTE(review): GroupBy.last() takes the last non-null value of each column, which only
# equals the last row when the results contain no NaN -- confirm that holds here
last_by_customer = ordered_results.groupby('id').last()
results = last_by_customer.drop(columns=['tfs']).reset_index()

show_summary(results, 'true')
results

In [None]:
# Create the ensemble wrapper, used here to compute and plot the scores
ensemble_kwargs = dict(
    min_tte=min_tte,
    seed=SEED,
    verbose=1,
    path='../files'
)
model = ChurnEnsemble(**ensemble_kwargs)

# Attach the WTTE model built earlier in the notebook
model.wtte = wtte

model

In [None]:
# Compute and set scores
# Uses the full per-period `wtte.results` frame, not the one-row-per-customer `results` built above
model.set_scores(wtte.results)

# Plot scores summary
model.plot_scores()

In [None]:
# Plot histogram of predicted probabilities for each customer sequence
# NOTE(review): `loc=-1` presumably selects the last period of each sequence -- confirm in lib.ChurnEnsemble
model.plot_histogram(wtte.results, loc=-1)