In [None]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import sys
sys.path.append('..')

from lib.XGB import XGB
from lib.ChurnEnsemble import ChurnEnsemble
from lib.utils import format_number, show_summary

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is also
# passed to train_test_split, XGB and ChurnEnsemble in the cells below.
SEED = 42
np.random.seed(SEED)

In [None]:
# Model configuration.
#   features: input columns selected from the data for the XGB model.
#   params:   keyword arguments forwarded verbatim to the XGB constructor.
config = {
    'features': [
        'employees',
        'interval',
        'country_es',
        'country_mx',
        'country_latam',
        'gateway_auto',
        'plan',
        'usage',
        'usage_groups',
        'usage_payments',
        'usage_avg',
        'paid_periods',
        'months',
        'failed_ratio',
    ],
    'params': {
        'n': 100,
        'lr': 0.01,
        'max_depth': 16,
        'stop': 30,
        'reg_unb': True,
    },
}

# Minimum time to event for binary classification (positive if `tte` <= `min_tte`)
min_tte = 1
# Fraction of customers held out as the test/validation split
test_size = 0.25

In [None]:
# Load the fitting dataset and parse its timestamp columns.
data = pd.read_csv('../files/churn-data-fit.csv')

for col in ['tp', 'ts', 'te']:
    data[col] = pd.to_datetime(data[col])

# A customer is counted as censored when the `tte` of their last row (ordered by `tfs`)
# is negative. Reindexing on both labels guarantees that False and True are always
# present (0 when a class is missing), so the counts can be read by label. Reading them
# by integer (`cs[0]` / `cs[1]`) relies on positional fallback, which is deprecated in
# recent pandas and returns the wrong class when only one class exists.
cs = (
    (data.sort_values(['id', 'tfs']).groupby('id')['tte'].last() < 0)
    .value_counts()
    .reindex([False, True], fill_value=0)
    .astype(float)
)
print('Total Customers: {} | Censored: {} | Non-censored: {} | Censored Rate {}%'.format(
    format_number(cs.sum()),
    format_number(cs.loc[True]),
    format_number(cs.loc[False]),
    format_number(100 * cs.loc[True] / cs.sum(), 2)
))

data

In [None]:
# One row per customer: the `tte` of their last row, flagged as censored when negative.
# NOTE(review): this orders by `tp`, whereas the data-loading cell orders by `tfs`;
# confirm that both orderings select the same last row per customer.
d_split = data.sort_values(['id', 'tp']).groupby('id')['tte'].last().reset_index()
d_split['censored'] = d_split['tte'] < 0

# Split at customer level, stratified on the censored flag so train and test keep
# the same censored/non-censored proportions.
d_train, d_test = train_test_split(
    d_split,
    test_size=test_size,
    shuffle=True,
    stratify=d_split['censored'].astype(int),
    random_state=SEED
)

# Censored / non-censored counts, always indexed by both labels [False, True] so they
# can be read by label (integer access on a boolean index relies on deprecated
# positional fallback and breaks when a class is missing).
censor_labels = [False, True]
cs_split = d_split['censored'].value_counts().reindex(censor_labels, fill_value=0).astype(float)
cs_train = d_train['censored'].value_counts().reindex(censor_labels, fill_value=0).astype(float)
cs_test = d_test['censored'].value_counts().reindex(censor_labels, fill_value=0).astype(float)

# The overall rate is computed from `d_split` itself, so it always describes the
# frame that was actually split and this cell does not depend on another cell's counts.
print('Total Customers: {} ({}% censored) | Train: {} ({}%) | Test: {} ({}%)'.format(
    format_number(len(d_split)),
    format_number(100 * cs_split.loc[True] / cs_split.sum(), 2),
    format_number(len(d_train)),
    format_number(100 * cs_train.loc[True] / cs_train.sum(), 2),
    format_number(len(d_test)),
    format_number(100 * cs_test.loc[True] / cs_test.sum(), 2)
))

In [None]:
# Build the XGBoost wrapper: hyper-parameters come from `config['params']`, the
# remaining arguments are fixed for this notebook.
xgb = XGB(
    **config['params'],
    path='../files/xgb',
    verbose=1,
    seed=SEED,
    min_tte=min_tte,
    features=config['features']
)

# Display the parameters the model ended up with
xgb.params

In [None]:
# Rows of the customers assigned to the train split, ordered by customer and `tfs`,
# restricted to the key columns plus the model features
in_train = data['id'].isin(d_train['id'])
train_columns = ['id', 'tfs', 'tte'] + xgb.features
d_xgb_train = data[in_train].sort_values(['id', 'tfs'])[train_columns]

# Build the train sequences and their DataFrame view
x_xgb_train, y_xgb_train = xgb.build_seq(d_xgb_train)
df_xgb_train = xgb.seq_to_df(x_xgb_train, y_xgb_train)

show_summary(df_xgb_train, xgb.tgt_col)
df_xgb_train

In [None]:
# Select test data: rows of the customers assigned to the test split, ordered by
# customer and `tfs`, restricted to the key columns plus the model features
d_xgb_test = data[data['id'].isin(d_test['id'])].sort_values(['id', 'tfs'])[
    ['id', 'tfs', 'tte'] + xgb.features
]

# Build test tensor
x_xgb_test, y_xgb_test = xgb.build_seq(d_xgb_test)
df_xgb_test = xgb.seq_to_df(x_xgb_test, y_xgb_test)

# Summarise the TEST frame (this previously summarised the train frame by mistake)
show_summary(df_xgb_test, xgb.tgt_col)
df_xgb_test

In [None]:
# Fit XGBoost model on the train sequences.
# NOTE(review): the test sequences are also passed to `fit`; if XGB uses them as its
# evaluation / early-stopping set (TODO confirm), the test scores computed later in
# this notebook are measured on data that influenced training.
xgb.fit(x_xgb_train, y_xgb_train, x_xgb_test, y_xgb_test)

In [None]:
# Save model
# (presumably under the `path` given to the XGB constructor — TODO confirm)
xgb.save()

# Plot training history
xgb.plot_history_eval()

In [None]:
# Predict on the test sequences
y_xgb_hat = xgb.predict(x_xgb_test)
# Set results: hand predictions and true targets to the model, which exposes them
# as `xgb.results` (read in the following cells)
xgb.set_results(y_xgb_hat, y_xgb_test)

# Prediction results
xgb.results

In [None]:
# Collapse the per-sequence results to one row per customer, taking each customer's
# last entry after ordering by `tfs`
by_customer = xgb.results.sort_values(['id', 'tfs']).groupby('id')
last_per_customer = by_customer.last()
results = last_per_customer.drop(columns=['tfs']).reset_index()

show_summary(results, 'true')
results

In [None]:
# Create a ChurnEnsemble; it is used below to compute and plot the scores
ensemble_kwargs = {
    'min_tte': min_tte,
    'seed': SEED,
    'verbose': 1,
    'path': '../files',
}
model = ChurnEnsemble(**ensemble_kwargs)
# Attach the XGBoost model built above
model.xgb = xgb

model

In [None]:
# Compute and set scores from the per-customer predictions (`pred`) and their
# true labels (`true`)
model.set_scores(results['pred'], results['true'])

# Plot scores summary
model.plot_scores()

In [None]:
# Plot the histogram of the predicted probabilities for each customer sequence.
# Uses the full per-sequence results, not the per-customer `results` frame.
# NOTE(review): `loc=-1` presumably selects the last sequence position — TODO confirm.
model.plot_histogram(xgb.results, loc=-1)