In [None]:
import os
import pickle
from pathlib import Path
from datetime import date

import numpy as np
import pandas as pd


from fake_data_for_learning import BayesianNodeRV, FakeDataBayesianNetwork, SampleValue
from fake_data_for_learning.utils import generate_random_cpt, make_cpt

In [None]:
# Output directory for the generated CSV files and the pickled network,
# taken from the DATA_DIR environment variable.
try:
    datadir = Path(os.environ['DATA_DIR'])
except KeyError as err:
    # Keep the exception type, but tell the reader how to fix the problem
    # instead of surfacing a bare KeyError('DATA_DIR').
    raise KeyError(
        "Environment variable DATA_DIR is not set; "
        "point it at the directory where the generated data should be written"
    ) from err

## Bayesian network

In [None]:
# Root nodes: each is described by a marginal distribution over its values.
gender = BayesianNodeRV(
    'gender',
    np.array([0.55, 0.45]),
    values=('female', 'male'),
)
age = BayesianNodeRV(
    'age',
    np.array([0.2, 0.5, 0.3]),
    values=('20-34', '35-49', '50-64'),
)

# Profession conditioned on age: three rows (one per age value, presumably in
# the order the age values are declared) and four columns (one per profession
# value). Every row sums to 1.
profession_given_age = np.array([
    [0.1, 0.2, 0.4, 0.3],
    [0.5, 0.3, 0.15, 0.05],
    [0.6, 0.2, 0.05, 0.15],
])
profession = BayesianNodeRV(
    'profession',
    profession_given_age,
    values=('salaried', 'self-employed', 'student', 'unemployed'),
    parent_names=['age'],
)

In [None]:
# Raw (unnormalised, partly negative) scores for churn given gender, age and
# profession. They are passed to make_cpt below, which presumably turns them
# into a conditional probability table -- TODO confirm against
# fake_data_for_learning.utils.
# The array has shape (2, 3, 4, 2); the node built from it uses
# parent_names=['gender', 'age', 'profession'], so the axes are presumably
# (gender, age, profession, churn outcome).
# NOTE(review): the row labels below list professions as unemployed, student,
# self-employed, salaried, which is the reverse of the order in which the
# `profession` node declares its values ('salaried', ..., 'unemployed').
# Confirm which ordering is intended before relying on the labels.
churn_pre_cpt = np.array([
    [
        [[-1.42045768, 0.57954232], # female, 20-34, unemployed
         [-0.40814291, 0.59185709],  # female, 20-34, student
         [-0.56219023, 0.43780977],  # female, 20-34, self-employed
         [0.48849254, -1.1150746]], # female, 20-34, salaried

        [[-0.46861885, 2.53138115], # female, 35-49, unemployed
         [-0.43218211, 0.56781789], # ... (same profession order as above)
         [0.60132273, -0.39867727],
         [0.39588113, -0.60411887]],

        [[-0.48908418, 2.51091582], # female, 50-64, unemployed
         [0.46685323, -0.53314677],
         [1.50957938, -0.49042062],
         [0.40211519, -0.59788481]]],


    [
        [[-0.54623047, 0.45376953],# male, 20-34, unemployed
         [-0.52350668, 0.47649332],
         [-0.44970756, 0.55029244],
         [-0.51186244, 0.48813756]],

        [[-0.46305065, 0.53694935], # male, 35-49, unemployed
         [-0.33640207, 0.66359793],
         [-0.51803343, 0.48196657],
         [0.55415768, -0.44584232]],

        [[-1.55106954, 0.44893046], # male, 50-64, unemployed
         [0.50675317, -0.49324683],
         [0.59636579, -0.40363421],
         [0.52955825, -1.247044175]]]])
# Table handed to the `churn` node.
churn_cpt = make_cpt(churn_pre_cpt)

In [None]:
# Churn node driven directly by the three observed customer attributes.
churn = BayesianNodeRV('churn', churn_cpt, parent_names=['gender', 'age', 'profession'])

# Network over the observed attributes and the churn outcome.
X = FakeDataBayesianNetwork(
    gender,
    age,
    profession,
    churn,
)

In [None]:
# Seed NumPy's global generator so that Restart & Run All regenerates the same
# dataset. NOTE(review): this makes samples_per_year deterministic; it also
# makes X.rvs deterministic only if the library draws from NumPy's global
# random state -- confirm against fake_data_for_learning.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Simulated years: start_year is included, end_year is excluded.
start_year = 2008
end_year = 2018
years = range(start_year, end_year)

# Number of customers sampled for each year, drawn from 100..149.
samples_per_year = np.random.choice(range(100, 150), size=len(years))

# Draw one batch per year and tag every row with its year.
samples = []
for n_samples, year in zip(samples_per_year, years):
    res = X.rvs(n_samples)
    res['year'] = year
    samples.append(res)

# Stack the yearly batches and persist them next to the other generated data.
sample_df = pd.concat(samples, axis=0).reset_index(drop=True)
sample_df.to_csv(datadir.joinpath('churn_simple.csv'), index=False)

# Look at gender breakdown over years
sample_df.groupby(['year', 'gender']).size()

## Bayesian network with hidden nodes

In [None]:
# Age, profession and gender are reused from the first network.
# patience and thriftiness are the hidden nodes of the second network.

# Patience conditioned on (gender, age), following the parent_names order.
# The last axis holds the two patience outcomes; each innermost pair sums to 1.
patience_given_gender_age = np.array([
    # female: 20-34, 35-49, 50-64
    [[0.6, 0.4], [0.7, 0.3], [0.3, 0.7]],
    # male: 20-34, 35-49, 50-64
    [[0.3, 0.7], [0.4, 0.6], [0.2, 0.8]],
])
patience = BayesianNodeRV(
    'patience',
    patience_given_gender_age,
    parent_names=['gender', 'age'],
)

# Thriftiness conditioned on (age, profession): one block per age value, one
# row per profession value, last axis holds the two thriftiness outcomes.
# NOTE(review): this table was originally annotated with the profession rows in
# the order unemployed, student, self-employed, salaried, which is the reverse
# of the order in which the `profession` node declares its values -- confirm
# which ordering is intended.
thriftiness_given_age_profession = np.array([
    # age 20-34
    [[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.6, 0.4]],
    # age 35-49
    [[0.4, 0.6], [0.7, 0.3], [0.3, 0.7], [0.2, 0.8]],
    # age 50-64
    [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.25, 0.75]],
])
thriftiness = BayesianNodeRV(
    'thriftiness',
    thriftiness_given_age_profession,
    parent_names=['age', 'profession'],
)

## Action

Depending on whether the organization records agent actions, the action taken to manage churn (e.g. a phone call, an email, or something else) may or may not be hidden.

In [None]:
# Marginal distribution of the churn-management action: the second outcome has
# probability 0.2, i.e. 20% of customers are contacted.
action = BayesianNodeRV(
    'action',
    np.array([0.8, 0.2]),
)

In [None]:
# Churn conditioned on (action, patience, thriftiness), following the
# parent_names order; the last axis holds the two churn outcomes and every
# innermost pair sums to 1.
churn_given_action_patience_thrift = np.array([
    # no action taken
    [
        # impatient: not thrifty, thrifty
        [[0.5, 0.5], [0.05, 0.95]],
        # patient: not thrifty, thrifty
        [[0.95, 0.05], [0.7, 0.3]],
    ],
    # action taken
    [
        # impatient: not thrifty, thrifty
        [[0.2, 0.8], [0.01, 0.99]],
        # patient: not thrifty, thrifty
        [[0.95, 0.05], [0.9, 0.1]],
    ],
])
churn = BayesianNodeRV(
    'churn',
    churn_given_action_patience_thrift,
    parent_names=['action', 'patience', 'thriftiness'],
)

## Simulate

In [None]:
# Network that adds the action node and the hidden patience / thriftiness
# nodes between the customer attributes and churn.
X = FakeDataBayesianNetwork(gender, age, profession, action, patience, thriftiness, churn)

# Seed NumPy's global generator so that Restart & Run All regenerates the same
# dataset. NOTE(review): this makes samples_per_year deterministic; it also
# makes X.rvs deterministic only if the library draws from NumPy's global
# random state -- confirm against fake_data_for_learning.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Simulated years: start_year is included, end_year is excluded.
start_year = 2008
end_year = 2018
years = range(start_year, end_year)

# Number of customers sampled for each year, drawn from 100..149.
samples_per_year = np.random.choice(range(100, 150), size=len(years))

# Draw one batch per year and tag every row with its year.
samples = []
for n_samples, year in zip(samples_per_year, years):
    res = X.rvs(n_samples)
    res['year'] = year
    samples.append(res)

# Stack the yearly batches and persist them next to the other generated data.
sample_df = pd.concat(samples, axis=0).reset_index(drop=True)
sample_df.to_csv(datadir.joinpath('churn.csv'), index=False)

# Look at gender breakdown over years
sample_df.groupby(['year', 'gender']).size()

In [None]:
# Persist the Bayesian network alongside the generated data.
# Pickle files should only ever be loaded back from a trusted location.
with (datadir / 'churn_bn.pickle').open('wb') as f:
    pickle.dump(X, f)