In [None]:
import os

from datetime import date
import numpy as np
import pandas as pd

import math

from fake_data_for_learning import BayesianNodeRV, FakeDataBayesianNetwork, SampleValue

In [None]:
#To parametrize probability tables
def sigmoid(a):
    '''Logistic function 1 / (1 + exp(-a)), mapping a real number into [0, 1].

    Evaluated in a numerically stable way: for negative ``a`` the naive
    formula computes exp(-a), which overflows (OverflowError) once -a
    exceeds roughly 709. Rewriting it as exp(a) / (1 + exp(a)) keeps the
    exponent non-positive, so very negative inputs underflow to 0.0 instead.
    '''
    if a >= 0:
        # exp(-a) <= 1 here, so no overflow is possible
        return 1 / (1 + math.exp(-a))
    # a < 0: exp(a) < 1, algebraically identical to the branch above
    exp_a = math.exp(a)
    return exp_a / (1 + exp_a)
def inv_sigmoid(p):
    '''Log-odds (logit) of p: log(p) - log(1 - p).

    math.log raises ValueError when its argument is not positive, so p must
    lie strictly between 0 and 1.
    '''
    log_p = math.log(p)
    log_complement = math.log(1 - p)
    return log_p - log_complement

def gender_param(a=0.):
    '''Build the 'gender' node with P(male) = sigmoid(a), for -inf < a < inf.

    The probability table is [P(female), P(male)] = [1 - sigmoid(a), sigmoid(a)],
    so the default a = 0 gives both values probability 0.5.
    '''
    p_male = sigmoid(a)
    probabilities = np.array([1 - p_male, p_male])
    return BayesianNodeRV('gender', probabilities, values=('female', 'male'))

In [None]:
# Presumably the directory for data files, read from the DATA_DIR environment
# variable. Subscripting os.environ raises KeyError when DATA_DIR is not set.
# NOTE(review): datadir is not referenced by any other cell visible here
# (the CSV below is written to the working directory) — confirm intent.
datadir = os.environ['DATA_DIR']

## Define Bayesian network

In [None]:


# Root node: three age values with marginal probabilities 0.2 / 0.5 / 0.3
age_values = ('20', '40', '60')
age = BayesianNodeRV('age', np.array([0.2, 0.5, 0.3]), values=age_values)

# Profession conditioned on age. Rows presumably follow the order of the
# age values above (as the commented tables for the other nodes do);
# columns follow profession_values.
profession_values = ('unemployed', 'student', 'self-employed', 'salaried')
profession_cpt = np.array([
    [0.3, 0.4, 0.2, 0.1],
    [0.05, 0.15, 0.3, 0.5],
    [0.15, 0.05, 0.2, 0.6],
])
profession = BayesianNodeRV(
    'profession',
    profession_cpt,
    values=profession_values,
    parent_names=['age'],
)

# Patience conditioned on (gender, age): one sub-table per gender, one row
# per age value, two outcome probabilities per row. No explicit values are
# passed for this node.
patience_given_female = [
    [0.6, 0.4],  # female, 20
    [0.7, 0.3],  # female, 40
    [0.3, 0.7],  # female, 60
]
patience_given_male = [
    [0.3, 0.7],  # male, 20
    [0.4, 0.6],  # male, 40
    [0.2, 0.8],  # male, 60
]
patience = BayesianNodeRV(
    'patience',
    np.array([patience_given_female, patience_given_male]),
    parent_names=['gender', 'age'],
)

# Thriftiness conditioned on (age, profession): one sub-table per age value,
# one row per profession, two outcome probabilities per row.
thriftiness_at_20 = [
    [0.3, 0.7],  # 20, unemployed
    [0.2, 0.8],  # 20, student
    [0.1, 0.9],  # 20, self-employed
    [0.6, 0.4],  # 20, salaried
]
thriftiness_at_40 = [
    [0.4, 0.6],  # 40, unemployed
    [0.7, 0.3],  # 40, student
    [0.3, 0.7],  # 40, self-employed
    [0.2, 0.8],  # 40, salaried
]
thriftiness_at_60 = [
    [0.1, 0.9],  # 60, unemployed
    [0.2, 0.8],  # 60, student
    [0.3, 0.7],  # 60, self-employed
    [0.25, 0.75],  # 60, salaried
]
thriftiness = BayesianNodeRV(
    'thriftiness',
    np.array([thriftiness_at_20, thriftiness_at_40, thriftiness_at_60]),
    parent_names=['age', 'profession'],
)

# Churn conditioned on (patience, thriftiness).
# The table is laid out like the patience and thriftiness tables in this
# notebook: the first-listed parent (patience) is the outer axis, the second
# (thriftiness) the inner axis, and the last axis holds the two churn outcome
# probabilities. Previously the rows were arranged with thriftiness as the
# outer axis while parent_names listed patience first, so the
# (impatient, thrifty) and (patient, not thrifty) distributions were swapped
# relative to their labels.
churn = BayesianNodeRV(
    'churn',
    np.array([
        [
            [0.5, 0.5], # impatient, not thrifty
            [0.05, 0.95], # impatient, thrifty
        ],
        [
            [0.9, 0.1], # patient, not thrifty
            [0.4, 0.6] # patient, thrifty
        ]
    ]),
    parent_names = ['patience', 'thriftiness']
)

## Simulate

In [None]:
start_year = 2008
end_year = 2012  # exclusive: range(start_year, end_year) is simulated
# One sample size per simulated year, each drawn from 100..149
samples_per_year = np.random.choice(
    range(100, 150), size=end_year-start_year
)

# P(male) used for the first simulated year
p0 = 0.4
gender = BayesianNodeRV(
    'gender',
    np.array([1-p0, p0]),
    values=('female', 'male')
)

# Shift in P(male) applied to every year after the first
del_p = 0.2
samples = []
for n_samples, year in zip(
    samples_per_year, range(start_year, end_year)
):
    X = FakeDataBayesianNetwork(gender, age, profession, patience, thriftiness, churn)
    res = X.rvs(n_samples)
    res['year'] = year
    samples.append(res)
    # Parametrize for next year. gender_param takes a logit (-inf < a < inf),
    # not a probability, so the target probability p0 + del_p is converted
    # with inv_sigmoid; passing it directly would give
    # P(male) = sigmoid(p0 + del_p) rather than p0 + del_p.
    gender = gender_param(inv_sigmoid(p0 + del_p))

# Stack the per-year samples into one frame with a fresh 0..n-1 index
sample_df = pd.concat(samples, axis=0).reset_index(drop=True)
sample_df.to_csv('churn.csv', index=False)
# Row counts per (year, gender), shown as the cell output
sample_df.groupby(['year', 'gender']).size()