In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize 
import matplotlib.pyplot as plt
from scipy import stats 
import numdifftools as ndt

In [2]:
# Excel workbook that holds the raw adoption records.
file = "iPhoneData_adoption.xlsx"

# Read columns A through E of the 'iPhoneData' sheet, skipping the first
# three rows of the sheet.
data = pd.read_excel(
    file,
    sheet_name='iPhoneData',
    usecols='A:E',
    skiprows=3,
)


In [3]:
data.head(13)

Unnamed: 0,subscriber_id,time,adopted,gender,income
0,100000819,0,0,U,H
1,100000819,1,0,U,H
2,100000819,2,0,U,H
3,100000819,3,0,U,H
4,100000819,4,0,U,H
5,100000819,5,0,U,H
6,100000819,6,0,U,H
7,100000819,7,0,U,H
8,100000819,8,0,U,H
9,100000819,9,0,U,H


Since a subscriber has multiple entries, I want to make sure that I group the data by subscriber. This ensures that all the columns are associated with one subscriber and I can calculate any statistics by subscriber. Here n_user will have one entry per user, obtained by taking the maximum of each column (you can try any other combination).

In [4]:
# Collapse the records to one row per subscriber by taking the maximum of
# each column within the subscriber's group.
# The columns are selected with a *list* (double brackets): selecting several
# columns with a bare tuple of keys is deprecated and fails on current pandas.
n_user = data.groupby('subscriber_id')[['time', 'adopted', 'gender', 'income']].max()

  n_user=data.groupby('subscriber_id')['time','adopted','gender','income'].max()


In [5]:
n_user.head()

Unnamed: 0_level_0,time,adopted,gender,income
subscriber_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100000819,11,0,U,H
100001061,11,0,F,VH
100002358,11,0,M,VH
100002569,11,0,U,H
100003998,11,0,F,VH


We want time to cover the whole year as periods 1 to 12 rather than 0 to 11, so we increase time by 1 unit.

In [6]:
# Shift the period index up by one.
# Note: this updates n_user in place, so re-running the cell shifts it again.
n_user['time'] = n_user['time'] + 1

Explore the data

In [7]:
n_user.adopted.describe()

count    10000.000000
mean         0.075700
std          0.264531
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: adopted, dtype: float64

The mean adoption rate is small, so we do not expect a lot of adoption and should see a lot of survival.

In [8]:
n_user.groupby('gender')['time'].count()

gender
F    1774
M    2154
U    6072
Name: time, dtype: int64

There are more unknowns than either M or F entries. When using one-hot encoding, assign these randomly or by something similar like position in the dataframe.

In [10]:
n_user.groupby('income')['time'].count()

income
A     2430
H     3806
L       14
VH    3674
Name: time, dtype: int64

There are very few values of L. To break the income into High and Not High, put everything with value A in one group and everything else in another group. Note that this places the 14 subscribers with value L in the same group as H and VH.

convert the gender and income into dummy variables


In [11]:
# Fix the RNG seed so the random assignment below is the same on every run.
np.random.seed(20241109)

is_female = n_user.gender == 'F'
is_unknown = n_user.gender == 'U'

# Every row starts at 1; rows with gender 'F' are then set to 0.
n_user['genderM'] = 1
n_user.loc[is_female, 'genderM'] = 0

# Rows with gender 'U' receive a uniform draw rounded to 0.0 or 1.0.
num_U = is_unknown.sum()
n_user.loc[is_unknown, 'genderM'] = np.round(np.random.rand(num_U))

In [12]:
# Income dummy: 0 when income is "A", 1 for every other income value.
n_user['IncomeHL'] = (n_user.income != "A").astype(int)

See below what the data frame will look like

In [13]:
n_user.head(10)

Unnamed: 0_level_0,time,adopted,gender,income,genderM,IncomeHL
subscriber_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100000819,12,0,U,H,0.0,1
100001061,12,0,F,VH,0.0,1
100002358,12,0,M,VH,1.0,1
100002569,12,0,U,H,1.0,1
100003998,12,0,F,VH,0.0,1
100006004,12,0,F,VH,0.0,1
100006157,12,0,F,A,0.0,0
100006255,12,0,M,VH,1.0,1
100006424,12,0,F,A,0.0,0
100009205,12,0,U,VH,0.0,1


To introduce the covariates, we shift the hazard appropriately. Since we take the exponential, it is easy to calculate $F(t)$, which is simply $1-e^{-\lambda t e^{\beta X}}$.

Again we are doing f(t) = F(t)-F(t-1) to get the probability. 
As before, we have to account for censored data. Some users may never adopt. For those who adopt, we use F(t)-F(t-1); for those who have not adopted by t = 12, we use their survival probability (1-F(t)) in the likelihood function.

In [14]:
def exp_cov(param):
    """Negative log-likelihood of the exponential model with covariates.

    Reads the module-level ``n_user`` frame (columns ``time``, ``adopted``,
    ``genderM`` and ``IncomeHL``).

    Parameters
    ----------
    param : sequence of three numbers
        ``param[0]`` is the baseline rate, ``param[1]`` the coefficient on
        ``genderM`` and ``param[2]`` the coefficient on ``IncomeHL``.

    Returns
    -------
    float
        Minus the log-likelihood summed over all rows of ``n_user``. Rows with
        ``adopted == 1`` contribute ``log(F(t) - F(t-1))``; rows with
        ``adopted == 0`` contribute ``log(1 - F(t))``.
    """
    base_rate, beta_gender, beta_income = param[0], param[1], param[2]

    # Each row's rate is the baseline rate scaled by exp(X * beta).
    covariate_mult = np.exp(beta_gender * n_user.genderM + beta_income * n_user.IncomeHL)
    rate = covariate_mult * base_rate

    # Cumulative distribution at the observed period and at the one before it.
    periods = n_user.time
    cdf_now = 1 - np.exp(-rate * periods)
    cdf_prev = 1 - np.exp(-rate * (periods - 1))

    adopters = n_user.adopted == 1
    censored = n_user.adopted == 0

    # Adopters: probability mass falling in their adoption period.
    loglik_adopters = np.sum(np.log((cdf_now - cdf_prev)[adopters]))
    # Non-adopters: probability of surviving through their last period.
    loglik_censored = np.sum(np.log((1 - cdf_now)[censored]))

    return -(loglik_adopters + loglik_censored)

In [15]:

# Fit the exponential model by minimising the negative log-likelihood.
# Each bound must be an ordered (min, max) pair. Sets such as {0, 10000} are
# unordered, so their iteration order (and hence which value is treated as the
# minimum) is not guaranteed; tuples make the limits explicit.
# Parameter order: baseline rate, gender coefficient, income coefficient.
mle_out = minimize(
    exp_cov,
    np.array([0.2, 0.2, 0.1]),
    method='Nelder-Mead',
    bounds=[(0, 10000), (-10000, 5), (-10000, 5)],
)

mle_out

  result = getattr(ufunc, method)(*inputs, **kwargs)


 final_simplex: (array([[0.00414867, 0.09138913, 0.52588916],
       [0.00414885, 0.09132332, 0.52585802],
       [0.00414841, 0.09136813, 0.52592792],
       [0.00414832, 0.09135077, 0.52598565]]), array([4538.43019403, 4538.43019425, 4538.43019431, 4538.43019442]))
           fun: 4538.430194034126
       message: 'Optimization terminated successfully.'
          nfev: 336
           nit: 185
        status: 0
       success: True
             x: array([0.00414867, 0.09138913, 0.52588916])

Calculate the BIC to compare this model with others.

In [16]:
# BIC = k * ln(n) - 2 * log-likelihood. mle_out.fun is the minimised
# *negative* log-likelihood, so subtracting 2 * log-likelihood is the same
# as adding 2 * mle_out.fun.
num_params = 3
pop_size = len(n_user)
BIC = num_params * np.log(pop_size) + 2 * mle_out.fun
BIC

9104.49140918418

As expected, the baseline hazard $\lambda$ is small. The actual hazard is now affected by the covariates. We can calculate it for each observation.

In [17]:
# Unpack the fitted parameters in the order exp_cov reads them.
lamda, b_gender, b_income = mle_out.x

# Per-subscriber multiplier exp(X * beta) and the resulting hazard.
Cov = np.exp(b_gender * n_user.genderM + b_income * n_user.IncomeHL)
hazard = lamda * Cov
hazard

subscriber_id
100000819    0.007019
100001061    0.007019
100002358    0.007691
100002569    0.007691
100003998    0.007019
               ...   
812301972    0.004149
812310995    0.007691
812310996    0.007019
812321677    0.007691
812336212    0.007691
Length: 10000, dtype: float64

The probability of surviving the whole 12 periods is $1 - F(T)$, or $e^{-\lambda e^{Xb} T}$, where $T = 12$.

In [18]:
# Survival probability through period T under the fitted exponential model.
T = 12
prob_survive_12 = np.exp(-(lamda * Cov) * T)
prob_survive_12

subscriber_id
100000819    0.919217
100001061    0.919217
100002358    0.911837
100002569    0.911837
100003998    0.919217
               ...   
812301972    0.951435
812310995    0.911837
812310996    0.919217
812321677    0.911837
812336212    0.911837
Length: 10000, dtype: float64

Running a quick simulation shows that the number of people predicted to survive is close to the number who actually do survive.

In [19]:
# One uniform draw per subscriber; a subscriber counts as surviving when the
# draw does not exceed that subscriber's survival probability.
uniform_draws = np.random.rand(n_user.shape[0])
n_pred_survive = (uniform_draws <= prob_survive_12).sum()
print(f' The number predicted to survive to T = 12 is: {n_pred_survive}')

 The number predicted to survive to T = 12 is: 9286


The number who do survive is the entry in the last row of the following dataframe.

In [20]:
# Count, for each value of time, the subscribers whose adopted flag is 0.
# The column is named num_not_adopted because it counts non-adopters; the
# previous name (num_adopted) said the opposite of what was being counted.
n_user.groupby('time').agg(
    num_not_adopted=('adopted', lambda flags: (flags == 0).sum())
).reset_index()

Unnamed: 0,time,num_adopted
0,1,0
1,2,0
2,3,0
3,4,6
4,5,10
5,6,27
6,7,7
7,8,0
8,9,0
9,10,0


We follow the same logic for Weibull

In [21]:
def weib_cov(param):
    """Negative log-likelihood of the Weibull model with covariates.

    Reads the module-level ``n_user`` frame (columns ``time``, ``adopted``,
    ``genderM`` and ``IncomeHL``).

    Parameters
    ----------
    param : sequence of four numbers
        ``param[0]`` is the exponent ``c`` applied to time, ``param[1]`` the
        baseline ``lambda``, ``param[2]`` the coefficient on ``genderM`` and
        ``param[3]`` the coefficient on ``IncomeHL``.

    Returns
    -------
    float
        Minus the log-likelihood summed over all rows of ``n_user``, with
        ``F(t) = 1 - exp(-lambda * exp(X * beta) * t**c)``. Rows with
        ``adopted == 1`` contribute ``log(F(t) - F(t-1))``; rows with
        ``adopted == 0`` contribute ``log(1 - F(t))``.
    """
    shape, base_scale = param[0], param[1]
    beta_gender, beta_income = param[2], param[3]

    # Each row's scale is the baseline lambda multiplied by exp(X * beta).
    covariate_mult = np.exp(beta_gender * n_user.genderM + beta_income * n_user.IncomeHL)
    scale = covariate_mult * base_scale

    # Cumulative distribution at the observed period and at the one before it.
    periods = n_user.time
    cdf_now = 1 - np.exp(-scale * periods ** shape)
    cdf_prev = 1 - np.exp(-scale * (periods - 1) ** shape)

    adopters = n_user.adopted == 1
    censored = n_user.adopted == 0

    # Adopters: probability mass falling in their adoption period.
    loglik_adopters = np.sum(np.log((cdf_now - cdf_prev)[adopters]))
    # Non-adopters: probability of surviving through their last period.
    loglik_censored = np.sum(np.log((1 - cdf_now)[censored]))

    return -(loglik_adopters + loglik_censored)

In [23]:

# Fit the Weibull model by minimising the negative log-likelihood.
# Each bound must be an ordered (min, max) pair. A set such as {0, None} is
# unordered, so it can be read as (None, 0) and silently turn the intended
# lower bound into an upper bound; tuples make the limits explicit.
# Parameter order: c, lambda, gender coefficient, income coefficient.
mle_out = minimize(
    weib_cov,
    np.array([0.1, 0.1, 0.1, 0.1]),
    method='SLSQP',
    bounds=[(0, None), (0, None), (0, 1), (0, 1)],
)

mle_out

  result = getattr(ufunc, method)(*inputs, **kwargs)


     fun: 4477.19497244464
     jac: array([-0.03942871, -0.71331787, -0.0065918 , -0.00543213])
 message: 'Optimization terminated successfully'
    nfev: 157
     nit: 27
    njev: 27
  status: 0
 success: True
       x: array([0.65911867, 0.00962915, 0.09068985, 0.52227227])

Calculate the BIC to compare this model with others.

In [24]:
# BIC = k * ln(n) - 2 * log-likelihood. mle_out.fun is the minimised
# *negative* log-likelihood, so subtracting 2 * log-likelihood is the same
# as adding 2 * mle_out.fun.
num_params = 4
pop_size = len(n_user)
BIC = num_params * np.log(pop_size) + 2 * mle_out.fun
BIC

8991.231306377185

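As expected, the baseline parameter $\lambda$ is small. The covariates shift it to $\lambda e^{Xb}$, which we can calculate for each observation. Note that in the Weibull model this quantity multiplies $t^c$, so it is a covariate-adjusted scale rather than a hazard that is constant over time.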

In [25]:
# Unpack the fitted parameters in the order weib_cov reads them.
c, lamda, b_gender, b_income = mle_out.x

# Per-subscriber multiplier exp(X * beta) and the covariate-adjusted lambda.
Cov = np.exp(b_gender * n_user.genderM + b_income * n_user.IncomeHL)
hazard = lamda * Cov
hazard

subscriber_id
100000819    0.016233
100001061    0.016233
100002358    0.017774
100002569    0.017774
100003998    0.016233
               ...   
812301972    0.009629
812310995    0.017774
812310996    0.016233
812321677    0.017774
812336212    0.017774
Length: 10000, dtype: float64

The probability of surviving the whole 12 periods is $1 - F(T)$, or $e^{-\lambda e^{Xb} T^c}$, where $T = 12$.

In [26]:
# Survival probability through period T under the fitted Weibull model.
T = 12
prob_survive_12 = np.exp(-(lamda * Cov) * (T ** c))
prob_survive_12

subscriber_id
100000819    0.919886
100001061    0.919886
100002358    0.912623
100002569    0.912623
100003998    0.919886
               ...   
812301972    0.951674
812310995    0.912623
812310996    0.919886
812321677    0.912623
812336212    0.912623
Length: 10000, dtype: float64

Running a quick simulation shows that the number of people predicted to survive is close to the number who actually do survive.

In [27]:
# One uniform draw per subscriber; a subscriber counts as surviving when the
# draw does not exceed that subscriber's survival probability.
uniform_draws = np.random.rand(n_user.shape[0])
n_pred_survive = (uniform_draws <= prob_survive_12).sum()
print(f' The number predicted to survive to T = 12 is: {n_pred_survive}')

 The number predicted to survive to T = 12 is: 9185


The number who do survive is the entry in the last row of the following dataframe.

In [28]:
# Count, for each value of time, the subscribers whose adopted flag is 0.
# The column is named num_not_adopted because it counts non-adopters; the
# previous name (num_adopted) said the opposite of what was being counted.
n_user.groupby('time').agg(
    num_not_adopted=('adopted', lambda flags: (flags == 0).sum())
).reset_index()

Unnamed: 0,time,num_adopted
0,1,0
1,2,0
2,3,0
3,4,6
4,5,10
5,6,27
6,7,7
7,8,0
8,9,0
9,10,0
