In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import random
pio.templates.default = 'simple_white'
import datetime
import tqdm
from dateutil.relativedelta import relativedelta

In [2]:
! ls

full_data.csv                       retention_curves.ipynb
generate_daily_synthetic_data.ipynb weekly_data.csv
raw_growth_model.ipynb


### Modelling retention

In [4]:
def get_retention(a, b, c, d, periods):
    return  a + 1./(b + c*periods ** d)

def get_retention_same_event(a, c, d, periods):
    b = 1./(1 - a)
    return get_retention(a, b, c, d, periods)

In [5]:
import plotly
colormap = plotly.colors.qualitative.Pastel

In [35]:
sample_df = pd.DataFrame()
sample_df['periods'] = range(24)

product_coefs = [
    (0, 0.55, 2),
    (0.02, 0.3, 1.6),
    (0.05, 1, 0.9)
]

for i in range(len(product_coefs)):
    sample_df['product' + str(i + 1)] = sample_df.periods.map(
        lambda x: get_retention_same_event(product_coefs[i][0], product_coefs[i][1],
                                           product_coefs[i][2], x)
    )
    
sample_df = sample_df.set_index('periods')
px.line(sample_df.applymap(lambda x: None if x < 0.01 else 100*x).loc[0:],
       title = 'Monthly retention',
       labels = {'value': 'retention, %',
                'periods': '# month',
                'variable': 'product'})

In [36]:
start_month = datetime.date(2023, 9, 1)
num_periods = 36
end_month = start_month + relativedelta(months = num_periods)

start_month, end_month

(datetime.date(2023, 9, 1), datetime.date(2026, 9, 1))

In [37]:
new_users_df = pd.DataFrame()
new_users_df['date'] = pd.date_range(start_month.strftime('%Y-%m-%d'), 
                                     end_month.strftime('%Y-%m-%d'), 
                                     freq = 'M')
new_users_df['date'] = new_users_df.date.map(
    lambda x: x - datetime.timedelta(days = x.day - 1)
)
new_users_df['new_users'] = 1000

In [38]:
new_users_df.head().to_dict('records')

[{'date': Timestamp('2023-09-01 00:00:00'), 'new_users': 1000},
 {'date': Timestamp('2023-10-01 00:00:00'), 'new_users': 1000},
 {'date': Timestamp('2023-11-01 00:00:00'), 'new_users': 1000},
 {'date': Timestamp('2023-12-01 00:00:00'), 'new_users': 1000},
 {'date': Timestamp('2024-01-01 00:00:00'), 'new_users': 1000}]

In [39]:
items = []

for cohort_item in new_users_df.to_dict('records'):
    cohort_date = cohort_item['date'].date()
    cohort_size = cohort_item['new_users']
    
    for p in range(len(product_coefs)):
        product = 'product' + str(p+1)
    
        for m in range(num_periods + 1):
            items.append(
                {
                    'cohort_date': cohort_date,
                    'cohort_size': cohort_size,
                    'date': cohort_date + relativedelta(months = m),
                    'retention_rate': get_retention_same_event(product_coefs[p][0], product_coefs[p][1],
                                                           product_coefs[p][2], m),
                    'product': product
                }
            )

In [40]:
raw_ret_df = pd.DataFrame(items)
raw_ret_df['users'] = raw_ret_df.cohort_size * raw_ret_df.retention_rate

In [41]:
raw_ret_df = raw_ret_df[raw_ret_df.date < end_month]

In [44]:
px.line(
    raw_ret_df.pivot_table(index = 'date',
                      columns = 'product',
                      values = 'users', aggfunc = 'sum'),
    title = 'Active users',
       labels = {'value': 'active users',
                'periods': 'month',
                'variable': 'product'})

In [389]:
px.area(
    raw_ret_df[raw_ret_df['product'] == 'product1'].pivot_table(index = 'date',
                                                           columns = 'cohort_date', 
                                                           values = 'users'),
    labels = {'cohort_date': 'cohort', 'value': 'active users'},
    title = 'Active users by cohorts for product1',
    color_discrete_sequence = px.colors.qualitative.Prism
)

In [390]:
px.area(
    raw_ret_df[raw_ret_df['product'] == 'product3'].pivot_table(index = 'date',
                                                           columns = 'cohort_date', 
                                                           values = 'users'),
    labels = {'cohort_date': 'cohort', 'value': 'active users'},
    title = 'Active users by cohorts for product3',
    color_discrete_sequence = px.colors.qualitative.Prism
)

### Revenue Retention

In [188]:
rev_ret_df = pd.DataFrame()
rev_ret_df['num_month'] = range(13)

cohort_size = 1000

In [189]:
rev_ret_df['random_num1'] = list(map(
    lambda x:  (random.random() - 0.5)*2*0.1,
    rev_ret_df.num_month
))

rev_ret_df['random_num2'] = list(map(
    lambda x:  (random.random() - 0.5)*2*0.1,
    rev_ret_df.num_month
))

In [190]:
rev_ret_df['retention1'] = rev_ret_df.num_month.map(
        lambda x: get_retention_same_event(product_coefs[2][0], product_coefs[2][1],
                                           product_coefs[2][2], x)
    )

rev_ret_df['retention2'] = rev_ret_df['retention1']
rev_ret_df.retention2.loc[0] = rev_ret_df.loc[0].retention2 *1.42



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [191]:
rev_ret_df['cohort1_revenue'] = (1 + rev_ret_df['random_num1']) * 600*cohort_size*rev_ret_df.retention1
rev_ret_df['cohort2_revenue'] = (1 + rev_ret_df['random_num2']) * 600*cohort_size*rev_ret_df.retention2

In [192]:
rev_ret_df.set_index('num_month', inplace = True)

In [193]:
px.line(rev_ret_df[['cohort1_revenue', 'cohort2_revenue']].rename(
    columns = {'cohort1_revenue': 'cohort1', 'cohort2_revenue': 'cohort2'}),
    title = 'Revenue from cohorts',
    labels = {
        'value': 'revenue in USD',
        'num_month': '# month',
        'variable': 'cohort'
    }
)

In [194]:
px.line(rev_ret_df.apply(lambda x: x/rev_ret_df.iloc[0], axis = 1)[['cohort1_revenue', 'cohort2_revenue']].rename(
    columns = {'cohort1_revenue': 'cohort1', 'cohort2_revenue': 'cohort2'}),
    title = 'Revenue Retention normed on 1st month',
    labels = {
        'value': 'retention, %',
        'num_month': '# month',
        'variable': 'cohort'
    }
)

In [197]:
px.line(rev_ret_df.apply(lambda x: x/cohort_size)[['cohort1_revenue', 'cohort2_revenue']].rename(
    columns = {'cohort1_revenue': 'cohort1', 'cohort2_revenue': 'cohort2'}),
    title = 'Revenue Retention normed on cohort size',
    labels = {
        'value': 'revenue in USD',
        'num_month': '# month',
        'variable': 'cohort'
    }
)

### Predicted retention

In [253]:
import numpy as np
import scipy.optimize

In [254]:
fact_df = pd.DataFrame([
    {
        'num_month': 0,
        'fact': 1
    },
    {
        'num_month': 1,
        'fact': 0.5432
    },
    {
        'num_month': 2,
        'fact': 0.3874
    },
    {
        'num_month': 3,
        'fact': 0.3450
    },
    {
        'num_month': 4,
        'fact': 0.3208
    },
    {
        'num_month': 5,
        'fact': 0.3085
    },
    {
        'num_month': 6,
        'fact': 0.2804
    },
    {
        'num_month': 7,
        'fact': 0.2643
    }
])

In [255]:
fact_df.set_index('num_month').head()

Unnamed: 0_level_0,fact
num_month,Unnamed: 1_level_1
0,1.0
1,0.5432
2,0.3874
3,0.345
4,0.3208


In [256]:
def get_mse_for_retention(params):
    tmp_df = fact_df.copy()
    tmp_df['prediction'] = tmp_df.index.map(
        lambda x: get_retention_same_event(params[0], params[1], params[2], x)
    )
    
    tmp_df['se'] = (tmp_df.fact - tmp_df.prediction)
    tmp_df['se'] = tmp_df['se']**2
    
    return tmp_df.se.mean() ** 0.5

In [257]:
get_mse_for_retention([0, 1, 2])

0.2128505330287813

In [258]:
result = scipy.optimize.minimize(get_mse_for_retention, [0, 1, 2])

In [259]:
result

      fun: 0.008183107374475135
 hess_inv: array([[ 0.31646083,  2.3298383 ,  2.63636092],
       [ 2.3298383 , 23.07012695, 16.84567771],
       [ 2.63636092, 16.84567771, 24.74404368]])
      jac: array([-6.95162453e-06, -5.59922773e-06,  8.23126175e-06])
  message: 'Optimization terminated successfully.'
     nfev: 100
      nit: 20
     njev: 25
   status: 0
  success: True
        x: array([0.23635088, 1.98512186, 1.28231184])

In [269]:
ext_fact_df = pd.concat([fact_df, pd.DataFrame({'num_month': range(8, 15)})])

In [270]:
ext_fact_df['prediction'] = list(map(
    lambda x: get_retention_same_event(result.x[0], result.x[1], result.x[2], x),
    ext_fact_df.num_month
))

In [271]:
px.line(ext_fact_df.set_index('num_month'))

In [278]:
import plotly.graph_objects as go
import plotly
fig = go.Figure()

fig.add_trace(go.Scatter(x=ext_fact_df.num_month, y=ext_fact_df.fact, name='fact',
                         line=dict(color=plotly.colors.qualitative.D3[0], width=3)))

fig.add_trace(go.Scatter(x=ext_fact_df.num_month, y=ext_fact_df.prediction, name='prediction',
                         line=dict(color=plotly.colors.qualitative.D3[0], width=3, dash='dot')))

fig.update_layout(title='Daily retention model',
                   yaxis_title='retention',
                   xaxis_title='# day')

### Cohort monitoring

In [368]:
raw_df = pd.read_csv('full_data.csv', sep = '\t')\
    .drop('Unnamed: 0', axis = 1)
raw_df.head()

Unnamed: 0,user_id,date,platform,week_date
0,1000001,2021-01-04,web,2021-01-04
1,1000001,2021-01-05,web,2021-01-04
2,1000001,2021-01-07,web,2021-01-04
3,1000001,2021-01-19,web,2021-01-18
4,1000001,2021-03-08,web,2021-03-08


In [369]:
raw_users_df = raw_df.groupby('user_id', as_index = False)[['date', 'week_date']].min()\
    .rename(columns = {'date': 'cohort_date', 'week_date': 'cohort_week_date'})

In [370]:
raw_df = raw_df.merge(raw_users_df)

In [371]:
raw_df['week_num'] = list(map(
    lambda x, y: (datetime.datetime.strptime(x, '%Y-%m-%d') - datetime.datetime.strptime(y, '%Y-%m-%d')).days,
    raw_df.week_date,
    raw_df.cohort_week_date
))

In [372]:
df = raw_df.groupby(['week_date', 'cohort_week_date'], as_index = False)[['user_id']].nunique()\
    .rename(columns = {'user_id': 'users'})

In [373]:
df['week_num'] = list(map(
    lambda x, y: (datetime.datetime.strptime(x, '%Y-%m-%d') - datetime.datetime.strptime(y, '%Y-%m-%d')).days/7,
    df.week_date,
    df.cohort_week_date
))

In [374]:
df = df.merge(df[df.week_num == 0][['cohort_week_date', 'users']].rename(columns = {'users': 'cohort_users'}))

In [375]:
df['retention'] = 100.*df.users/df.cohort_users

In [376]:
filt_df = df[(df.cohort_week_date > '2023-05-08')
            & (df.week_date < '2023-09-01')]

In [377]:
px.line(
    filt_df.pivot(index = 'week_date', columns = 'cohort_week_date',
             values = 'retention'),
    title = 'Weekly retention by cohort',
    labels = {
        'cohort_week_date': 'cohort',
        'week_date': 'week',
        'value': 'retention, %'
    }
)

In [378]:
px.line(
    filt_df.pivot(index = 'week_num', columns = 'cohort_week_date',
             values = 'retention'),
    title = 'Weekly retention by cohort',
    labels = {
        'cohort_week_date': 'cohort',
        'week_num': '# week',
        'value': 'retention, %'
    }
)

In [379]:
filt_df = df[(df.cohort_week_date > '2022-09-01')
            & (df.week_date < '2023-09-01')]
filt_df = filt_df[filt_df.week_num.isin([1, 3, 5, 20])]

filt_df['week_num'] = filt_df['week_num'].map(int)

In [381]:
px.line(filt_df.pivot(index = 'cohort_week_date',
                          columns = 'week_num',
                          values = 'retention'),
       title = 'Monitoring retention',
       labels = {
           'value': 'retention, %',
           'week_num': '# week',
           'cohort_week_date': 'cohort week'
       })