In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import random
pio.templates.default = 'simple_white'
import datetime
import tqdm

### Modelling retention

In [2]:
def get_retention(a, b, c, d, periods):
    return  a + 1./(b + c*periods ** d)

def get_retention_same_event(a, c, d, periods):
    b = 1./(1 - a)
    return get_retention(a, b, c, d, periods)

In [3]:
sample_df = pd.DataFrame()
sample_df['periods'] = range(30)

In [18]:
sample_df = pd.DataFrame()
sample_df['periods'] = range(24)

product_coefs = [
    (0, 0.55, 2),
    (0.02, 0.3, 1.6),
    (0.04, 0.3, 1.5),
    (0.01, 1.3, 1)
]

for i in range(len(product_coefs)):
    sample_df['product' + str(i + 1)] = sample_df.periods.map(
        lambda x: get_retention_same_event(product_coefs[i][0], product_coefs[i][1],
                                           product_coefs[i][2], x)
    )
    
sample_df = sample_df.set_index('periods')
px.line(sample_df.applymap(lambda x: None if x < 0.01 else 100*x).loc[0:],
       title = 'Monthly retention',
       labels = {'value': 'retention, %',
                'periods': '# month',
                'variable': 'product'})

### Modelling new users

In [10]:
weekly_coefs = {
    0: 1.0, 
    1: 0.9942430174015676,
    2: 0.9820212874783774,
    3: 0.9790313740157027,
    4: 0.9385562774857475,
    5: 0.7855713201801697,
    6: 0.8163537550287501
}

In [11]:
new_users_df1 = pd.DataFrame()
new_users_df1['date'] = pd.date_range('2021-01-01', '2023-12-31')
new_users_df1['x'] = range(new_users_df1.shape[0])

new_users_df1['trend'] = new_users_df1.x.map(
    lambda x: 1/(0.0036 + (x + 1) ** -1.3)
)
new_users_df1.drop('x', axis = 1, inplace = True)

def get_new_users(date, trend):
    return int((weekly_coefs[date.weekday()] + 0.1*random.random())*trend)

new_users_df1['new_users'] = list(map(
    get_new_users,
    new_users_df1.date,
    new_users_df1.trend
))

In [12]:
px.line(new_users_df1.set_index('date'))

In [13]:
new_users_df2 = pd.DataFrame()
new_users_df2['date'] = pd.date_range('2023-02-14', '2023-12-31')
new_users_df2['x'] = range(new_users_df2.shape[0])

new_users_df2['trend'] = new_users_df2.x.map(
    lambda x: 1/(0.0003 + (x + 1) ** -1.25)
)
new_users_df2.drop('x', axis = 1, inplace = True)

new_users_df2['new_users'] = list(map(
    get_new_users,
    new_users_df2.date,
    new_users_df2.trend
))

In [15]:
px.line(new_users_df2.set_index('date'))

In [16]:
new_users_df3 = pd.DataFrame()
new_users_df3['date'] = pd.date_range('2022-02-24', '2023-12-31')
new_users_df3['x'] = range(new_users_df3.shape[0])

new_users_df3['trend'] = new_users_df3.x.map(
    lambda x: 1/(0.0023 + (x + 1) ** -1.8)
)
new_users_df3.drop('x', axis = 1, inplace = True)

new_users_df3['new_users'] = list(map(
    get_new_users,
    new_users_df3.date,
    new_users_df3.trend
))

In [17]:
px.line(new_users_df3.set_index('date'))

### Modelling data

In [50]:
users_lst1 = []
last_id = 1

for rec in new_users_df1.to_dict('records'):
    for _ in range(rec['new_users']):
        users_lst1.append(
            {
                'user_id': last_id,
                'cohort': rec['date']
            }
        )
        last_id += 1
        
user_activity1 = []
for rec in tqdm.tqdm(users_lst1):
    user_id = rec['user_id']
    cohort = rec['cohort']
    for date in pd.date_range(rec['cohort'], '2023-12-31'):
        num_day = (date - rec['cohort']).days
        if cohort < datetime.datetime(2022, 2, 24):
            params = (0.01, 0.3, 1.6)
        elif cohort < datetime.datetime(2022, 12, 18):
            params = (0.02, 0.3, 1.6)
        else:
            params = (0.03, 0.3, 1.4)
        if random.random() <= get_retention_same_event(params[0], params[1], params[2], num_day) \
                * weekly_coefs[date.weekday()] * (1 + (random.random() - 0.5)*2*0.3):
            user_activity1.append(
                {
                    'user_id': user_id,
                    'date': date
                }
            )

act_df1 = pd.DataFrame(user_activity1)

100%|██████████████████████████████████| 252287/252287 [05:46<00:00, 727.55it/s]


In [51]:
px.line(act_df1.groupby('date')[['user_id']].count())

In [52]:
act_df1.shape[0]

3104551

In [54]:
users_lst2 = []
last_id = 1

for rec in new_users_df2.to_dict('records'):
    for _ in range(rec['new_users']):
        users_lst2.append(
            {
                'user_id': last_id,
                'cohort': rec['date']
            }
        )
        last_id += 1
        
user_activity2 = []
for rec in tqdm.tqdm(users_lst2):
    user_id = rec['user_id']
    cohort = rec['cohort']
    for date in pd.date_range(rec['cohort'], '2023-12-31'):
        num_day = (date - rec['cohort']).days
        params = (0.0, 0.55, 1.1)
        if random.random() <= get_retention_same_event(params[0], params[1], params[2], num_day) \
            * weekly_coefs[date.weekday()] * (1 + (random.random() - 0.5)*2*0.3):
            
            user_activity2.append(
                {
                    'user_id': user_id,
                    'date': date
                }
            )

act_df2 = pd.DataFrame(user_activity2)

100%|█████████████████████████████████| 151359/151359 [00:39<00:00, 3851.32it/s]


In [55]:
px.line(act_df2.groupby('date')[['user_id']].count())

In [56]:
act_df2.shape[0]

851837

In [57]:
users_lst3 = []
last_id = 1

for rec in new_users_df3.to_dict('records'):
    for _ in range(rec['new_users']):
        users_lst3.append(
            {
                'user_id': last_id,
                'cohort': rec['date']
            }
        )
        last_id += 1
        
user_activity3 = []
for rec in tqdm.tqdm(users_lst3):
    user_id = rec['user_id']
    cohort = rec['cohort']
    for date in pd.date_range(rec['cohort'], '2023-12-31'):
        num_day = (date - rec['cohort']).days
        params = (0.01, 1.3, 1)
        if random.random() <= get_retention_same_event(params[0], params[1], params[2], num_day) \
            * weekly_coefs[date.weekday()] * (1 + (random.random() - 0.5)*2*0.3):
            
            user_activity3.append(
                {
                    'user_id': user_id,
                    'date': date
                }
            )

act_df3 = pd.DataFrame(user_activity3)

100%|█████████████████████████████████| 266868/266868 [03:05<00:00, 1442.40it/s]


In [58]:
px.line(act_df3.groupby('date')[['user_id']].count())

In [59]:
act_df1['user_id'] = act_df1['user_id'] + 10**6
act_df2['user_id'] = act_df1['user_id'] + 10**6*2
act_df3['user_id'] = act_df1['user_id'] + 10**6*3

In [60]:
act_df1['platform'] = 'web'
act_df2['platform'] = 'ios'
act_df3['platform'] = 'android'

In [61]:
act_df1.shape[0], act_df2.shape[0], act_df3.shape[0]

(3104551, 851837, 2010688)

In [62]:
act_df = pd.concat(
    [act_df1, act_df2, act_df3]
)

In [63]:
px.line(act_df.groupby('date')[['user_id']].count())

In [65]:
act_df.to_csv('full_data.csv', sep = '\t')

In [66]:
act_df.shape[0]

5967076

In [67]:
! ls -l -h

total 658312
-rw-r--r--@ 1 mariia.mansurova  staff   181M 26 Aug 12:59 full_data.csv
-rw-r--r--  1 mariia.mansurova  staff   704K 26 Aug 12:57 generate_daily_synthetic_data.ipynb
-rw-r--r--  1 mariia.mansurova  staff   300K 22 Jul 22:44 raw_growth_model.ipynb
-rw-r--r--  1 mariia.mansurova  staff   455K 26 Aug 12:49 retention_curves.ipynb
-rw-r--r--  1 mariia.mansurova  staff   127M 26 Aug 12:43 weekly_data.csv


In [68]:
days_df = act_df[['date']].drop_duplicates()
days_df.shape[0]

1094

In [69]:
def get_week_start(d):
    dd = d #.datetime.strptime(d, '%Y-%m-%d')
    return (dd - datetime.timedelta(days = dd.weekday())).strftime('%Y-%m-%d')

days_df['week_date'] = days_df.date.map(
    get_week_start
)

In [70]:
mrg_act_df = act_df.merge(days_df, how = 'left')

In [71]:
mrg_act_df.shape[0]

5967076

In [72]:
%%time 
mrg_act_df.to_csv('full_data.csv', sep = '\t')

CPU times: user 7.79 s, sys: 184 ms, total: 7.97 s
Wall time: 8.66 s


In [73]:
px.area(act_df.pivot_table(index = 'date', columns = 'platform',
                  values = 'user_id', aggfunc = 'nunique'))

In [74]:
px.area(mrg_act_df.pivot_table(index = 'week_date', columns = 'platform',
                  values = 'user_id', aggfunc = 'nunique'))

In [75]:
cohorts_df = mrg_act_df.groupby('user_id', as_index = False).aggregate({'week_date': 'min'})\
    .rename(columns = {'week_date': 'cohort_week'})

In [76]:
week_act_df = mrg_act_df.merge(cohorts_df, how = 'left')

In [77]:
week_act_df = week_act_df[['user_id', 'platform', 'week_date', 'cohort_week']].drop_duplicates()

In [78]:
week_act_df.shape[0]

4274489

In [79]:
week_act_df.to_csv('weekly_data.csv', index = False)

In [80]:
! ls -l -h

total 856032
-rw-r--r--@ 1 mariia.mansurova  staff   246M 26 Aug 12:59 full_data.csv
-rw-r--r--  1 mariia.mansurova  staff   1.2M 26 Aug 12:59 generate_daily_synthetic_data.ipynb
-rw-r--r--  1 mariia.mansurova  staff   300K 22 Jul 22:44 raw_growth_model.ipynb
-rw-r--r--  1 mariia.mansurova  staff   455K 26 Aug 12:49 retention_curves.ipynb
-rw-r--r--  1 mariia.mansurova  staff   144M 26 Aug 13:00 weekly_data.csv
