In [6]:
# Import to connect to SSH
# Import to run LTV code and import data from postgresql
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import calendar
from tqdm import tqdm
from scipy.optimize import curve_fit, OptimizeWarning
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import plotly.express as px

# %matplotlib notebook
%matplotlib inline

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100


def get_ut(year, month, day):
    return  calendar.timegm(datetime(year, month, day).timetuple())

def func(x, a, b, c):
    return 50*np.tanh(a)*np.log(x + 500*np.tanh(b)) + 100*np.tanh(c)
#     return a*np.log(x + 100*np.tanh(b)) + c
#     return a*np.exp(-0.001*np.tanh(b)/x) + c

In [7]:
%%time
df = pd.read_csv("test.csv",sep='\t')

df = df.rename(columns={"Date" : 'PAY_TIME',
                   'Cohort' : 'LOGIN_FIRST',
                  'SumRevenue_clean' : 'PAY_AMOUNT',
                       'BinDeviceID' : 'USER_ID'})

df = df[df['_ms_common']!='органика']
df = df[df['PlatformType']=='ios']
df['LOGIN_FIRST'] = pd.to_datetime(df['LOGIN_FIRST'], format='%d.%m.%Y %H:%M')
df['PAY_TIME'] = pd.to_datetime(df['PAY_TIME'], format='%d.%m.%Y %H:%M')

df['LOGIN_FIRST_TS'] = pd.to_timedelta(df['LOGIN_FIRST'], unit='ns').dt.total_seconds().astype(int)
df['PAY_TIME_TS'] = pd.to_timedelta(df['PAY_TIME'], unit='ns').dt.total_seconds().astype(int)
df['PAY_AMOUNT'] = df['PAY_AMOUNT'].apply(lambda x: x.replace(',', '.')).astype(float)

users = df[['USER_ID', 'LOGIN_FIRST_TS', 'LOGIN_FIRST', 'Country']].drop_duplicates().reset_index().drop("index", axis=1)
pays = df[['USER_ID', 'PAY_TIME_TS', 'PAY_TIME', 'PAY_AMOUNT', 'LOGIN_FIRST_TS', 'LOGIN_FIRST']]\
.copy().reset_index().drop("index", axis=1)
users = users[users['Country'].isin(['US', 'UK', 'GB', 'FR', 'AU', 'NZ', 'CA'])]

Wall time: 412 ms


In [45]:
tmp = users.merge(pays)
tmp['DATE_DIFF'] = (tmp['PAY_TIME_TS'] - tmp['LOGIN_FIRST_TS']) // (3600*24) + 1

tmp['MAX_DAYS'] = tmp['PAY_TIME_TS'].max()
tmp['MAX_DAYS'] = (tmp['MAX_DAYS'] - tmp['LOGIN_FIRST_TS']) // (3600*24) + 1

month_arr = tmp[['LOGIN_FIRST']].drop_duplicates().sort_values('LOGIN_FIRST').reset_index(drop=True)
coh_list = list(range(1, len(month_arr) + 1))
month_arr['COH_NUMBER'] = coh_list
tmp = tmp.merge(month_arr)

MAX_DAYS = tmp['MAX_DAYS'].max()
MAX_DAYS = list(range(1, MAX_DAYS+1))

In [46]:
lag_list = np.arange(0, tmp['DATE_DIFF'].max(), 1)
lag_list = [int(x) for x in lag_list]

In [47]:
ltv = []
sigma = []
for lag in tqdm(lag_list):
    cnt = []
    pay_sum = []
    for i in coh_list:
        start_date = tmp[tmp['COH_NUMBER']==i]['LOGIN_FIRST'].max()
        start_date = start_date.to_datetime()
        end_date = start_date + timedelta(days=lag)
        cond1 = (tmp['LOGIN_FIRST']<=end_date)&(tmp['LOGIN_FIRST']>=start_date)
        cond2 = (tmp['PAY_TIME']<=end_date)&(tmp['PAY_TIME']>=start_date)
        tmp_tmp = tmp[cond1][cond2]
        a, b = tmp_tmp['PAY_AMOUNT'].sum(), tmp[cond1]['USER_ID'].nunique()
        pay_sum.append(a)
        cnt.append(b)
    ltv.append(sum(pay_sum) / sum(cnt))
    sigma.append(sum(cnt))


to_datetime is deprecated. Use self.to_pydatetime()


Boolean Series key will be reindexed to match DataFrame index.

100%|████████████████████████████████████████████████████████████████████████████████| 236/236 [04:21<00:00,  1.16s/it]


In [48]:
# ltv = [2.394965189873417,4.473349683544303,6.0143655063291135,7.230449367088608,8.363620253164559,9.435501582278484,10.313199367088611,11.20919778481013,11.996272151898737,12.741471518987344,13.386672468354433,14.033030063291143,14.666186708860764,15.243207278481016,15.735707278481016,16.34329430379747,16.878909810126586,17.3104920886076,17.83744462025317,18.23846044303798,18.652492088607598,19.08042721518988,19.498849683544307,19.8916170886076,20.35100949367089,20.699968354430382,21.016333860759495,21.3424003164557,21.680112341772155,21.99005696202532,22.341631329113927,22.698920886075953,22.981393987341775,23.299433606993123,23.62897634106703,23.953468404559093,24.242697339376264,24.49470370012138,24.75932279103047,25.02306380149929,25.24145933578956,25.472353952577883,25.74666855161832,25.974548725412056,26.282020121293062,26.528305409335292,26.70569950935834,26.922100249756305,27.136550039574146,27.318670558442072,27.527046212269937,27.730417640841367,27.914492404935245,28.13902221754172,28.302469986055698,28.45960421091949,28.636831563053256,28.860256003586322,29.03552509817408,29.225692657470177,29.418763007943443,29.547905718151725,29.71349958096761,29.853671994760713,29.99342746921327,30.179772928517117,30.297306911657266,30.410357376464702,30.530327154480215,30.667919079110092,30.879809382468707,31.055294644477442,31.210489578398146,31.36221172382253,31.50048598893623,31.661011644262498,31.76200520709485,31.970373909599775,32.120171542068775,32.25559263935384,32.43224566578181,32.52587615011229,32.685368427000164,32.7864358427305,32.92126333911255,33.02084536354536,33.094020305134144,33.317905315401084,33.41383131775957,33.51049640511521,33.60789069592699,33.71755178879053,33.81497371071245,33.90025641113439,34.05602974446773,34.18608855007407,34.305920657917206,34.437700454652656,34.509424464553646,34.62733934495129,34.70000039364043,34.80853654387517,34.88156137102791,34.968657270081536,35.06488237283209,35.1631334839432,35.248863971359576,35.35521228596632,35.454569614111584,35.548467033255655,35.64220586374186,35.75556883894847,35.82906534244497,35.9033649063497,36.00121828472808,36.11734697742757,36.192130788902986,36.26515444334248,36.34934036952392,36.445225331408885,36.51786858592905,36.5966388135998,36.67811851709433,36.745598916239025,36.83918183081189,36.94304335262942,36.98930810135484,37.083075244735845,37.12343397254649,37.20027766439614,37.238529632900075,37.28483447404985,37.352092316436995,37.40841331836378,37.497972382691266,37.548397747007925,37.615084154003156,37.67485056560203,37.70992994996817,37.76490667516213,37.817465850419865,37.87386885417456,37.90760956363402,37.99795116260794,38.085766860786606,38.12927696623476,38.291700537346856,38.36525544545981,38.421011774573735,38.447296774573736,38.475316102691544,38.506178847789585,38.53971036237096,38.600852337679605,38.656969450490315,38.703283091220385,38.89719841968754,38.92986966600746,38.974029903396186,39.02335791934833,39.06212796473411,39.1099814743984,39.15944309212123,39.1864337220015,39.22826740732357,39.23214878145132,39.26649575362808,39.32932172047891,39.367134397332336,39.376911920268114,39.38155278173494,39.43480159970184,39.46299569252885,39.5304140411527,39.60470099767444,39.685528017876464,39.698802025558926,39.70199787510497,39.72755547879161,39.72848725136686,39.74415618869656,39.75587410221749,39.8026576791138,39.83081498248459,39.85305144972106,39.87012001281369,39.87824001281369,39.89319646927015,39.90642824210208,39.94440174171237,39.96065512816655,39.97922402421812,39.98841908594652,40.00949033276586,40.02158565338977,40.0350852120835,40.0350852120835,40.03667731734666,40.05117295265219,40.057013230429966,40.08432977719256,40.096212631292666,40.11302793183911,40.11302793183911,40.12762608426406,40.150702916415355,40.17787486763487,40.20202486763487,40.23382116393117,40.240450743307015,40.240450743307015,40.25387086165021,40.32410742079,40.34391284343753,40.386797900908796,40.405753758294196,40.42914400219664,40.449486200778196,40.4894237007782,40.4894237007782,40.51763383211028,40.51763383211028,40.51763383211028,40.51763383211028,40.5316037245834,40.5347130995834,40.5347130995834,40.5347130995834,40.5347130995834,40.53923582685613,40.57695525131656,40.57695525131656,40.57695525131656,40.61395525131656,40.6225012635865,40.63259546648505,40.63259546648505,40.63259546648505,40.63259546648505,40.63259546648505,40.63259546648505,40.63259546648505]

In [50]:
sigma

[2097,
 4176,
 6251,
 8317,
 10373,
 12417,
 14455,
 16477,
 18492,
 20505,
 22504,
 24497,
 26481,
 28461,
 30441,
 32413,
 34370,
 36311,
 38239,
 40170,
 42104,
 44029,
 45946,
 47874,
 49801,
 51721,
 53647,
 55566,
 57472,
 59361,
 61239,
 63098,
 64947,
 66781,
 68601,
 70407,
 72201,
 73989,
 75767,
 77536,
 79299,
 81057,
 82796,
 84519,
 86223,
 87922,
 89606,
 91268,
 92920,
 94576,
 96212,
 97846,
 99478,
 101102,
 102718,
 104334,
 105935,
 107529,
 109113,
 110686,
 112240,
 113784,
 115314,
 116832,
 118334,
 119817,
 121281,
 122735,
 124170,
 125588,
 126992,
 128369,
 129734,
 131073,
 132409,
 133724,
 135018,
 136309,
 137588,
 138857,
 140120,
 141368,
 142613,
 143851,
 145085,
 146301,
 147518,
 148720,
 149911,
 151094,
 152264,
 153414,
 154546,
 155673,
 156794,
 157907,
 159007,
 160100,
 161180,
 162252,
 163310,
 164355,
 165397,
 166429,
 167449,
 168455,
 169451,
 170443,
 171433,
 172418,
 173382,
 174328,
 175265,
 176187,
 177100,
 178009,
 178910,
 179

In [53]:
def func(x, a, b, c):
    return 1000*np.tanh(a)*np.log(x + 500*b) + 500*np.tanh(c)
#     return 100*a*np.log(x + 10000*np.tanh(b)) + c
#     return a*np.exp(-0.001*np.tanh(b)/x) + c
#     return 0.001*a*np.sqrt(x + b) + c
#     return 0.001*a*np.exp2(x + b) + c
# lag_list = [x+1 for x in range(len(ltv))]
popt, pcov = curve_fit(func, lag_list, ltv,
              sigma=[x**0.2 for x in sigma], absolute_sigma=True, maxfev = 100000, p0=(0, 1, -1))

days = np.linspace(0, 720, 721)
LTV = [func(x, *popt) for x in days]
days = [int(x) for x in days]

fig = px.line()
fig.add_scatter(x=days, y=LTV, mode='lines', name="Predict")
# Show plot
fig.add_scatter(x=days, y=ltv, mode='lines', name="Real")
fig.update_yaxes(range=[0,max(LTV)*1.1], row=1, col=1)

fig.update_layout(
    title='asdasd',
    xaxis_title="Days Passed",
    yaxis_title="$")
fig.update_layout(hovermode='x')
fig.show()


invalid value encountered in log



In [56]:
LTV

[4.3085450129346725,
 5.13185226262307,
 5.927761250922856,
 6.698036935482904,
 7.444279038100618,
 8.167942047963109,
 8.870352286637655,
 9.55272253797989,
 10.216164648383312,
 10.861700426166365,
 11.49027110838125,
 12.102745615221238,
 12.699927773707302,
 13.282562661344343,
 13.851342195345723,
 14.40691007259558,
 14.949866148802585,
 15.480770331551042,
 16.000146050596072,
 16.50848335932433,
 17.006241713443316,
 17.493852466388304,
 17.97172111541167,
 18.440229327661314,
 18.89973677161275,
 19.350582775872482,
 19.793087834517806,
 20.227554975701324,
 20.654271008158645,
 21.073507658461722,
 21.485522610311122,
 21.890560455822126,
 22.288853567598096,
 22.680622899377468,
 23.066078722161024,
 23.445421301959897,
 23.818841524633584,
 24.18652147269836,
 24.548634958469364,
 24.905348017444254,
 25.256819365434083,
 25.603200822591447,
 25.94463770717185,
 26.281269201583896,
 26.613228693036532,
 26.94064409086934,
 27.2636381224559,
 27.58232860939279,
 27.89682872

In [201]:
# cnt = []
# pay_sum = []
# ltv = []
# for i in coh_list:
#     a = tmp[tmp['COH_NUMBER']==i]['USER_ID'].nunique()
#     b = tmp[tmp['COH_NUMBER']==i][tmp['DATE_DIFF']<=30].sort_values('DATE_DIFF')['PAY_AMOUNT'].sum()
#     cnt.append(a)
#     pay_sum.append(b)
#     ltv.append(b / a)

In [172]:
# from tqdm import tqdm
# import plotly.express as px

# ltv, pay_sum, cnt = [], [], []
# for i in tqdm(MAX_DAYS):
#     #i = MAX_DAYS[1]
#     tmp_tmp = tmp[tmp['DATE_DIFF']<=i][tmp['MAX_DAYS']>=i]
#     tmp_cnt = tmp[tmp['MAX_DAYS']>=i].groupby('COH_NUMBER').agg({'USER_ID': 'nunique'}).reset_index()
#     tmp_tmp_tmp = tmp_tmp.groupby('COH_NUMBER').agg({'PAY_AMOUNT' : "sum"}).reset_index()
#     stat = tmp_cnt.merge(tmp_tmp_tmp, how='left').fillna(0)
#     stat['PAY_AMOUNT'].sum(), stat['USER_ID'].sum(), stat['PAY_AMOUNT'].sum() / stat['USER_ID'].sum()
#     ltv.append(stat['PAY_AMOUNT'].sum() / stat['USER_ID'].sum())
#     pay_sum.append(stat['PAY_AMOUNT'].sum())
#     cnt.append(stat['USER_ID'].sum())



