In [100]:
from datetime import datetime
import numpy as np
import pandas as pd
import calendar
from scipy.optimize import curve_fit, OptimizeWarning
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import plotly.express as px
%matplotlib inline

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100


def get_ut(year, month, day):
    """Return the Unix timestamp (whole seconds) for midnight of the given date.

    The date is interpreted as UTC because ``calendar.timegm`` is used for the
    conversion. Invalid dates raise ``ValueError`` from ``datetime``.
    """
    midnight = datetime(year, month, day)
    return calendar.timegm(midnight.timetuple())

# Cohort width, in days: get_cohs buckets registrations into cohorts of this many days.
COH_SIZE = 1

def func(x, a, b, c):
    """Logarithmic LTV model: ``a * log(x + 100*tanh(b)) + c``.

    ``x`` is the number of days passed (scalar or NumPy array); ``a``, ``b``
    and ``c`` are the parameters fitted by ``curve_fit``. The horizontal shift
    is squashed through ``tanh`` so it stays inside (-100, 100).
    """
    # Earlier experiments (previously kept as commented-out lines) also bounded
    # `a` and `c` with tanh, or used an exponential form instead of the log.
    shift = 100*np.tanh(b)
    return a*np.log(x + shift) + c




def plot_ltv(users, pays):
    """Fit the LTV model, plot observed vs. predicted LTV and build a summary table.

    Parameters
    ----------
    users, pays : pandas.DataFrame
        Passed straight through to ``predict_LTV``.

    Returns
    -------
    stats : pandas.DataFrame
        One row per forecast day with the predicted LTV, the observed LTV
        (NaN where no observation exists) and the ratio of the last forecast
        value to that day's forecast.
    real, predict, sigma : list
        The three lists returned by ``predict_LTV``, unchanged.
    """
    # Function-scope import kept from the original so the block is self-contained;
    # it duplicates the notebook-level import.
    import plotly.express as px

    real, predict, sigma = predict_LTV(users, pays)
    horizon = len(predict)

    fig = px.line()
    # Each trace gets an x axis of its own length: the observed curve and the
    # forecast generally cover a different number of days.
    fig.add_scatter(x=list(range(len(real))), y=real, mode='lines', name="Real")
    fig.add_scatter(x=list(range(horizon)), y=predict, mode='lines', name="Predict")
    fig.update_yaxes(range=[0, max(predict)*1.1], row=1, col=1)
    fig.update_layout(
        title='LTV',
        xaxis_title="Days Passed",
        yaxis_title="$",
        hovermode='x')
    fig.show()

    stats = pd.DataFrame()
    stats['День'] = list(range(horizon))
    stats['Прогнозное_значение'] = predict
    # Pad the observed curve with None up to the forecast horizon, and truncate
    # it if the history is longer than the forecast. The original padding only
    # handled the "shorter" case and raised a length-mismatch error otherwise.
    stats['Реальное_значение_LTV'] = (list(real) + [None]*horizon)[:horizon]
    # Multiplier that takes a given day's forecast to the last forecast day.
    stats['Коэффициент_от_721_дня_прогноз'] = predict[-1] / stats['Прогнозное_значение']
    return stats, real, predict, sigma

def get_cohs(profiles, payments, coh_size_days=None):
    """Split users into registration cohorts and aggregate payments per cohort day.

    Parameters
    ----------
    profiles : pandas.DataFrame
        Must contain ``USER_ID`` and ``REG_DATE_TS`` (timestamp in seconds).
        The frame is NOT modified; the cohort column is added to a copy.
    payments : pandas.DataFrame
        Must contain ``USER_ID``, ``PAY_AMOUNT`` and ``PAY_DATE_TS`` (seconds).
    coh_size_days : int, optional
        Cohort width in days. Defaults to the notebook-level ``COH_SIZE``.

    Returns
    -------
    project : pandas.DataFrame
        ``profiles`` inner-joined with ``payments`` on ``USER_ID``, plus
        ``DAYS_PASSED`` (whole days between registration and payment).
    cohs_sizes : pandas.DataFrame
        One row per cohort: ``COH``, ``COH_SIZE`` (unique users) and ``MAX_DAY``
        (whole days between the cohort's latest registration and the last
        timestamp seen in the data).
    project_sum : pandas.DataFrame
        One row per (``COH``, ``DAYS_PASSED``) for every day 0..``MAX_DAY`` of
        each cohort, with ``COH_SIZE`` and the summed ``PAY_AMOUNT`` (0 where
        nothing was paid).
    """
    if coh_size_days is None:
        coh_size_days = COH_SIZE
    day_seconds = 24*3600

    # assign() returns a new frame, so the caller's `profiles` is left untouched
    # (the original wrote the COH column into the argument).
    first_reg = profiles['REG_DATE_TS'].min()
    profiles = profiles.assign(
        COH=(profiles['REG_DATE_TS'] - first_reg) // (coh_size_days*day_seconds))
    profiles = profiles.loc[profiles['COH'] >= 0]

    cohs_sizes = (
        profiles.groupby('COH')
        .agg({'USER_ID': 'nunique', 'REG_DATE_TS': 'max'})
        .reset_index()
        .rename(index=str, columns={'USER_ID': 'COH_SIZE'})
    )
    # Series.max() skips NaN, unlike the builtin max(): with no payment
    # timestamps at all the builtin returned NaN and int(MAX_DAY) below failed.
    last_ts = pd.Series([payments['PAY_DATE_TS'].max(),
                         cohs_sizes['REG_DATE_TS'].max()]).max()
    cohs_sizes['LAST_DAY'] = last_ts
    cohs_sizes['MAX_DAY'] = (cohs_sizes['LAST_DAY'] - cohs_sizes['REG_DATE_TS']) // day_seconds
    cohs_sizes = cohs_sizes[['COH', 'COH_SIZE', 'MAX_DAY']]

    # Zero-filled grid: one row for every day each cohort has lived so far, so
    # that days without payments still appear (with PAY_AMOUNT == 0) later on.
    zeros_df = pd.concat([
        pd.DataFrame({'COH': row.COH,
                      'COH_SIZE': row.COH_SIZE,
                      'DAYS_PASSED': list(range(int(row.MAX_DAY) + 1))})
        for row in cohs_sizes.itertuples(index=False)
    ])

    project = pd.merge(profiles, payments, on='USER_ID')
    project['DAYS_PASSED'] = (project['PAY_DATE_TS'] - project['REG_DATE_TS']) // day_seconds
    project_sum = project.groupby(['COH', 'DAYS_PASSED'])['PAY_AMOUNT'].sum().reset_index()
    project_sum = pd.merge(zeros_df, project_sum, on=['COH', 'DAYS_PASSED'], how='left').fillna(0)
    return project, cohs_sizes, project_sum
    
    
def predict_LTV(profiles, payments, horizon_days=720):
    """Fit the LTV model to the pooled cohort curve and extrapolate it.

    The observed curve is built by pooling all cohorts per day since
    registration: ARPU(day) = total payments / total cohort size for that day,
    and LTV is the running sum of ARPU. The module-level ``func`` is then
    fitted to that curve with ``scipy.optimize.curve_fit``. ``func`` is looked
    up when this function runs, so whichever definition was executed last in
    the notebook is the one that gets fitted.

    Parameters
    ----------
    profiles, payments : pandas.DataFrame
        Passed straight through to ``get_cohs``.
    horizon_days : int, optional
        Last day of the forecast; the forecast covers days 0..``horizon_days``
        inclusive (721 points with the default of 720).

    Returns
    -------
    real : list of float
        Observed cumulative LTV per day.
    predict : list of float
        Fitted model evaluated on days 0..``horizon_days``.
    sigma : list of float
        Per-day ``sigma`` values handed to ``curve_fit``.

    Raises
    ------
    ValueError
        If the cohorts contain no users, so there is nothing to fit. The
        original fell through to an UnboundLocalError on ``LTV`` in that case.
    """
    _, _, project_sum = get_cohs(profiles, payments)

    # Pooled ("averaged") curve across all cohorts.
    data_avg = (
        project_sum.groupby('DAYS_PASSED')
        .agg({'PAY_AMOUNT': 'sum', 'COH_SIZE': 'sum'})
        .reset_index()
    )
    data_avg['ARPU'] = data_avg['PAY_AMOUNT'] / data_avg['COH_SIZE']
    data_avg['LTV'] = data_avg['ARPU'].cumsum()
    # SIGMA acts as a weight: curve_fit gives less weight to points with larger
    # sigma, so days backed by more users influence the fit more. Because of
    # this weighting, small cohorts do not need to be dropped beforehand.
    data_avg['SIGMA'] = 1 / (data_avg['COH_SIZE'])**2

    n_players = int(project_sum[['COH', 'COH_SIZE']].drop_duplicates()['COH_SIZE'].sum())
    if n_players <= 0:
        raise ValueError('predict_LTV: no users in the cohorts, nothing to fit')

    popt, _ = curve_fit(func, data_avg['DAYS_PASSED'], data_avg['LTV'].astype('float'),
                        sigma=data_avg['SIGMA'], absolute_sigma=True,
                        maxfev=100000, p0=(1, 1, -1))

    # curve_fit already calls func on whole arrays, so the forecast can be
    # evaluated in one vectorized call as well.
    days = np.linspace(0, horizon_days, horizon_days + 1)
    LTV = func(days, *popt)
    return list(data_avg['LTV'].values), list(LTV), list(data_avg['SIGMA'])

In [101]:
# Load the test export; only the columns used by the LTV pipeline are read.
df = pd.read_csv(
    "test_transformed.csv",
    sep=',',
    encoding='cp1251',
    usecols=['REG_DATE_TS', 'USER_ID', 'COUNTRY', 'SOURCE', 'PAY_AMOUNT', 'PAY_DATE_TS'],
)

# Timestamps and amounts are forced to float so the integer-division and
# aggregation steps downstream all work on one numeric dtype.
for column in ('REG_DATE_TS', 'PAY_AMOUNT', 'PAY_DATE_TS'):
    df[column] = df[column].astype(float)
    
# %%time
def func(x, a, b, c):
    """Bounded logarithmic LTV model used for this run.

    Computes ``50*tanh(a) * log(x + 100000*tanh(b)) + 50*tanh(c)``. All three
    fitted parameters go through ``tanh``, so the slope and offset stay inside
    (-50, 50) and the horizontal shift inside (-100000, 100000).

    This redefinition shadows the earlier notebook-level ``func``;
    ``predict_LTV`` resolves ``func`` when it runs, so this is the model that
    gets fitted by the call below. ``np.log`` returns NaN (with a
    RuntimeWarning) whenever ``x + 100000*tanh(b)`` is negative.
    """
    slope = 50*np.tanh(a)
    shift = 100000*np.tanh(b)
    offset = 50*np.tanh(c)
    return slope*np.log(x + shift) + offset
# One row per distinct user profile, and the raw payment rows, then fit and plot.
users = df.loc[:, ['REG_DATE_TS', 'USER_ID', 'COUNTRY', 'SOURCE']].drop_duplicates()
pays = df.loc[:, ['USER_ID', 'PAY_AMOUNT', 'PAY_DATE_TS']]
stats, real, predict, sigma = plot_ltv(users, pays)

# real_new = real.copy()
# predict_new = predict[:len(real)]
# m_max = max(max(predict_new), max(real_new))

# real_new = [x / m_max for x in real_new]
# predict_new = [x / m_max for x in predict_new]

# min_sigma = min(sigma)
# new_sigma = [x/min_sigma for x in sigma]
# error = []
# for i in range(len(new_sigma)):
# #     print(i, abs(predict_new[i] - real_new[i]) * (1/(new_sigma[i] + i)**2))
#     error.append(abs(predict_new[i] - real_new[i]) * (1/(new_sigma[i] + i)**2))
# print(sum(error))

# for i in range(len(real_new)):
#     print(real[i], predict[i], error[i])


invalid value encountered in log



In [118]:
# Load the devtodev export. COUNTRY and SOURCE are not read for this file.
df = pd.read_csv(
    "devtodev_Sql_wizard_27082020_0600_transformed.csv",
    sep=',',
    encoding='cp1251',
    usecols=['REG_DATE_TS', 'USER_ID', 'PAY_AMOUNT', 'PAY_DATE_TS'],
)

# Timestamps are scaled down by 1e3 — presumably the export stores
# milliseconds while the pipeline works in seconds; TODO confirm.
for column in ('REG_DATE_TS', 'PAY_DATE_TS'):
    df[column] = df[column].astype('float64') / 1e3
    
# %%time
def func(x, a, b, c):
    """Bounded logarithmic LTV model: every fitted parameter is squashed by tanh.

    Returns ``50*tanh(a) * log(x + 100000*tanh(b)) + 50*tanh(c)`` for a scalar
    or NumPy-array ``x`` (days passed).

    Defining it here replaces any ``func`` executed earlier in the notebook,
    and ``predict_LTV`` fits whichever ``func`` is current when it is called.
    A negative ``x + 100000*tanh(b)`` makes ``np.log`` emit a RuntimeWarning
    and return NaN.
    """
    log_term = np.log(x + 100000*np.tanh(b))
    return 50*np.tanh(a)*log_term + 50*np.tanh(c)
# One row per distinct user profile, and the raw payment rows, then fit and plot.
users = df.loc[:, ['REG_DATE_TS', 'USER_ID']].drop_duplicates()
pays = df.loc[:, ['USER_ID', 'PAY_AMOUNT', 'PAY_DATE_TS']]
stats, real, predict, sigma = plot_ltv(users, pays)

# Compare the fit with the observed curve on the days that have observations,
# after scaling both curves by their common maximum.
real_new = list(real)
predict_new = predict[:len(real)]
m_max = max(max(predict_new), max(real_new))

real_new = [value / m_max for value in real_new]
predict_new = [value / m_max for value in predict_new]

# Sigmas relative to the smallest one, so the smallest becomes 1.
min_sigma = min(sigma)
new_sigma = [value/min_sigma for value in sigma]

# Weighted absolute error: each day's deviation is scaled by
# 1 / (relative sigma + day index)**2.
error = []
for i in range(len(new_sigma)):
    weight = 1/(new_sigma[i] + i)**2
    error.append(abs(predict_new[i] - real_new[i]) * weight)
print(sum(error))

# Per-day dump: observed LTV, predicted LTV, weighted error.
for i in range(len(real_new)):
    print(real[i], predict[i], error[i])


invalid value encountered in log



0.0019950977441068373
19.03303536775148 19.026861645731834 0.00022017346911495927
21.278636168736703 21.393554896164865 0.0008225593482667106
22.513029093165525 22.33540341245906 0.000544305681718941
23.00013111460665 22.933975839439455 0.0001212320702152022
23.31920455658196 23.373706738640454 5.518608120234197e-05
23.574169585967923 23.72145875001808 0.00010139209583459557
23.95002137982757 24.009132665171126 3.136944220122037e-05
24.243054713325108 24.254460663379863 4.2594661785924525e-06
24.591896145390326 24.468324664418432 3.4062652577537655e-05
24.65649677310461 24.65788560661335 3.0562078525985467e-07
24.78317351026473 24.828107650161826 7.979854839395721e-06
24.950094596065334 24.98257394280936 5.0326683909893e-06
25.112301436196912 25.123955683994158 1.4475405236574946e-06
25.23405207067967 25.254297535020385 2.124807548807798e-06
25.338235467231396 25.37519943138717 3.4904746958827727e-06
25.547928514431394 25.487936883528384 4.297803245606392e-06
25.743808514431393 25.5935

In [117]:
# Write the summary table returned by plot_ltv to an Excel file, without the row index.
stats.to_excel("Статистика.xlsx", index=False)

In [116]:
# Display the summary table returned by plot_ltv.
stats

Unnamed: 0,День,Прогнозное_значение,Реальное_значение_LTV,Коэффициент_от_721_дня_прогноз
0,0,19.026862,19.033035,1.677568
1,1,21.393555,21.278636,1.491984
2,2,22.335403,22.513029,1.42907
3,3,22.933976,23.000131,1.391771
4,4,23.373707,23.319205,1.365588
5,5,23.721459,23.57417,1.345569
6,6,24.009133,23.950021,1.329446
7,7,24.254461,24.243055,1.315999
8,8,24.468325,24.591896,1.304497
9,9,24.657886,24.656497,1.294468
