## Retentions

Давайте посчитаем классический retention по недельным когортам.

In [4]:
# Base URL of the ClickHouse HTTP endpoint; it is the default `host` of the query helpers below.
HOST = 'http://localhost:8123'
import requests
import pandas as pd
import io
import datetime
import seaborn as sns

# Widen pandas' display limits so wide result frames and long cell values are not truncated.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

def get_clickhouse_data(query, host = HOST, connection_timeout = 1500):
    """Run *query* against ClickHouse over HTTP and return the raw response text.

    Args:
        query: SQL text; it is sent as the ``query`` URL parameter of a POST request.
        host: URL of the ClickHouse HTTP endpoint.
        connection_timeout: Timeout value handed to ``requests.post``.

    Returns:
        The response body as a string.

    Raises:
        ValueError: If the server answers with any status other than 200;
            the exception message is the response body.
    """
    r = requests.post(host, params={'query': query}, timeout=connection_timeout)
    if r.status_code != 200:
        raise ValueError(r.text)
    return r.text
        
def get_clickhouse_df(query, host = HOST, connection_timeout = 1500):
    """Run *query* via ``get_clickhouse_data`` and parse the reply into a DataFrame.

    The response text is read as tab-separated values, with the first line
    used as the header (pandas' default), so the query is expected to end
    with a header-producing tab-separated FORMAT clause.
    """
    raw_text = get_clickhouse_data(query, host, connection_timeout)
    return pd.read_csv(io.StringIO(raw_text), sep='\t')

In [6]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
import requests
import pandas as pd

# Switch plotly into offline notebook mode so that `iplot` can render figures inline.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from the CDN instead of
# embedding it in the notebook, so rendering needs internet access - confirm this is intended.
init_notebook_mode(connected = True)

def plotly_df(df, title = ''):
    """Draw every column of *df* as a line trace on one inline plotly chart.

    The frame's index supplies the x values, each column supplies the y values
    of its own trace and the column label becomes the trace name. *title* is
    used as the chart title.
    """
    traces = [
        go.Scatter(x=df.index, y=df[col_name], mode='lines', name=col_name)
        for col_name in df.columns
    ]
    figure = {'data': traces, 'layout': {'title': title}}

    iplot(figure, show_link=False)

In [8]:
def highlight_vals(val):
    """Return a CSS ``background-color`` rule shading a cell by its percentage value.

    Intended as a cell-wise styling callback: larger values get a more
    saturated red. ``None`` and the empty string (used for missing cells)
    get no styling at all.

    Args:
        val: A number (a percentage), ``None`` or ``''``.

    Returns:
        ``''`` for missing cells, otherwise a string of the form
        ``'background-color: rgba(R, G, B, A)'``.
    """
    if val is None or val == '':
        return ''

    alpha = 0.5
    # (exclusive lower bound, RGB) pairs, checked from the highest band down.
    # The colours form a linear gradient; the 40+ band was previously
    # (242, 137, 127), i.e. green/blue swapped, which broke the gradient.
    bands = (
        (90, (229, 0, 20)),
        (80, (231, 25, 43)),
        (70, (234, 51, 67)),
        (60, (236, 76, 90)),
        (50, (239, 102, 114)),
        (40, (242, 127, 137)),
        (30, (244, 153, 161)),
        (20, (247, 178, 184)),
        (10, (249, 204, 208)),
    )
    lightest = (252, 229, 231)  # everything at or below 10

    for lower_bound, rgb in bands:
        if val > lower_bound:
            return 'background-color: rgba(%d, %d, %d, %f)' % (rgb + (alpha,))
    return 'background-color: rgba(%d, %d, %d, %f)' % (lightest + (alpha,))

In [18]:
# Bounds (both inclusive) on a client's first-seen week; the cohort queries
# below compare min_date against these two dates.
start_date = datetime.datetime(year=2017, month=8, day=28)
end_date = datetime.datetime(year=2017, month=10, day=1)

In [19]:
# Drop any retention_users table left over from an earlier run so that the
# CREATE TABLE in the next step does not fail on an existing table.
q = 'DROP TABLE IF EXISTS retention_users'
get_clickhouse_data(q)

''

Для начала сохраним в отдельную таблицу, в какие недели каждый пользователь появлялся на сервисе.

In [20]:
# Materialise one row per (client, week): every distinct pair of ClientID and
# the Monday of the week in which that client has a row in visits_all.
# Rows with client_id == 0 are excluded.
#
# NOTE(review): this cell used to call .format(start_date=..., end_date=...)
# on the query, but the SQL has no {start_date}/{end_date} placeholders, so the
# call was a no-op and has been removed. No date filter is applied here;
# presumably that is intentional, because the cohort queries below take
# min(date) per client over this table - confirm.
q = '''
    CREATE TABLE retention_users ENGINE = Log AS
        SELECT DISTINCT 
            ClientID as client_id, 
            toMonday(Date) as date
        FROM visits_all
        WHERE (client_id != 0)
'''

get_clickhouse_data(q)

''

In [21]:
# Sanity check of the freshly built table.
# count() is the number of rows in retention_users, i.e. distinct
# (client, week) pairs - the column is labelled `visits` but it does not
# count raw visits. uniq(client_id) is ClickHouse's approximate distinct count.
q = '''
    SELECT 
        count() as visits, 
        uniq(client_id) as users
    FROM retention_users 
    FORMAT TabSeparatedWithNames'''
get_clickhouse_df(q)

Unnamed: 0,visits,users
0,873866,620334


In [22]:
# Preview of the join that the retention queries are built on.
# The first subquery finds each client's first and last active week and keeps
# only clients whose first week lies within [start_date, end_date]; joining it
# back to retention_users yields one row per (client, active week).
# LIMIT 10 keeps the preview small.
q = '''
    SELECT 
        client_id,
        min_date, 
        max_date,
        date
    FROM
        (
            SELECT
                client_id,
                min(date) as min_date,
                max(date) as max_date
            FROM retention_users
            GROUP BY client_id
            HAVING (min_date <= '{end_date}') AND (min_date >= '{start_date}')
        )
        ALL INNER JOIN
        (
            SELECT 
                client_id,
                date
            FROM retention_users
        ) 
        USING client_id
    LIMIT 10
    FORMAT TabSeparatedWithNames
'''
# Fill in the cohort window as ISO dates (YYYY-MM-DD).
q = q.format(
    start_date=start_date.date().isoformat(),
    end_date=end_date.date().isoformat(),
)

get_clickhouse_df(q)

Unnamed: 0,client_id,min_date,max_date,date
0,1505921240378544247,2017-09-18,2017-09-18,2017-09-18
1,1503637663587884741,2017-09-25,2017-09-25,2017-09-25
2,1498210196563306890,2017-09-18,2017-09-18,2017-09-18
3,1501591697286644109,2017-09-04,2017-09-11,2017-09-04
4,1501591697286644109,2017-09-04,2017-09-11,2017-09-11
5,1501233998239305767,2017-09-25,2017-09-25,2017-09-25
6,1472573605844899677,2017-09-04,2017-09-04,2017-09-04
7,1492177010574599868,2017-09-18,2017-09-18,2017-09-18
8,1505214617414301078,2017-09-18,2017-09-18,2017-09-18
9,1470876305274807957,2017-09-11,2017-09-11,2017-09-11


### Классический retention

In [23]:


# Classic retention: for every cohort (min_date = the client's first active
# week) and every week offset week_num = (date - min_date) / 7, count the
# clients that were active in exactly that week.
q = '''
SELECT 
    uniq(client_id) as clients,
    min_date, 
    (date - min_date)/7 as week_num
FROM
    (
        SELECT
            client_id,
            min(date) as min_date,
            max(date) as max_date
        FROM retention_users
        GROUP BY client_id
        HAVING (min_date <= '{end_date}') AND (min_date >= '{start_date}')
    )
    ALL INNER JOIN
    (
        SELECT 
            client_id,
            date
        FROM retention_users
    ) 
    USING client_id
GROUP BY
    week_num,
    min_date
FORMAT TabSeparatedWithNames
'''
# Fill in the cohort window as ISO dates (YYYY-MM-DD).
q = q.format(
    start_date=start_date.date().isoformat(),
    end_date=end_date.date().isoformat(),
)

raw_ret_df = get_clickhouse_df(q)

In [24]:
# Peek at the first rows of the raw result (columns: clients, min_date, week_num).
raw_ret_df.head()

Unnamed: 0,clients,min_date,week_num
0,8953,2017-09-11,0
1,205,2017-08-28,4
2,281,2017-08-28,3
3,338,2017-08-28,2
4,587,2017-08-28,1


In [26]:
# Reshape the long result into a matrix: after the transpose, rows are week
# offsets and columns are cohorts. Cohort/week combinations that are absent
# from the query result become 0.
ret_df = (
    raw_ret_df
    .pivot_table(values='clients', index='min_date', columns='week_num')
    .fillna(0)
    .T
)

In [27]:
# Express every cell as a percentage of its cohort's week-0 size (row 0 of
# ret_df), then turn exact zeros back into missing values so that weeks with
# no data are left out of the chart and the table.
ret_df_norm = (100 * ret_df / ret_df.loc[0]).applymap(
    lambda share: share if share != 0 else None
)

In [28]:
# One line per cohort (the columns of ret_df_norm), week offset on the x axis.
plotly_df(ret_df_norm)

In [29]:
# Show the table with cohorts as rows; missing cells are replaced by '' (which
# highlight_vals leaves unstyled) and every other cell is shaded by its value.
ret_df_norm.T.fillna('').style.applymap(highlight_vals)

week_num,0,1,2,3,4
min_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-28,100,7.05359,4.06152,3.37659,2.46335
2017-09-04,100,7.29065,4.00823,2.96826,
2017-09-11,100,7.81861,3.95398,,
2017-09-18,100,6.42177,,,
2017-09-25,100,,,,


### Rolling retention

In [30]:
# Rolling retention: a client counts as retained in week N if they were active
# in week N or in any later week. The inner query expands every client into
# week offsets 0..(max_date - min_date)/7 with arrayJoin(range(...)); the outer
# query then counts clients per cohort (min_date) and week offset.
q = '''
SELECT
    uniq(client_id) as clients,
    min_date,
    week_num
FROM
    (SELECT 
        client_id,
        min_date, 
        arrayJoin(range(toUInt64((max_date - min_date)/7) + 1)) as week_num
    FROM
        (
            SELECT
                client_id,
                min(date) as min_date,
                max(date) as max_date
            FROM retention_users
            GROUP BY client_id
            HAVING (min_date <= '{end_date}') AND (min_date >= '{start_date}')
        ))
GROUP BY
    min_date,
    week_num
FORMAT TabSeparatedWithNames
'''
# Fill in the cohort window as ISO dates (YYYY-MM-DD).
q = q.format(
    start_date=start_date.date().isoformat(),
    end_date=end_date.date().isoformat(),
)

raw_roll_ret_df = get_clickhouse_df(q)

In [31]:
# Reshape the long result into a matrix: after the transpose, rows are week
# offsets and columns are cohorts. Combinations absent from the query result
# become 0.
roll_ret_df = (
    raw_roll_ret_df
    .pivot_table(values='clients', index='min_date', columns='week_num')
    .fillna(0)
    .T
)

In [32]:
# Express every cell as a percentage of its cohort's week-0 size (row 0 of
# roll_ret_df), then turn exact zeros back into missing values so that weeks
# with no data are left out of the chart and the table.
roll_ret_df_norm = (100 * roll_ret_df / roll_ret_df.loc[0]).applymap(
    lambda share: share if share != 0 else None
)

In [33]:
# One line per cohort (the columns of roll_ret_df_norm), week offset on the x axis.
plotly_df(roll_ret_df_norm)

In [34]:
# Show the table with cohorts as rows; missing cells are replaced by '' (which
# highlight_vals leaves unstyled) and every other cell is shaded by its value.
roll_ret_df_norm.T.fillna('').style.applymap(highlight_vals)

week_num,0,1,2,3,4
min_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-28,100,12.7974,7.95482,5.19106,2.46335
2017-09-04,100,11.2122,6.02318,2.96826,
2017-09-11,100,10.2535,3.95398,,
2017-09-18,100,6.42177,,,
2017-09-25,100,,,,
